module Common
= struct
include Nonstd
(* Shadows the standard [String] module with Sosa's native-string
   implementation for the rest of [Common], and for every module that
   opens [Common]. *)
module String = struct
include Sosa.Native_string
end
(** [a // b] joins two path components; it is [Filename.concat]. *)
let (//) = Filename.concat

(** Switch controlling {!dbg}. It starts as [true] only when the environment
    variable [BIOKEPI_DEBUG] is set to exactly ["true"]. *)
let debug_mode =
  ref (
    (* [Sys.getenv] reports an unset variable with [Not_found]; only that
       case means "debugging off", other exceptions are not swallowed. *)
    match Sys.getenv "BIOKEPI_DEBUG" with
    | value -> value = "true"
    | exception Not_found -> false
  )

(** [dbg fmt ...] formats a message and, when [!debug_mode] is set, prints it
    on standard error prefixed with ["biokepi-debug: "] (flushing the
    channel); otherwise the formatted message is discarded. *)
let dbg fmt =
  ksprintf (fun s ->
      if !debug_mode
      then eprintf "biokepi-debug: %s\n%!" s
      else ()
    ) fmt

(** [failwithf fmt ...] raises [Failure] carrying the formatted message. *)
let failwithf fmt = ksprintf failwith fmt
(* Re-exports Ketrew's unique-identifier module under [Common.Unique_id]. *)
module Unique_id = struct
include Ketrew_pure.Internal_pervasives.Unique_id
end
module Name_file = struct
  (** Registry of the file names produced so far by {!path}, each mapped to
      the list of components it was built from. It is used to detect two
      different component lists that end up with the same file name. *)
  let db : (string, string list) Hashtbl.t = Hashtbl.create 42

  (** [path ~readable_suffix ~from high_level_components] builds a file path.

      The base name is made of:
      - the hexadecimal digest (module [Digest]) of [readable_suffix] and all
        the components joined with ["-"];
      - as many components as fit within 220 characters (appending stops at
        the first component that does not fit);
      - [readable_suffix].

      The components are: with [`Path p], the base name of [p] without its
      extension (kept as-is), then every element of [high_level_components]
      with each character outside [a-zA-Z0-9-] replaced by ['_'].

      The result lives in the directory of [p] ([`Path p]) or in [d]
      ([`In_dir d]).

      @raise Failure if the name is longer than 220 characters, or if the
      same name was already registered in {!db} for a different set of
      components. *)
  let path ~readable_suffix ~from high_level_components =
    let sanitize =
      String.map
        ~f:(function
          | ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-') as c -> c
          | _ -> '_') in
    let components =
      begin match from with
      | `Path p ->
        let b = Filename.basename p in
        (* [Filename.chop_extension] raises [Invalid_argument] when there is
           no extension; in that case the base name is used unchanged. *)
        (try [Filename.chop_extension b] with Invalid_argument _ -> [b])
      | `In_dir _ -> []
      end
      @
      List.map high_level_components ~f:sanitize
    in
    let hash =
      String.concat ~sep:"-" (readable_suffix :: components)
      |> Digest.string |> Digest.to_hex
    in
    let max_length = 220 in
    let buf = Buffer.create max_length in
    Buffer.add_string buf hash;
    (* Append readable components while the final name (suffix included)
       stays under the limit. *)
    let rec append_components =
      function
      | [] -> ()
      | one :: more ->
        if
          Buffer.length buf + String.length readable_suffix
          + String.length one < max_length
        then (Buffer.add_string buf one; append_components more)
        else ()
    in
    append_components components;
    Buffer.add_string buf readable_suffix;
    let name = Buffer.contents buf in
    begin if String.length name > max_length then
        ksprintf failwith "Name_file: filename too long %s (max: %d)"
          name max_length
    end;
    (* Collision detection: the same name is accepted again only for the
       same components (compared regardless of order). *)
    begin match Hashtbl.find db name with
    | some
      when List.sort ~cmp:String.compare some
           = List.sort ~cmp:String.compare components -> ()
    | some ->
      ksprintf failwith "Duplicate filename for different components\nFilename: %s\nPrevious: [%s]\nNew: [%s]\n"
        name (String.concat ~sep:", " some) (String.concat ~sep:", " components)
    | exception Not_found ->
      (* First time this name is seen: register it. *)
      Hashtbl.add db name components
    end;
    begin match from with
    | `In_dir s -> s // name
    | `Path p -> Filename.dirname p // name
    end

  (** [from_path ~readable_suffix p c] is {!path} with [~from:(`Path p)]. *)
  let from_path ~readable_suffix p c =
    path ~readable_suffix ~from:(`Path p) c

  (** [in_directory ~readable_suffix p c] is {!path} with
      [~from:(`In_dir p)]. *)
  let in_directory ~readable_suffix p c =
    path ~readable_suffix ~from:(`In_dir p) c
end
module KEDSL = struct
include Ketrew.EDSL
(* Short alias for Ketrew's command module. *)
module Command = Ketrew_pure.Target.Command
(* Product type of workflow nodes that carry no data: its only method is an
   optional completion condition. *)
type nothing = < is_done : Condition.t option >
(* The canonical [nothing] product: no completion condition. *)
let nothing = object method is_done = None end
(* [target] and [file_target] ignore their argument and return a tag.
   NOTE(review): presumably they shadow functions of the same name coming
   from [Ketrew.EDSL] so that using them yields a type error pointing to the
   workflow-node API -- confirm against Ketrew. *)
let target _ = `Please_KEDSL_workflow
let file_target _ = `Please_KEDSL_workflow
(* Workflow node whose product is a single file. *)
type file_workflow = single_file workflow_node
(* Workflow node without any data product. *)
type phony_workflow = nothing workflow_node
(** Product describing single-end or paired-end FASTQ reads. *)
type fastq_reads = <
  is_done: Ketrew_pure.Target.Condition.t option;
  paths : string * (string option);
  r1 : single_file;
  r2 : single_file option;
  sample_name: string;
  escaped_sample_name: string;
  fragment_id: string option;
  fragment_id_forced: string;
>

(** [fastq_reads ?host ?name ?fragment_id r1 r2_opt] builds the product for
    the read-1 file at path [r1] and the optional read-2 file at [r2_opt].

    - [sample_name] is [name], or the base name of [r1] when absent;
    - [fragment_id_forced] is [fragment_id], or the base name of [r1];
    - [escaped_sample_name] is [sample_name] where every character outside
      [0-9a-zA-Z-_] is replaced by ['_'];
    - [is_done] requires the existence of every file involved. *)
let fastq_reads ?host ?name ?fragment_id r1 r2_opt : fastq_reads =
  let keep_or_underscore = function
    | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '-' | '_' as c -> c
    | _ -> '_' in
  object (self)
    val read1 = single_file ?host r1
    val read2_opt = Option.map r2_opt ~f:(single_file ?host)
    method r1 = read1
    method r2 = read2_opt
    method paths = (r1, r2_opt)
    method is_done =
      let condition =
        match read2_opt with
        | None -> `And [read1#exists; read1#exists;]
        | Some read2 -> `And [read1#exists; read2#exists] in
      Some condition
    method sample_name =
      Option.value name ~default:(Filename.basename r1)
    method fragment_id = fragment_id
    method fragment_id_forced =
      Option.value fragment_id ~default:(Filename.basename r1)
    method escaped_sample_name =
      String.map self#sample_name ~f:keep_or_underscore
  end
(** [transform_fastq_reads ?name ?fragment_id fq_reads r1 r2_opt] makes a new
    {!fastq_reads} product for the paths [r1] / [r2_opt], on the same host as
    the read-1 file of [fq_reads].

    When [name] is absent the sample name of [fq_reads] is reused; when
    [fragment_id] is absent the fragment id of [fq_reads] is reused. Note
    that the value carried by [?fragment_id] is itself a [string option]. *)
let transform_fastq_reads
    ?name ?fragment_id
    (fq_reads: fastq_reads) r1 r2_opt
  : fastq_reads
  =
  let host = fq_reads#r1#host in
  let name =
    match name with
    | None -> fq_reads#sample_name
    | Some given -> given in
  let fragment_id =
    match fragment_id with
    | None -> fq_reads#fragment_id
    | Some given -> given in
  fastq_reads ~host ~name ?fragment_id r1 r2_opt
(** [read_1_file_node fq] is a workflow node whose product is the read-1 file
    of [fq]; it depends on [fq] and is never considered equivalent to
    another node. *)
let read_1_file_node (fq : fastq_reads workflow_node) =
  let reads = fq#product in
  let name =
    sprintf "READ1 of %s-%s" reads#sample_name reads#fragment_id_forced in
  workflow_node reads#r1
    ~name
    ~equivalence:`None
    ~edges:[depends_on fq]
(** [read_2_file_node fq] is [None] when [fq] has no read-2 file; otherwise
    it is a workflow node whose product is that file, depending on [fq] and
    never considered equivalent to another node. *)
let read_2_file_node (fq : fastq_reads workflow_node) =
  let reads = fq#product in
  match reads#r2 with
  | None -> None
  | Some read2 ->
    let name =
      sprintf "READ2 of %s-%s" reads#sample_name reads#fragment_id_forced in
    Some (
      workflow_node read2
        ~name
        ~equivalence:`None
        ~edges:[depends_on fq]
    )
(** [fastq_node_of_single_file_nodes ~host ~name ?fragment_id fastq_r1
    fastq_r2] assembles a {!fastq_reads} node out of one single-file node
    ([fastq_r1]) and an optional second one ([fastq_r2]). The resulting node
    depends on the nodes it was given; its display name mentions [name] and
    the fragment id (or the base name of the read-1 path when absent). *)
let fastq_node_of_single_file_nodes
    ~host ~name ?fragment_id fastq_r1 fastq_r2 =
  let r1_path = fastq_r1#product#path in
  let r2_path =
    match fastq_r2 with
    | None -> None
    | Some r -> Some (r#product#path) in
  let product = fastq_reads ~host ~name ?fragment_id r1_path r2_path in
  let edges =
    depends_on fastq_r1
    :: (match fastq_r2 with
        | None -> []
        | Some r2 -> [depends_on r2]) in
  let fragment_label =
    Option.value fragment_id ~default:(Filename.basename r1_path) in
  workflow_node product
    ~equivalence:`None
    ~name:(sprintf "Assembled-fastq: %s (%s)" name fragment_label)
    ~edges
(** [transform_single_file ~path f] is a single-file product at [path] on
    the same host as [f]. *)
let transform_single_file ~path f =
  let host = f#host in
  single_file ~host path
(** Product describing a BAM file together with its metadata. *)
type bam_file = <
  is_done: Ketrew_pure.Target.Condition.t option;
  host: Host.t;
  path : string;
  sample_name: string;
  escaped_sample_name: string;
  sorting: [ `Coordinate | `Read_name ] option;
  reference_build: string;
>

(** [bam_file ~host ?name ?sorting ~reference_build path] builds the product
    for the BAM file at [path] on [host].

    - [sample_name] is [name]; when absent it is the base name of [path]
      without its extension, or the whole base name when there is no
      extension to remove;
    - [escaped_sample_name] is [sample_name] where every character outside
      [0-9a-zA-Z-_] is replaced by ['_'];
    - [path] and [is_done] come from the underlying single-file product. *)
let bam_file ~host ?name ?sorting ~reference_build path : bam_file =
  object (self)
    val file = single_file ~host path
    method host = host
    method sample_name =
      match name with
      | Some given -> given
      | None ->
        let base = Filename.basename path in
        (* [Filename.chop_extension] raises [Invalid_argument] on a name
           without extension; fall back to the base name, like
           [Name_file.path] does, instead of failing lazily here. *)
        (try Filename.chop_extension base with Invalid_argument _ -> base)
    method escaped_sample_name =
      String.map self#sample_name ~f:(function
        | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '-' | '_' as c -> c
        | _ -> '_')
    method path = file#path
    method is_done = file#is_done
    method sorting = sorting
    method reference_build = reference_build
  end
(** [transform_bam ?change_sorting bam ~path] makes a new {!bam_file} at
    [path] sharing the metadata of [bam]: host, reference build, sample name,
    and sorting (unless [change_sorting] provides a new one).

    The sample name is carried over explicitly: without it the new product
    would derive its sample name from the new [path] and the original
    sample identity would be lost at every transformation. *)
let transform_bam ?change_sorting (bam : bam_file) ~path : bam_file =
  let name =
    (* [bam#sample_name] may raise [Invalid_argument] when it has to be
       derived from an extension-less path; in that case no name is forced
       and the new product derives its own, as before. *)
    try Some (bam#sample_name) with Invalid_argument _ -> None in
  let sorting =
    match change_sorting with
    | Some _ as new_sorting -> new_sorting
    | None -> bam#sorting in
  bam_file
    ~host:bam#host
    ?name
    ?sorting
    ~reference_build:bam#reference_build
    path
(** Product grouping several BAM files. *)
type bam_list = <
  is_done: Ketrew_pure.Target.Condition.t option;
  bams: bam_file list;
>

(** [bam_list bams] groups [bams] in one product whose completion condition
    is the conjunction of the conditions of all the members. Evaluating
    [is_done] fails (through [Option.value_exn]) if a member has no
    condition. *)
let bam_list (bams : bam_file list) : bam_list =
  let condition_of (bam : bam_file) =
    Option.value_exn ~msg:"Bams should have a Condition.t" bam#is_done in
  object
    method bams = bams
    method is_done = Some (`And (List.map bams ~f:condition_of))
  end
(** [explode_bam_list_node bln] makes one workflow node per BAM of the list
    produced by [bln]. Each node is named after the base name of its BAM,
    depends on [bln], and is never considered equivalent to another node. *)
let explode_bam_list_node (bln : bam_list workflow_node) =
  let node_of_bam (bam : bam_file) =
    workflow_node bam
      ~name:(Filename.basename bam#path)
      ~tags:["expolode_bam_list_node"]
      ~edges:[depends_on bln]
      ~equivalence:`None in
  List.map bln#product#bams ~f:node_of_bam
(* GADT wrapping either one BAM node or a list of BAM nodes. The type
   parameter records which kind of node each constructor stands for:
   [Single_bam] is indexed by a [bam_file workflow_node], while
   [Bam_workflow_list] holds a list of BAM nodes but is indexed by a
   [bam_list workflow_node]. *)
type _ bam_or_bams =
| Single_bam: bam_file workflow_node -> bam_file workflow_node bam_or_bams
| Bam_workflow_list: bam_file workflow_node list -> bam_list workflow_node bam_or_bams
(** Product describing a VCF file and the reference build it refers to. *)
type vcf_file = <
  is_done: Ketrew_pure.Target.Condition.t option;
  host: Host.t;
  path : string;
  reference_build: string;
  as_single_file: single_file product;
>

(** [vcf_file ~host ~reference_build path] builds the product for the VCF at
    [path] on [host]; [path], [is_done] and [as_single_file] come from the
    underlying single-file product. *)
let vcf_file ~host ~reference_build path : vcf_file =
  object
    val underlying = single_file ~host path
    method as_single_file = underlying
    method is_done = underlying#is_done
    method path = underlying#path
    method host = host
    method reference_build = reference_build
  end
(** [transform_vcf vcf ~path] is a new {!vcf_file} at [path] with the host
    and reference build of [vcf]. *)
let transform_vcf vcf ~path =
  let host = vcf#host in
  let reference_build = vcf#reference_build in
  vcf_file ~host ~reference_build path

(** [submit workflow] hands [workflow] to [Ketrew.Client.submit_workflow]. *)
let submit workflow = Ketrew.Client.submit_workflow workflow
end
(* String constants used as tags on workflow nodes (for instance
   [clean_up] is attached to the removal nodes of [Workflow_utilities]). *)
module Target_tags = struct
let aligner = "aligner"
let variant_caller = "variant-caller"
let clean_up = "clean-up"
end
end
module Hla_utilities
= struct
open Common
(** The epitope-binding predictors that can be named. *)
type predictor_type = [
  | `NetMHC
  | `NetMHCpan
  | `NetMHCIIpan
  | `NetMHCcons
  | `Random
  | `SMM
  | `SMM_PMBEC
  | `NetMHCpan_IEDB
  | `NetMHCcons_IEDB
  | `SMM_IEDB
  | `SMM_PMBEC_IEDB
]

(** [predictor_to_string p] is the lowercase, dash-separated identifier of
    the predictor [p]. *)
let predictor_to_string predictor =
  match predictor with
  (* NetMHC family. *)
  | `NetMHC -> "netmhc"
  | `NetMHCpan -> "netmhcpan"
  | `NetMHCIIpan -> "netmhciipan"
  | `NetMHCcons -> "netmhccons"
  (* IEDB flavours. *)
  | `NetMHCpan_IEDB -> "netmhcpan-iedb"
  | `NetMHCcons_IEDB -> "netmhccons-iedb"
  | `SMM_IEDB -> "smm-iedb"
  | `SMM_PMBEC_IEDB -> "smm-pmbec-iedb"
  (* Others. *)
  | `SMM -> "smm"
  | `SMM_PMBEC -> "smm-pmbec"
  | `Random -> "random"
(** [predictor_to_tool ~run_with predictor] is [Some (ensure, init)] -- the
    installation node and the initialization program of the matching tool,
    as provided by the machine [run_with] -- for the four NetMHC-family
    predictors, and [None] for any other predictor.

    The tool is looked up with {!Machine.get_tool}, which fails when the
    machine cannot provide it. *)
let predictor_to_tool ~run_with predictor =
  let lookup tool_name =
    let definition = Machine.Tool.Definition.create tool_name in
    let tool = Machine.get_tool run_with definition in
    (Machine.Tool.ensure tool, Machine.Tool.init tool)
  in
  match predictor with
  | `NetMHC -> Some (lookup "netMHC")
  | `NetMHCpan -> Some (lookup "netMHCpan")
  | `NetMHCIIpan -> Some (lookup "netMHCIIpan")
  | `NetMHCcons -> Some (lookup "netMHCcons")
  | _ -> None
(** [sanitize_hlarp_out_for_mhctools ~run_with ~hlarp_result ~output_path]
    is a single-file node at [output_path] produced from the file of
    [hlarp_result] by a shell pipeline which: drops the lines starting with
    the character 2, keeps the second comma-separated field with surrounding
    blanks removed, skips the first remaining line, and removes the first
    single quote of each line. The result is written to a temporary file
    ([output_path] followed by [.tmp]) and then moved into place. *)
let sanitize_hlarp_out_for_mhctools ~run_with ~hlarp_result ~output_path =
  let open KEDSL in
  let source = hlarp_result#product#path in
  let scratch = output_path ^ ".tmp" in
  let extract_alleles =
    Program.shf "cat %s | grep -v '^2' | awk -F , '{ gsub(/^[ \t]+|[ \t]+$/,\"\", $2); print $2}' | tail -n +2 | sed \"s/'//\" > %s && mv %s %s"
      source scratch scratch output_path in
  let make =
    Machine.run_program run_with
      ~requirements:[ `Quick_run; ]
      extract_alleles in
  let name =
    sprintf "Extract and sanitize alleles: %s" (hlarp_result#render#name) in
  workflow_node
    (single_file ~host:(Machine.as_host run_with) output_path)
    ~name ~make
    ~edges:[ depends_on hlarp_result; ]
end
module Machine
= struct
open Common
open KEDSL
module Tool = struct
module Definition = struct
  (** A tool is identified by its name and an optional version. *)
  type t = {name: string; version: string option}

  (** [create ?version name] identifies the tool [name]. *)
  let create ?version name = {name; version}

  (** [to_opam_name d] is the name and the version of [d] separated by a
      dot; a missing version is rendered as [NOVERSION]. *)
  let to_opam_name def =
    let version =
      match def.version with
      | Some v -> v
      | None -> "NOVERSION" in
    sprintf "%s.%s" def.name version

  (** Same rendering as {!to_opam_name}. *)
  let to_string = to_opam_name

  (** Same rendering as {!to_opam_name}. *)
  let to_directory_name = to_opam_name

  let get_version {version; _} = version
  let get_name {name; _} = name
end
(* Built-in tool definitions: one [Definition.t] per known tool, most of
   them pinned to a version ([mutect] and [gatk] carry no version). *)
module Default = struct
open Definition
let bwa = create "bwa" ~version:"0.7.10"
let freebayes = create "freebayes" ~version:"1.1.0"
let sambamba = create "sambamba" ~version:"0.6.5"
let samtools = create "samtools" ~version:"1.4"
let bcftools = create "bcftools" ~version:"1.4"
let vcftools = create "vcftools" ~version:"0.1.12b"
let bedtools = create "bedtools" ~version:"2.23.0"
let somaticsniper = create "somaticsniper" ~version:"1.0.3"
let varscan = create "varscan" ~version:"2.3.5"
let mutect = create "mutect"
let gatk = create "gatk"
let strelka = create "strelka" ~version:"1.0.14"
let virmid = create "virmid" ~version:"1.1.1"
let muse = create "muse" ~version:"1.0b"
let star = create "star" ~version:"2.4.1d"
let stringtie = create "stringtie" ~version:"1.2.2"
let cufflinks = create "cufflinks" ~version:"2.2.1"
let hisat = create "hisat" ~version:"0.1.6-beta"
(* NOTE(review): [hisat2] is registered under the name "hisat" and differs
   from [hisat] only by its version -- confirm this is intended. *)
let hisat2 = create "hisat" ~version:"2.0.2-beta"
let mosaik = create "mosaik" ~version:"2.2.3"
let kallisto = create "kallisto" ~version:"0.42.3"
let bowtie = create "bowtie" ~version:"1.1.2"
let fastqc = create "fastqc" ~version:"0.11.5"
let igvxml = create "igvxml" ~version:"0.1.0"
let hlarp = create "hlarp" ~version:"biokepi-branch"
let samblaster = create "samblaster" ~version:"v.0.1.22"
let delly2 = create "delly2" ~version:"0.7.7"
let optitype = create "optitype" ~version:"1.2.1-0"
let seqtk = create "seqtk" ~version:"1.2"
let seq2hla = create "seq2hla" ~version:"2.2"
let picard = create "picard" ~version:"2.9.2"
let snpeff = create "snpeff" ~version:"4.3.1m-0"
let pyensembl = create "pyensembl" ~version:"1.1.0"
let vcfannotatepolyphen = create "vcf-annotate-polyphen" ~version:"0.1.2"
let topiary = create "topiary" ~version:"1.2.1"
let vaxrank = create "vaxrank" ~version:"0.6.0"
let isovar = create "isovar" ~version:"0.7.0"
end
(** An installable tool: its definition, the program initializing a shell
    environment for it, and the workflow node ensuring it is available. *)
type t = {
  definition: Definition.t;
  init: Program.t;
  ensure: phony_workflow;
}

(** [create ?init ?ensure definition] makes a tool. The default [init] is an
    [echo] of a message naming the tool; the default [ensure] is a node
    without product named after the tool. A provided [ensure] node has its
    product forgotten. Both defaults are built even when not used. *)
let create ?init ?ensure definition =
  let label = Definition.to_string definition in
  let default_init = Program.shf "echo 'Tool %s: default init'" label in
  let default_ensure =
    workflow_node nothing ~name:(sprintf "%s-ensured" label) in
  {
    definition;
    init = Option.value init ~default:default_init;
    ensure =
      Option.value_map ensure ~f:KEDSL.forget_product ~default:default_ensure;
  }

(** Initialization program of a tool. *)
let init {init; _} = init

(** Node ensuring that a tool is available. *)
let ensure {ensure; _} = ensure
module Kit = struct
  type tool = t

  (** A toolkit answers, for a given definition, with the tool it provides
      (if any). *)
  type t = Definition.t -> tool option

  (** [concat kits] asks each kit in order and returns the first answer. *)
  let concat : t list -> t =
    fun kits def -> List.find_map kits ~f:(fun kit -> kit def)

  (** [of_list tools] is the kit providing the first tool of [tools] whose
      definition equals the requested one. *)
  let of_list l : t =
    fun def ->
      List.find l ~f:(fun {definition = candidate; _} -> candidate = def)

  (** [get_exn kit def] is the tool provided by [kit] for [def].
      @raise Failure if the kit does not provide it. *)
  let get_exn t tool =
    match t tool with
    | None ->
      failwithf "Toolkit cannot provide the tool %s"
        (Definition.to_string tool)
    | Some found -> found
end
end
module Make_fun = struct
  module Requirement = struct
    (** Hints attached to a program about the resources it needs. *)
    type t = [
      | `Processors of int
      | `Internet_access
      | `Memory of [
          | `GB of float
          | `Small
          | `Big
        ]
      | `Quick_run
      | `Spark of string list
      | `Custom of string
      | `Self_identification of string list
    ] [@@deriving yojson, show]
  end

  (** A function turning a program (with an optional name and optional
      requirements) into a Ketrew build process. *)
  type t =
    ?name: string ->
    ?requirements: Requirement.t list ->
    Program.t ->
    KEDSL.Build_process.t

  (** Prepends: one processor, small memory. *)
  let stream_processor requirements =
    [`Processors 1; `Memory `Small] @ requirements

  (** Prepends [`Quick_run]. *)
  let quick requirements = [`Quick_run] @ requirements

  (** Prepends [`Internet_access] to the {!stream_processor} requirements. *)
  let downloading requirements =
    [`Internet_access] @ stream_processor requirements

  (** Prepends a [`Self_identification] requirement when [self_ids] is
      given; otherwise returns the list unchanged. *)
  let with_self_ids ?self_ids l =
    match self_ids with
    | None -> l
    | Some tags -> `Self_identification tags :: l

  (** [with_requirements f l] is [f] with the requirements [l] put in front
      of the ones given by the caller. *)
  let with_requirements : t -> Requirement.t list -> t =
    fun f l ?name ?(requirements = []) prog ->
      let all_requirements = l @ requirements in
      f ?name ~requirements:all_requirements prog
end
(** A machine: a host together with what is needed to run programs on it. *)
type t = {
  name: string;
  host: Host.t;
  pyensembl_cache_dir: string option;
  get_reference_genome: string -> Reference_genome.t;
  toolkit: Tool.Kit.t;
  run_program: Make_fun.t;
  work_dir: string;
  max_processors: int;
}

(** [create ~host ?pyensembl_cache_dir ~get_reference_genome ~toolkit
    ~run_program ~work_dir ~max_processors name] assembles a machine out of
    its components. *)
let create
    ~host ?pyensembl_cache_dir ~get_reference_genome ~toolkit
    ~run_program ~work_dir ~max_processors name =
  {
    name;
    host;
    pyensembl_cache_dir;
    get_reference_genome;
    toolkit;
    run_program;
    work_dir;
    max_processors;
  }

(** Name of the machine. *)
let name {name; _} = name
(** [as_host ?with_shell t] is the host of the machine [t]. When
    [with_shell] is given, the host is rebuilt from the URI of the original
    one where the query parameter [shell] is replaced by the given shell
    followed by [,-c]. *)
let as_host ?with_shell t =
  match with_shell with
  | None -> t.host
  | Some shell ->
    let open Ketrew_pure in
    let shell_key = "shell" in
    let shell_value = sprintf "%s,-c" shell in
    let without_shell =
      Uri.remove_query_param (Host.to_uri t.host) shell_key in
    let with_new_shell =
      Uri.add_query_param without_shell (shell_key, [shell_value;]) in
    KEDSL.Host.parse (Uri.to_string with_new_shell)
(** Optional pyensembl cache directory of the machine. *)
let get_pyensembl_cache_dir {pyensembl_cache_dir; _} = pyensembl_cache_dir

(** Function giving a reference genome from its name. *)
let get_reference_genome {get_reference_genome; _} = get_reference_genome

(** [get_tool t tool] is the tool provided by the toolkit of [t] for the
    definition [tool].
    @raise Failure if the toolkit does not provide it. *)
let get_tool t tool =
  match t.toolkit tool with
  | None ->
    failwithf "Machine %S cannot provide the tool %s"
      t.name (Tool.Definition.to_string tool)
  | Some found -> found

(** The raw program runner of the machine. *)
let run_program {run_program; _} = run_program

let max_processors {max_processors; _} = max_processors

(** Runner adding the [`Quick_run] requirement. *)
let quick_run_program t : Make_fun.t =
  let requirements = Make_fun.quick [] in
  Make_fun.with_requirements t.run_program requirements

(** Runner adding the stream-processor requirements (one processor, small
    memory), plus a self-identification when [self_ids] is given. *)
let run_stream_processor ?self_ids t : Make_fun.t =
  let requirements =
    Make_fun.with_self_ids ?self_ids (Make_fun.stream_processor []) in
  Make_fun.with_requirements t.run_program requirements

(** Runner adding the downloading requirements. *)
let run_download_program t : Make_fun.t =
  let requirements = Make_fun.downloading [] in
  Make_fun.with_requirements t.run_program requirements

(** Runner adding big memory and [processors] processors (1 by default),
    plus a self-identification when [self_ids] is given. *)
let run_big_program t :
  ?processors: int -> ?self_ids : string list -> Make_fun.t =
  fun ?(processors = 1) ?self_ids ->
    let requirements =
      Make_fun.with_self_ids ?self_ids
        [`Memory `Big; `Processors processors] in
    Make_fun.with_requirements t.run_program requirements

let work_dir {work_dir; _} = work_dir
end
module Metadata
= struct
(** Version string of the library (lazy value). *)
let version : string Lazy.t = lazy "0.0.0+dev"

(** Git commit the library was built from, when known. *)
let git_commit : string option =
  Some "c07982e85f581fc4972b5a6f8d601649e56a4f34"

(** Git description of that commit, when known. *)
let git_description : string option =
  Some "biokepi.0.0.0-794-gc07982e"
end
module Reference_genome
: sig
open Common
(** Identifier of a reference-genome build. *)
type name = string
(** Declarative description of a reference genome: identity plus the
    location of each of its files. *)
module Specification : sig
(** Recipe describing where a file comes from: a URL, or an operation
    applied to other locations. The constructors are only data here; how
    each one is carried out is decided by the code consuming them. *)
module Location : sig
type t = [
| `Url of string
| `Vcf_concat of (string * t) list
| `Concat of t list
| `Gunzip of t
| `Bunzip2 of t
| `Untar of t
]
(** Each function below wraps its argument in the constructor of the same
    name. *)
val url : 'a -> [> `Url of 'a ]
val vcf_concat : 'a -> [> `Vcf_concat of 'a ]
val concat : 'a -> [> `Concat of 'a ]
val gunzip : 'a -> [> `Gunzip of 'a ]
val bunzip2 : 'a -> [> `Bunzip2 of 'a ]
val untar : 'a -> [> `Untar of 'a ]
end
(** Specification record; it is [private]: it can be read and matched on
    anywhere but only built through {!create}. *)
type t = private {
name : name;
ensembl : int;
species : string;
metadata : string option;
fasta : Location.t;
dbsnp : Location.t option;
known_indels : Location.t option;
cosmic : Location.t option;
exome_gtf : Location.t option;
cdna : Location.t option;
whess : Location.t option;
major_contigs : string list option;
snpeff_name : string option;
}
(** Builds a specification; the last (positional) argument is the name of
    the build, every optional argument left out is recorded as [None]. *)
val create :
?metadata:string ->
fasta:Location.t ->
ensembl:int ->
species:string ->
?dbsnp:Location.t ->
?known_indels:Location.t ->
?cosmic:Location.t ->
?exome_gtf:Location.t ->
?cdna:Location.t ->
?whess:Location.t ->
?major_contigs:string list ->
?snpeff_name:string ->
string ->
t
(** Built-in specifications and their names. *)
module Default :
sig
module Name : sig
val b37 : name
val b37decoy : name
val b38 : name
val hg38: name
val hg18 : name
val hg19 : name
val mm10 : name
end
val b37 : t
val b37decoy : t
val b38 : t
val hg38 : t
val hg18 : t
val hg19 : t
val mm10 : t
end
end
(** A reference genome made concrete: its specification together with the
    workflow nodes producing its files. [location] is the node of the FASTA
    file (see {!fasta}). The type is [private]: build it with {!create}. *)
type t = private {
specification: Specification.t;
location : KEDSL.file_workflow;
cosmic : KEDSL.file_workflow option;
dbsnp : KEDSL.file_workflow option;
known_indels : KEDSL.file_workflow option;
gtf : KEDSL.file_workflow option;
cdna : KEDSL.file_workflow option;
whess : KEDSL.file_workflow option;
}
(** [create ?cosmic ?dbsnp ?known_indels ?gtf ?cdna ?whess specification
    location] builds a reference genome from the given nodes. *)
val create :
?cosmic:KEDSL.file_workflow ->
?dbsnp:KEDSL.file_workflow ->
?known_indels:KEDSL.file_workflow ->
?gtf:KEDSL.file_workflow ->
?cdna:KEDSL.file_workflow ->
?whess:KEDSL.file_workflow ->
Specification.t -> KEDSL.file_workflow -> t
(** Accessors reading the specification. *)
val name : t -> name
val ensembl : t -> int
val species : t -> string
(** Path of the product of the [location] node. *)
val path : t -> string
(** The [*_path_exn] functions give the path of the product of the
    corresponding optional node, and fail (through [Option.value_exn]) when
    that node is absent. *)
val cosmic_path_exn : t -> string
val dbsnp_path_exn : t -> string
val known_indels_path_exn : t -> string
val gtf_path_exn : t -> string
val cdna_path_exn : t -> string
val whess_path_exn : t -> string
(** snpEff name of the specification; fails (through [Option.value_exn])
    when the specification has none. *)
val snpeff_name_exn: t -> string
(** Major contigs of the specification, each as a [`Chromosome] region.
    @raise Failure when the specification defines none. *)
val major_contigs : t -> Region.t list
(** The [location] node. *)
val fasta: t -> KEDSL.file_workflow
(** The [*_exn] functions give the corresponding optional node, and fail
    (through [Option.value_exn]) when it is absent; {!gtf} is the
    non-failing variant for the GTF node. *)
val cosmic_exn: t -> KEDSL.file_workflow
val dbsnp_exn: t -> KEDSL.file_workflow
val known_indels_exn: t -> KEDSL.file_workflow
val gtf_exn: t -> KEDSL.file_workflow
val gtf: t -> KEDSL.file_workflow option
val cdna_exn: t -> KEDSL.file_workflow
val whess_exn: t -> KEDSL.file_workflow
end
= struct
open Common
type name = string
module Specification = struct
module Location = struct
  (** Recipe describing where a file comes from: a URL or an operation
      applied to other locations. *)
  type t = [
    | `Url of string
    | `Vcf_concat of (string * t) list
    | `Concat of t list
    | `Gunzip of t
    | `Bunzip2 of t
    | `Untar of t
  ]

  (* Smart constructors: each one wraps its argument in the constructor of
     the same name, which makes locations easy to chain with [|>]. *)
  let url address = `Url address
  let vcf_concat named_parts = `Vcf_concat named_parts
  let concat parts = `Concat parts
  let gunzip compressed = `Gunzip compressed
  let bunzip2 compressed = `Bunzip2 compressed
  let untar archive = `Untar archive
end
(** Description of a reference genome: identity (name, Ensembl release,
    species, free-form metadata) and the location of each of its files. *)
type t = {
  name: string;
  ensembl: int;
  species: string;
  metadata: string option;
  fasta: Location.t;
  dbsnp: Location.t option;
  known_indels: Location.t option;
  cosmic: Location.t option;
  exome_gtf: Location.t option;
  cdna: Location.t option;
  whess: Location.t option;
  major_contigs: string list option;
  snpeff_name: string option;
}

(** [create ... name] records its arguments in a specification; optional
    arguments left out are stored as [None]. *)
let create
    ?metadata ~fasta ~ensembl ~species
    ?dbsnp ?known_indels ?cosmic ?exome_gtf ?cdna ?whess
    ?major_contigs ?snpeff_name
    name =
  {
    (* Identity. *)
    name; species; ensembl; metadata;
    (* Mandatory file. *)
    fasta;
    (* Optional files. *)
    dbsnp; known_indels; cosmic; exome_gtf; cdna; whess;
    (* Optional annotations. *)
    major_contigs; snpeff_name;
  }
(* Built-in reference-genome specifications. *)
module Default = struct
(* Contig names without prefix: 1 to 22, then X, Y and MT. *)
let major_contigs_b37 =
List.init 22 (fun i -> sprintf "%d" (i + 1))
@ ["X"; "Y"; "MT";]
(* Contig names with the chr prefix: chr1 to chr22, then chrX, chrY and
   chrM. *)
let major_contigs_hg_family =
List.init 22 (fun i -> sprintf "chr%d" (i + 1))
@ [
"chrX";
"chrY";
"chrM";
]
(* Contig names for the mouse build: 1 to 19, then X and Y. *)
let major_contigs_mm10 =
List.init 19 (fun i -> sprintf "%d" (i + 1))
@ [ "X"; "Y" ]
(* Names of the built-in builds. *)
module Name = struct
let b37 = "b37"
let b37decoy = "b37decoy"
let b38 = "b38"
let hg38 = "hg38"
let hg18 = "hg18"
let hg19 = "hg19"
let mm10 = "mm10"
end
(* URLs shared by several of the specifications below. *)
let b37_dbsnp_url =
"https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/dbsnp_138.b37.vcf.gz"
let b37_cosmic_url =
"http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutect/b37_cosmic_v54_120711.vcf"
let b37_exome_gtf_url =
"http://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz"
let b37_cdna_url =
"http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
let b37_whess_url =
"ftp://genetics.bwh.harvard.edu/pph2/whess/polyphen-2.2.2-whess-2011_12.sqlite.bz2"
let b37_known_indels_url =
"https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/Mills_and_1000G_gold_standard.indels.b37.vcf.gz"
(* Species names. *)
let human = "homo sapiens"
let mouse = "mus musculus"
(* b37: every optional file is provided. *)
let b37 =
create Name.b37
~species:human
~ensembl:75
~metadata:"Provided by the Biokepi library"
~major_contigs:major_contigs_b37
~fasta:Location.(
url "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/b37/human_g1k_v37.fasta.gz"
|> gunzip)
~dbsnp:Location.(url b37_dbsnp_url |> gunzip)
~known_indels:Location.(url b37_known_indels_url |> gunzip)
~cosmic:Location.(url b37_cosmic_url)
~exome_gtf:Location.(url b37_exome_gtf_url |> gunzip)
~cdna:Location.(url b37_cdna_url |> gunzip)
~whess:Location.(url b37_whess_url |> bunzip2)
~snpeff_name:"GRCh37.75"
(* b37decoy: same auxiliary files as b37, with a different FASTA. *)
let b37decoy =
create Name.b37decoy
~species:human
~ensembl:75
~metadata:"Provided by the Biokepi library"
~major_contigs:major_contigs_b37
~fasta:Location.(
url
"ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
|> gunzip)
~dbsnp:Location.(url b37_dbsnp_url |> gunzip)
~known_indels:Location.(url b37_known_indels_url |> gunzip)
~exome_gtf:Location.(url b37_exome_gtf_url |> gunzip)
~cosmic:Location.(url b37_cosmic_url)
~cdna:Location.(url b37_cdna_url |> gunzip)
~whess:Location.(url b37_whess_url |> bunzip2)
~snpeff_name:"GRCh37.75"
(* hg38: FASTA, dbSNP and known indels only; chr-prefixed contigs. *)
let hg38 =
let hg38_url =
"ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.fasta.gz" in
let dbsnp_hg38 =
"ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.dbsnp.vcf.gz" in
let known_indels_hg38 =
"ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.known_indels.vcf.gz" in
create Name.hg38
~species:human
~ensembl:87
~metadata:"Provided by the Biokepi library"
~major_contigs:major_contigs_hg_family
~fasta:Location.(url hg38_url|> gunzip)
~dbsnp:Location.(url dbsnp_hg38 |> gunzip)
~known_indels:Location.(url known_indels_hg38 |> gunzip)
~snpeff_name:"GRCh38.86"
(* b38: files from Ensembl release 87; it uses the same un-prefixed contig
   names as b37. *)
let b38 =
let b38_url =
"http://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" in
let gtf_b38_url =
"http://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.gtf.gz" in
let cdna_b38_url =
"http://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" in
let dbsnp_url =
"http://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh38p7/VCF/common_all_20170710.vcf.gz" in
create Name.b38
~species:human
~ensembl:87
~metadata:"Provided by the Biokepi library"
~major_contigs:major_contigs_b37
~fasta:Location.(url b38_url |> gunzip)
~exome_gtf:Location.(url gtf_b38_url |> gunzip)
~dbsnp:Location.(url dbsnp_url |> gunzip)
~cdna:Location.(url cdna_b38_url |> gunzip)
~snpeff_name:"GRCh38.86"
(* hg18: FASTA and dbSNP only; no snpEff name. *)
let hg18 =
let hg18_url =
"ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/Homo_sapiens_assembly18.fasta.gz" in
let dbsnp_hg18_url =
"ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/dbsnp_138.hg18.vcf.gz" in
create Name.hg18
~ensembl:54
~species:human
~metadata:"Provided by the Biokepi library"
~major_contigs:major_contigs_hg_family
~fasta:Location.(url hg18_url|> gunzip)
~dbsnp:Location.(url dbsnp_hg18_url |> gunzip)
(* hg19: reuses the WHESS database URL of b37. *)
let hg19 =
let hg19_url =
"ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/ucsc.hg19.fasta.gz" in
let dbsnp_hg19_url =
"ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/dbsnp_138.hg19.vcf.gz" in
let known_indels_hg19_url =
"ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz" in
create Name.hg19
~ensembl:75
~species:human
~metadata:"Provided by the Biokepi library"
~major_contigs:major_contigs_hg_family
~fasta:Location.(url hg19_url|> gunzip)
~dbsnp:Location.(url dbsnp_hg19_url |> gunzip)
~known_indels:Location.(url known_indels_hg19_url |> gunzip)
~whess:Location.(url b37_whess_url |> bunzip2)
~snpeff_name:"hg19"
(* mm10 (mouse): dbSNP is the VCF concatenation of a SNP file and an indel
   file.
   NOTE(review): the Ensembl release is recorded as 87 while the GTF and
   cDNA URLs point to release-84 -- confirm which one is intended.
   NOTE(review): [mm10_url] does not end in .gz although it is wrapped in
   [gunzip] -- confirm how the consumer of locations handles this. *)
let mm10 =
let mm10_url =
"https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/mm10.GRCm38.dna_sm.fa" in
let dbsnp_mm10_snps_url =
"ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.snps.rsIDdbSNPv137.vcf.gz" in
let dbsnp_mm10_indels_url =
"ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.indels.rsIDdbSNPv137.vcf.gz" in
let gene_annotations_gtf =
"ftp://ftp.ensembl.org/pub/release-84/gtf/mus_musculus/Mus_musculus.GRCm38.84.gtf.gz" in
let cdna_mm10_url =
"ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" in
create Name.mm10
~ensembl:87
~species:mouse
~metadata:"Provided by the Biokepi Library"
~major_contigs:major_contigs_mm10
~fasta:Location.(url mm10_url |> gunzip)
~dbsnp:Location.(
vcf_concat ["db_snps.vcf", url dbsnp_mm10_snps_url |> gunzip;
"db_indels.vcf", url dbsnp_mm10_indels_url |> gunzip]
)
~exome_gtf:Location.(url gene_annotations_gtf |> gunzip)
~cdna:Location.(url cdna_mm10_url |> gunzip)
~snpeff_name:"mm10"
end
end
(** A reference genome made concrete: its specification and the workflow
    nodes producing its files ([location] is the FASTA node). *)
type t = {
  specification: Specification.t;
  location: KEDSL.file_workflow;
  cosmic: KEDSL.file_workflow option;
  dbsnp: KEDSL.file_workflow option;
  known_indels: KEDSL.file_workflow option;
  gtf: KEDSL.file_workflow option;
  cdna: KEDSL.file_workflow option;
  whess: KEDSL.file_workflow option;
}

(** [create ?cosmic ?dbsnp ?known_indels ?gtf ?cdna ?whess specification
    location] records the given nodes; nodes left out are [None]. *)
let create
    ?cosmic ?dbsnp ?known_indels ?gtf ?cdna ?whess
    specification location =
  {
    specification;
    location;
    cosmic;
    dbsnp;
    known_indels;
    gtf;
    cdna;
    whess;
  }
(** Name of the build, from the specification. *)
let name {specification; _} = specification.Specification.name

(** Ensembl release, from the specification. *)
let ensembl {specification; _} = specification.Specification.ensembl

(** Species, from the specification. *)
let species {specification; _} = specification.Specification.species

(** snpEff name of the specification; fails (through [Option.value_exn])
    when there is none. *)
let snpeff_name_exn t =
  let msg = sprintf "%s: no snpEff name" (name t) in
  Option.value_exn ~msg t.specification.Specification.snpeff_name

(** Path of the FASTA file (product of the [location] node). *)
let path {location; _} = location#product#path
(* [required_path ~msg node_opt] is the path of the product of the node in
   [node_opt]; fails through [Option.value_exn] (with [msg]) on [None]. *)
let required_path ~msg (node_opt : KEDSL.file_workflow option) =
  let node = Option.value_exn ~msg node_opt in
  node#product#path

(** Each function below gives the path of one optional file of the
    reference genome, and fails when the genome does not have it. *)

let cosmic_path_exn t =
  required_path t.cosmic
    ~msg:(sprintf "cosmic_path_exn of %s" (name t))

let dbsnp_path_exn t =
  required_path t.dbsnp
    ~msg:(sprintf "dbsnp_path_exn of %s" (name t))

let known_indels_path_exn t =
  required_path t.known_indels
    ~msg:(sprintf "known_indels_path_exn of %s" (name t))

let gtf_path_exn t =
  required_path t.gtf
    ~msg:(sprintf "gtf_path_exn of %s" (name t))

let cdna_path_exn t =
  required_path t.cdna
    ~msg:(sprintf "cdna_path_exn of %s" (name t))

let whess_path_exn t =
  required_path t.whess
    ~msg:(sprintf "whess_path_exn of %s" (name t))
(** Node producing the FASTA file. *)
let fasta (t : t) : KEDSL.file_workflow = t.location

(** Each [*_exn] function below gives one optional node of the reference
    genome and fails (through [Option.value_exn]) when it is absent. *)

let cosmic_exn t =
  let msg = sprintf "%s: no COSMIC" (name t) in
  Option.value_exn ~msg t.cosmic

let dbsnp_exn t =
  let msg = sprintf "%s: no DBSNP" (name t) in
  Option.value_exn ~msg t.dbsnp

let known_indels_exn t =
  let msg = sprintf "%s: no Known Indels" (name t) in
  Option.value_exn ~msg t.known_indels

let gtf_exn t =
  let msg = sprintf "%s: no GTF" (name t) in
  Option.value_exn ~msg t.gtf

(** Optional node producing the GTF file. *)
let gtf {gtf; _} = gtf

let cdna_exn t =
  let msg = sprintf "%s: no cDNA fasta file" (name t) in
  Option.value_exn ~msg t.cdna

let whess_exn t =
  let msg = sprintf "%s: no WHESS file" (name t) in
  Option.value_exn ~msg t.whess
(** [major_contigs t] is the list of major contigs of the specification of
    [t], each one as a [`Chromosome] region.
    @raise Failure when the specification does not define major contigs. *)
let major_contigs t : Region.t list =
  match t.specification.Specification.major_contigs with
  | Some contigs -> List.map contigs ~f:(fun s -> `Chromosome s)
  | None ->
    (* The message used to read "does have", which said the opposite of the
       actual failure. *)
    failwithf
      "Reference %S does not have major-contigs/chromosomes defined"
      (name t)
end
module Region
= struct
open Common
(** A genomic region: a whole chromosome, an interval of a chromosome
    (name, begin, end), or everything. *)
type t = [
  | `Chromosome of string
  | `Chromosome_interval of string * int * int
  | `Full
]

(** File-name friendly rendering of a region. *)
let to_filename region =
  match region with
  | `Chromosome_interval (s, b, e) -> sprintf "%s_%d-%d" s b e
  | `Chromosome s -> sprintf "%s" s
  | `Full -> "Full"

(** Region in the [chromosome:begin-end] syntax; [None] for [`Full]. *)
let to_samtools_specification region =
  match region with
  | `Chromosome_interval (s, b, e) -> Some (sprintf "%s:%d-%d" s b e)
  | `Chromosome s -> Some s
  | `Full -> None

(** [-r] option selecting the region; the empty string for [`Full]. *)
let to_samtools_option r =
  match to_samtools_specification r with
  | None -> ""
  | Some spec -> sprintf "-r %s" spec

(** [--intervals] option selecting the region; the empty string for
    [`Full]. *)
let to_gatk_option r =
  match to_samtools_specification r with
  | None -> ""
  | Some spec -> sprintf "--intervals %s" spec
(** [parse_samtools s] reads a region written as [chromosome] or
    [chromosome:begin-end].
    @raise Failure when [s] contains more than one colon, or when the part
    after the colon is not made of two integers separated by one dash. *)
let parse_samtools s =
  let parse_interval chromosome bounds =
    match String.split ~on:(`Character '-') bounds with
    | [left; right] ->
      begin match Int.of_string left, Int.of_string right with
      | Some b, Some e -> `Chromosome_interval (chromosome, b, e)
      | _ -> failwithf "Cannot parse %S into 2 loci" bounds
      end
    | _ -> failwithf "Not one '-' in %S" bounds
  in
  match String.split ~on:(`Character ':') s with
  | [one] -> `Chromosome one
  | [one; two] -> parse_interval one two
  | [] -> assert false
  | _ -> failwithf "Not one or zero ':' in %S" s
(** [cmdliner_term ()] is a Cmdliner term for an optional [-R] / [--region]
    command-line option: [`Full] when the option is absent, otherwise the
    result of {!parse_samtools} on its value. *)
let cmdliner_term () =
  let open Cmdliner in
  let region_of_option = function
    | Some s -> parse_samtools s
    | None -> `Full in
  let region_argument =
    Arg.(
      value & opt (some string) None
      & info ["R"; "region"] ~docv:"REGION"
        ~doc:"Specify a region; using samtools' format"
    ) in
  Term.(pure region_of_option $ region_argument)
end
module Tool_parameters
= struct
open Common
(** A named set of command-line parameters, as (option, value) pairs. *)
type t = {
  name: string;
  parameters: (string * string) list;
}

(** JSON object with the fields [name] (a string) and [parameters] (an
    object mapping each option to its value as a string). *)
let to_json t: Yojson.Basic.json =
  let json_parameters =
    List.map t.parameters ~f:(fun (key, value) -> key, `String value) in
  `Assoc [
    "name", `String t.name;
    "parameters", `Assoc json_parameters;
  ]

(** Flat list of command-line words: each option followed by its value, in
    the order of the parameters. *)
let render t =
  List.concat_map t.parameters ~f:(fun (flag, value) -> [flag; value])
end
module Workflow_utilities
= struct
open Common
module Remove = struct
  (* Shell command used to check that [path] is gone. The path is quoted so
     that the check sees the same file name as the [rm] program, which
     receives [path] as one separate argument.
     NOTE(review): assumes [Program.exec] passes each argument verbatim
     (quoted) to the shell -- confirm against Ketrew. *)
  let ls_command path = sprintf "ls %s" (Filename.quote path)

  (** [file ~run_with path] is a node without product which removes the file
      [path] with [rm -f] on the machine [run_with]. The node is considered
      done when [ls] on [path] returns 2. It is tagged as clean-up. *)
  let file ~run_with path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rm-%s" (Filename.basename path))
      ~ensures:(`Is_verified (`Command_returns (
          Command.shell ~host:Machine.(as_host run_with)
            (ls_command path),
          2)))
      ~make:(Machine.quick_run_program
               run_with Program.(exec ["rm"; "-f"; path]))
      ~tags:[Target_tags.clean_up]

  (** [directory ~run_with path] is like {!file} but removes [path]
      recursively ([rm -rf]). *)
  let directory ~run_with path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rmdir-%s" (Filename.basename path))
      ~ensures:(`Is_verified (`Command_returns (
          Command.shell ~host:Machine.(as_host run_with)
            (ls_command path),
          2
        )))
      ~make:(Machine.quick_run_program
               run_with Program.(exec ["rm"; "-rf"; path]))
      ~tags:[Target_tags.clean_up]

  (** [path_on_host ~host path] is a node without product which removes
      [path] recursively on [host], through a daemonized process; it has no
      completion condition. *)
  let path_on_host ~host path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rm-%s" (Filename.basename path))
      ~make:(daemonize ~using:`Python_daemon ~host
               Program.(exec ["rm"; "-rf"; path]))
end
module Gunzip = struct
  (** [concat ~run_with bunch_of_dot_gzs ~result_path] is a single-file node
      at [result_path] obtained by uncompressing the files of the nodes
      [bunch_of_dot_gzs] with one [gunzip -c] into [result_path] (whose
      directory is created first). The node depends on all the given nodes,
      and [result_path] is removed if the node fails. *)
  let concat ~(run_with : Machine.t) bunch_of_dot_gzs ~result_path =
    let open KEDSL in
    let quoted_inputs =
      List.map bunch_of_dot_gzs
        ~f:(fun o -> Filename.quote o#product#path)
      |> String.concat ~sep:" " in
    (* The redirection target is quoted like the inputs, so that a path
       containing blanks or shell metacharacters is written as one file. *)
    let quoted_result = Filename.quote result_path in
    let program =
      Program.(
        exec ["mkdir"; "-p"; Filename.dirname result_path]
        && shf "gunzip -c %s > %s" quoted_inputs quoted_result
      ) in
    let name =
      sprintf "gunzipcat-%s" (Filename.basename result_path) in
    workflow_node
      (single_file result_path ~host:Machine.(as_host run_with))
      ~name
      ~make:(Machine.run_stream_processor ~name run_with program)
      ~edges:(
        on_failure_activate Remove.(file ~run_with result_path)
        :: List.map ~f:depends_on bunch_of_dot_gzs)
end
module Cat = struct
(** [concat ~run_with bunch_of_files ~result_path] is a single-file node at
    [result_path] obtained by concatenating, with [cat], the files of the
    nodes [bunch_of_files] (the directory of [result_path] is created
    first). The node depends on all the given nodes, and [result_path] is
    removed if the node fails. *)
let concat ~(run_with : Machine.t) bunch_of_files ~result_path =
  let open KEDSL in
  let quoted_inputs =
    List.map bunch_of_files
      ~f:(fun o -> Filename.quote o#product#path)
    |> String.concat ~sep:" " in
  (* The redirection target is quoted like the inputs, so that a path
     containing blanks or shell metacharacters is written as one file. *)
  let quoted_result = Filename.quote result_path in
  let program =
    Program.(
      exec ["mkdir"; "-p"; Filename.dirname result_path]
      && shf "cat %s > %s" quoted_inputs quoted_result
    ) in
  let name =
    sprintf "concat-all-%s" (Filename.basename result_path) in
  workflow_node
    (single_file result_path ~host:Machine.(as_host run_with))
    ~name
    ~edges:(
      on_failure_activate Remove.(file ~run_with result_path)
      :: List.map ~f:depends_on bunch_of_files)
    ~make:(Machine.run_stream_processor run_with ~name program)
(** [cat_folder ~host ~run_program ?depends_on ~files_gzipped ~folder
    ~destination] is a single-file node at [destination] on [host] made of
    the contents of every file of [folder]: uncompressed with [gunzip -c]
    when [files_gzipped] is true, plainly concatenated with [cat]
    otherwise. The node depends on the nodes of [depends_on] (none by
    default), and [destination] is removed if the node fails. *)
let cat_folder ~host
    ~(run_program : Machine.Make_fun.t)
    ?(depends_on=[]) ~files_gzipped ~folder ~destination =
  (* Keep the argument under another name: opening [KEDSL] below brings its
     own [depends_on] into scope. *)
  let deps = depends_on in
  let open KEDSL in
  let name = "cat-folder-" ^ Filename.quote folder in
  let edges =
    on_failure_activate (Remove.path_on_host ~host destination)
    :: List.map ~f:depends_on deps in
  let quoted_folder = Filename.quote folder in
  let quoted_destination = Filename.quote destination in
  let program =
    if files_gzipped
    then Program.shf "gunzip -c %s/* > %s" quoted_folder quoted_destination
    else Program.shf "cat %s/* > %s" quoted_folder quoted_destination in
  workflow_node (single_file destination ~host)
    ~edges ~name
    ~make:(run_program ~name program)
end
module Download = struct
(** [wget_program ?output_filename url] is the program downloading [url]
    with [wget -O] into [output_filename], or into the base name of [url]
    when no file name is given. *)
let wget_program ?output_filename url =
  let target =
    match output_filename with
    | Some filename -> filename
    | None -> Filename.basename url in
  KEDSL.Program.exec ["wget"; "-O"; target; url]
(** [wget_to_folder ~host ~run_program ~test_file ~destination url] is a
    node downloading [url] into the directory [destination] (created first)
    with [wget -P]. Its product is the file [test_file] inside
    [destination]; [destination] is removed if the node fails. *)
let wget_to_folder
    ~host ~(run_program : Machine.Make_fun.t)
    ~test_file ~destination url =
  let open KEDSL in
  let name = "wget-" ^ Filename.basename destination in
  let product = single_file (destination // test_file) ~host in
  let download =
    Program.(
      exec ["mkdir"; "-p"; destination]
      && shf "wget %s -P %s"
        (Filename.quote url)
        (Filename.quote destination)) in
  let make =
    run_program ~name
      ~requirements:(Machine.Make_fun.downloading [])
      download in
  workflow_node product ~name ~make
    ~edges:[
      on_failure_activate (Remove.path_on_host ~host destination);
    ]
(** [wget ~host ~run_program url destination] builds a workflow node that
    downloads [url] into the single file [destination] with
    [wget <url> -O <destination>], after creating the parent directory of
    [destination].

    On failure the node activates [Remove.path_on_host ~host destination].
    The program is submitted with the [Machine.Make_fun.downloading]
    requirements. *)
let wget
    ~host ~(run_program : Machine.Make_fun.t)
    url destination =
  let open KEDSL in
  let name = "wget-" ^ Filename.basename destination in
  let fetch =
    Program.(
      exec ["mkdir"; "-p"; Filename.dirname destination]
      && shf "wget %s -O %s"
        (Filename.quote url) (Filename.quote destination))
  in
  let requirements = Machine.Make_fun.downloading [] in
  workflow_node (single_file destination ~host)
    ~name
    ~edges:[
      on_failure_activate (Remove.path_on_host ~host destination);
    ]
    ~make:(run_program ~name ~requirements fetch)
(** [wget_gunzip ~host ~run_program ~destination url] builds a workflow
    node producing the single file [destination] from [url].

    - If [url] ends with [".gz"], the URL is first downloaded with [wget]
      to [destination ^ ".gz"], and a second node (depending on the
      download) runs [gunzip -c] from that file into [destination]. On
      failure that node activates
      [Remove.path_on_host ~host destination].
    - Otherwise this is exactly [wget ~host ~run_program url destination]. *)
let wget_gunzip
    ~host ~(run_program : Machine.Make_fun.t)
    ~destination url =
  let open KEDSL in
  match Filename.check_suffix url ".gz" with
  | false -> wget ~host ~run_program url destination
  | true ->
    let compressed = destination ^ ".gz" in
    let name = "gunzip-" ^ Filename.basename compressed in
    let downloaded = wget ~host ~run_program url compressed in
    let decompress =
      Program.(
        shf "gunzip -c %s > %s"
          (Filename.quote downloaded#product#path)
          (Filename.quote destination))
    in
    let requirements = Machine.Make_fun.stream_processor [] in
    workflow_node (single_file destination ~host)
      ~name
      ~edges:[
        depends_on downloaded;
        on_failure_activate (Remove.path_on_host ~host destination);
      ]
      ~make:(run_program ~name ~requirements decompress)
(** [wget_bunzip2 ~host ~run_program ~destination url] builds a workflow
    node producing the single file [destination] from [url].

    - If [url] ends with [".bz2"], the URL is first downloaded with [wget]
      to [destination ^ ".bz2"], and a second node (depending on the
      download) runs [bunzip2 -c] from that file into [destination]. On
      failure that node activates
      [Remove.path_on_host ~host destination].
    - Otherwise this is exactly [wget ~host ~run_program url destination]. *)
let wget_bunzip2
    ~host ~(run_program : Machine.Make_fun.t)
    ~destination url =
  let open KEDSL in
  match Filename.check_suffix url ".bz2" with
  | false -> wget ~host ~run_program url destination
  | true ->
    let compressed = destination ^ ".bz2" in
    let name = "bunzip2-" ^ Filename.basename compressed in
    let downloaded = wget ~host ~run_program url compressed in
    let decompress =
      Program.(
        shf "bunzip2 -c %s > %s"
          (Filename.quote downloaded#product#path)
          (Filename.quote destination))
    in
    let requirements = Machine.Make_fun.stream_processor [] in
    workflow_node (single_file destination ~host)
      ~name
      ~edges:[
        depends_on downloaded;
        on_failure_activate (Remove.path_on_host ~host destination);
      ]
      ~make:(run_program ~name ~requirements decompress)
(** [wget_untar ~host ~run_program ~destination_folder ~tar_contains url]
    downloads [url] (with [wget]) to [destination_folder // "archive.tar"]
    and builds a node that extracts it into [destination_folder] with
    [tar -x].

    The node's product is the single file
    [destination_folder // tar_contains], i.e. [tar_contains] names a file
    expected to be present once the archive is extracted.
    On failure the node activates
    [Remove.path_on_host ~host destination_folder].

    The decompression flag passed to [tar] is chosen from the suffix of
    [url]: ["z"] for [.gz]/[.tgz], ["j"] for [.bz2]/[.tbz2]/[.tbz], and no
    flag otherwise. *)
let wget_untar
    ~host ~(run_program : Machine.Make_fun.t)
    ~destination_folder ~tar_contains url =
  let open KEDSL in
  let zip_flags =
    let ends_with suffix = Filename.check_suffix url suffix in
    (* Also recognise the usual short extensions of compressed tarballs,
       so that the flag does not rely on tar auto-detecting compression. *)
    if ends_with ".gz" || ends_with ".tgz" then "z"
    else if ends_with ".bz2" || ends_with ".tbz2" || ends_with ".tbz"
    then "j"
    else ""
  in
  let tar_filename = (destination_folder // "archive.tar") in
  let name = "untar-" ^ tar_filename in
  let wgot = wget ~host ~run_program url tar_filename in
  let file_in_tar = (destination_folder // tar_contains) in
  workflow_node
    (single_file file_in_tar ~host)
    ~edges:[
      depends_on (wgot);
      on_failure_activate (Remove.path_on_host ~host destination_folder);
    ]
    ~name
    ~make:(
      run_program ~name
        ~requirements:(Machine.Make_fun.stream_processor [])
        Program.(
          exec ["mkdir"; "-p"; destination_folder]
          && shf "tar -x%s -f %s -C %s"
            zip_flags
            (Filename.quote wgot#product#path)
            (Filename.quote destination_folder)))
(** Where a tool's file can be obtained from. These are the three cases
    matched by [get_tool_file]:
    - [`Scp src]: the file is copied from [src] with [scp];
    - [`Wget url]: the file is downloaded from [url] with [wget];
    - [`Fail msg]: the file cannot be fetched; the generated program
      echoes [msg] and exits with status 4. *)
type tool_file_location = [
| `Scp of string
| `Wget of string
| `Fail of string
]
(** [get_tool_file ~identifier ~run_program ~host ~install_path loc] builds
    a workflow node that fetches a tool's file into [install_path].

    The local file name is the basename of the source for [`Scp] and
    [`Wget], and ["cannot-get-<identifier>.file"] for [`Fail]; the node's
    product is the single file [install_path // <that name>].

    The program creates [install_path], then:
    - [`Scp src]: runs [scp src <local file>];
    - [`Wget url]: runs [wget url -O <local file>];
    - [`Fail msg]: prints an explanation containing [identifier] and [msg]
      and exits with status 4, so the node fails.

    On failure the node activates [Remove.path_on_host] on the local
    file. *)
let get_tool_file
    ~identifier
    ~(run_program : Machine.Make_fun.t)
    ~host ~install_path
    loc =
  let open KEDSL in
  let rm_path = Remove.path_on_host in
  let jar_name =
    match loc with
    | `Fail _ -> sprintf "cannot-get-%s.file" identifier
    | `Scp s -> Filename.basename s
    | `Wget s -> Filename.basename s in
  let local_box_path = install_path // jar_name in
  let cannot_download_message msg =
    sprintf "Cannot download file for %s: %s" identifier msg in
  workflow_node (single_file local_box_path ~host)
    ~name:(sprintf "get-%s" jar_name)
    ~edges:[
      on_failure_activate (rm_path ~host local_box_path)
    ]
    ~make:(
      run_program
        ~requirements:[
          `Internet_access;
          `Self_identification [identifier ^ "-instalation"; jar_name];
        ]
        Program.(
          (* [local_box_path] (which starts with [install_path]) is quoted
             in the commands below, so the directory must be quoted the
             same way for both to designate the same location. *)
          shf "mkdir -p %s" (Filename.quote install_path)
          && begin match loc with
            | `Fail msg ->
              (* Quote the whole message: splicing it between literal
                 single quotes breaks as soon as it contains a ['].  *)
              shf "echo %s" (Filename.quote (cannot_download_message msg))
              && sh "exit 4"
            | `Scp s ->
              shf "scp %s %s"
                (Filename.quote s) (Filename.quote local_box_path)
            | `Wget s ->
              shf "wget %s -O %s"
                (Filename.quote s) (Filename.quote local_box_path)
          end))
(** [gsutil_cp ~run_program ~host ~url ~local_path] builds a workflow node
    that copies [url] to the single file [local_path] by running
    [gsutil cp url local_path], after creating the parent directory of
    [local_path].

    On failure the node activates
    [Remove.path_on_host ~host local_path]. *)
let gsutil_cp
    ~(run_program : Machine.Make_fun.t)
    ~host ~url ~local_path =
  let open KEDSL in
  workflow_node (single_file ~host local_path)
    ~name:(sprintf "GSUtil-CP: %s" (Filename.basename local_path))
    ~edges:[
      on_failure_activate (Remove.path_on_host ~host local_path)
    ]
    ~make:(
      run_program
        ~requirements:[
          `Internet_access;
          `Self_identification ["gsutil-cp"; url];
        ]
        Program.(
          (* Quote the directory so that a path containing spaces or shell
             metacharacters is not split/interpreted by the shell. *)
          shf "mkdir -p %s" (Filename.quote (Filename.dirname local_path))
          && exec ["gsutil"; "cp"; url; local_path]
        )
    )
end
module Vcftools = struct
(** [vcf_process_n_to_1_no_machine ~host ~vcftools ~run_program ~vcfs
    ~make_product ~final_vcf command_prefix] builds a workflow node that
    runs [command_prefix <vcf paths...> > final_vcf], after initialising
    the [vcftools] tool.

    - [command_prefix] is inserted as-is (unquoted), so it may carry
      options, e.g. ["vcf-sort -c"].
    - [vcfs] are the input nodes; the path of each one's product is
      passed, quoted, on the command line.
    - [make_product] is applied to [final_vcf] to obtain the node's
      product.
    - [?more_edges] (default: none) are appended to the node's edges.

    The node depends on [Machine.Tool.ensure vcftools] and on every input
    node; on failure it activates
    [Remove.path_on_host ~host final_vcf]. *)
let vcf_process_n_to_1_no_machine
    ~host
    ~vcftools
    ~(run_program : Machine.Make_fun.t)
    ?(more_edges = [])
    ~vcfs
    ~make_product
    ~final_vcf
    command_prefix
  =
  let open KEDSL in
  let name = sprintf "%s-%s" command_prefix (Filename.basename final_vcf) in
  let make =
    run_program ~name
      Program.(
        Machine.Tool.(init vcftools)
        && shf "%s %s > %s"
          command_prefix
          (String.concat ~sep:" "
             (List.map vcfs ~f:(fun t -> Filename.quote t#product#path)))
          (* The inputs are quoted; quote the output too so that a path
             containing spaces or shell metacharacters is handled the
             same way. *)
          (Filename.quote final_vcf)
      ) in
  workflow_node ~name
    (make_product final_vcf)
    ~make
    ~edges:(
      on_failure_activate
        (Remove.path_on_host ~host final_vcf)
      :: depends_on Machine.Tool.(ensure vcftools)
      :: List.map ~f:depends_on vcfs
      @ more_edges)
(** [vcf_concat_no_machine ~host ~vcftools ~run_program ~make_product vcfs
    ~final_vcf] is [vcf_process_n_to_1_no_machine] specialised to the
    ["vcf-concat"] command: the products of [vcfs] are given to
    [vcf-concat] and its output is written to [final_vcf].
    [?more_edges] is forwarded unchanged. *)
let vcf_concat_no_machine
    ~host ~vcftools ~(run_program : Machine.Make_fun.t)
    ?more_edges ~make_product
    vcfs ~final_vcf =
  let command_prefix = "vcf-concat" in
  vcf_process_n_to_1_no_machine
    ~host ~vcftools ~run_program ?more_edges
    ~vcfs ~make_product ~final_vcf
    command_prefix
(** [vcf_sort_no_machine ~host ~vcftools ~run_program ~make_product ~src
    ~dest ()] is [vcf_process_n_to_1_no_machine] specialised to the
    ["vcf-sort -c"] command, with [src] as the only input node and [dest]
    as the output path.

    [run_program] is first wrapped with
    [Machine.Make_fun.with_requirements] to add the [`Memory `Big]
    requirement. [?more_edges] is forwarded unchanged. *)
let vcf_sort_no_machine
    ~host ~vcftools ~(run_program : Machine.Make_fun.t)
    ?more_edges ~make_product
    ~src ~dest () =
  let big_memory_run_program =
    Machine.Make_fun.with_requirements run_program [`Memory `Big] in
  vcf_process_n_to_1_no_machine
    ~host ~vcftools ~run_program:big_memory_run_program ?more_edges
    ~make_product ~vcfs:[src] ~final_vcf:dest
    "vcf-sort -c"
end
module Variable_tool_paths = struct
(** [single_file ~run_with ~tool path] is an object whose [is_done] method
    is a condition satisfied when the shell command
    [<init of tool> && test -e path] returns 0 on the host of [run_with]
    (obtained with [~with_shell:"bash"]).

    NOTE(review): [path] is interpolated unquoted, presumably so that
    shell variables set by the tool's initialisation can be expanded in
    it; confirm before adding any quoting. *)
let single_file ~run_with ~tool path =
  let open KEDSL in
  let init = Machine.Tool.init tool in
  let bash_host = Machine.as_host ~with_shell:"bash" run_with in
  let existence_test = Program.(init && shf "test -e %s" path) in
  let shell_command =
    Ketrew_pure.Program.to_single_shell_command existence_test in
  let condition = KEDSL.Command.shell ~host:bash_host shell_command in
  object
    method is_done = Some (`Command_returns (condition, 0))
  end
end
end