(* code generated with [./tools/build-doc.sh ketrew,ppx_deriving.std] *) module Common  = struct
(**************************************************************************)
(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *) (*                                                                        *) (*  Licensed under the Apache License, Version 2.0 (the "License");       *) (*  you may not use this file except in compliance with the License.      *) (*  You may obtain a copy of the License at                               *) (*                                                                        *) (*      http://www.apache.org/licenses/LICENSE-2.0                        *) (*                                                                        *) (*  Unless required by applicable law or agreed to in writing, software   *) (*  distributed under the License is distributed on an "AS IS" BASIS,     *) (*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *) (*  implied.  See the License for the specific language governing         *) (*  permissions and limitations under the License.                        *)
(**************************************************************************)
(** Module opened by default (like Pervasives) for our library. *)
(** A Non-standard mini library. *)
include Nonstd
(** A String module with more capabilities *)
module String = struct
  (* Everything below comes from Sosa's native-string implementation; this
     module shadows the standard [String] for the rest of the library. *)
  include Sosa.Native_string
end

(* Infix form of [Filename.concat]. *)
let ( // ) dirname filename = Filename.concat dirname filename
(** [path // filename] appends [filename] to [path] (infix [Filename.concat]). *)
(* Global debug switch. It is initialised to [true] only when the environment
   variable BIOKEPI_DEBUG is set to exactly "true"; an unset variable means
   [false]. Only [Not_found] (the one failure expected from [Sys.getenv]) is
   handled, so unrelated exceptions are no longer silently swallowed. *)
let debug_mode =
  ref (
    match Sys.getenv "BIOKEPI_DEBUG" with
    | value -> value = "true"
    | exception Not_found -> false
  )

(* [dbg fmt ...] formats a message printf-style and, when [!debug_mode] is
   set, writes it to stderr prefixed with "biokepi-debug: " and flushes.
   When debugging is off the message is formatted but not printed. *)
let dbg fmt =
  ksprintf
    (fun s -> if !debug_mode then eprintf "biokepi-debug: %s\n%!" s)
    fmt
(** A consistent debugging mechanism. *)
let failwithf fmt =
  (* Render the printf-style message first, then raise [Failure] with it. *)
  ksprintf (fun message -> failwith message) fmt
(** A formatted failwith. *)
module Unique_id = struct
  (* Re-export of [Ketrew_pure.Internal_pervasives.Unique_id], unchanged. *)
  include Ketrew_pure.Internal_pervasives.Unique_id
end
(**

Generate unique filenames for a given set of uniquely identifying properties.

The module generates names that remain (way) under 255 bytes because that is the most common file-system limit in 2016.

*)

module Name_file = struct

  (* Additional safety: we check that there are no hash-duplicates
     within a given execution of a program linking with Biokepi.
     Maps each generated file name to the components it was built from. *)
  let db : (string, string list) Hashtbl.t = Hashtbl.create 42

  (* [path ~readable_suffix ~from high_level_components] builds a file path
     whose base name is: the hex MD5 digest of [readable_suffix] and all the
     components (joined with "-"), followed by as many components as fit,
     followed by [readable_suffix].

     - [from] is either [`Path p] (the extension-less base name of [p]
       becomes the first component and the result lives in [p]'s directory)
       or [`In_dir d] (the result lives in [d]).
     - Every character of [high_level_components] outside [a-zA-Z0-9-] is
       replaced with '_'.

     Raises [Failure] when the resulting name is longer than 220 characters,
     or when the same name was already produced in this process for a
     different set of components. *)
  let path ~readable_suffix ~from high_level_components =
    let sanitize =
      String.map
        ~f:(function
          | ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-') as c -> c
          | _ -> '_')
    in
    let components =
      begin match from with
      | `Path p ->
        let b = Filename.basename p in
        (* [chop_extension] raises [Invalid_argument] when there is no
           extension; in that case the base name is used as is. *)
        (try [Filename.chop_extension b] with Invalid_argument _ -> [b])
      | `In_dir _ -> []
      end
      @
      List.map high_level_components ~f:sanitize
    in
    let hash =
      String.concat ~sep:"-" (readable_suffix :: components)
      |> Digest.string |> Digest.to_hex
    in
    let max_length = 220 in
    let buf = Buffer.create max_length in
    Buffer.add_string buf hash;
    (* Append components in order, stopping at the first one that would not
       leave enough room for the suffix. *)
    let rec append_components =
      function
      | [] -> ()
      | one :: more ->
        if
          Buffer.length buf + String.length readable_suffix
          + String.length one < max_length
        then (Buffer.add_string buf one; append_components more)
        else ()
    in
    append_components components;
    Buffer.add_string buf readable_suffix;
    let name = Buffer.contents buf in
    begin if String.length name > max_length then
        ksprintf failwith "Name_file: filename too long %s (max: %d)"
          name max_length
    end;
    begin match Hashtbl.find db name with
    | some
      when List.sort ~cmp:String.compare some
           = List.sort ~cmp:String.compare components -> ()
    | some ->
      ksprintf failwith "Duplicate filename for different components\nFilename: %s\nPrevious: [%s]\nNew: [%s]\n"
        name (String.concat ~sep:", " some) (String.concat ~sep:", " components)
    | exception Not_found ->
      (* First time this name is produced: remember its components. *)
      Hashtbl.add db name components
    end;
    begin match from with
    | `In_dir s -> s // name
    | `Path p -> Filename.dirname p // name
    end

  (* Same as [path] with [~from:(`Path p)]. *)
  let from_path ~readable_suffix p c =
    path ~readable_suffix ~from:(`Path p) c

  (* Same as [path] with [~from:(`In_dir p)]. *)
  let in_directory ~readable_suffix p c =
    path ~readable_suffix ~from:(`In_dir p) c
end
(**

This is an experimental extension of Ketrew's EDSL. If we're happy with it we'll push it upstream.

The idea is to carry around a type parameter to have arbitrary products.

*)

module KEDSL = struct

  include Ketrew.EDSL
  module Command = Ketrew_pure.Target.Command

  (** The product of a node that produces nothing: it has no condition. *)
  type nothing = < is_done : Condition.t option >
  let nothing  = object method is_done = None end

  (* NOTE(review): these presumably shadow functions of the same name coming
     from [Ketrew.EDSL] so that their use no longer type-checks as a target;
     confirm against the Ketrew API. *)
  let target _ = `Please_KEDSL_workflow
  let file_target _ = `Please_KEDSL_workflow

  type file_workflow = single_file workflow_node
  type phony_workflow = nothing workflow_node

  (** One or two (paired-end) FASTQ files together with sample metadata. *)
  type fastq_reads = <
    is_done: Ketrew_pure.Target.Condition.t option;
    paths : string * (string option);
    r1 : single_file;
    r2 : single_file option;
    sample_name: string;
    escaped_sample_name: string;
    fragment_id: string option;
    fragment_id_forced: string;
  >

  (** Build a {!fastq_reads} product from the path [r1] and the optional
      path [r2_opt]. The sample name and the forced fragment id both default
      to the base name of [r1]. *)
  let fastq_reads ?host ?name ?fragment_id r1 r2_opt : fastq_reads =
    object (self)
      val r1_file = single_file ?host r1
      val r2_file_opt = Option.map r2_opt ~f:(single_file ?host)
      method r1 = r1_file
        (* workflow_node r1_file
          ~name:(sprintf "File: %s" (Filename.basename r1_file#path)) *)
      method r2 =  r2_file_opt
        (* Option.map r2_file_opt ~f:(fun r2_file ->
            workflow_node r2_file
              ~name:(sprintf "File: %s" (Filename.basename r2_file#path))
          ) *)
      method paths = (r1, r2_opt)
      method is_done =
        (* NOTE(review): the single-end case lists the same condition twice;
           kept as is so that the condition value is unchanged. *)
        Some (match r2_file_opt with
          | Some r2 -> `And [r1_file#exists; r2#exists]
          | None -> `And [r1_file#exists; r1_file#exists;])
      method sample_name =
        Option.value name ~default:(Filename.basename r1)
      method fragment_id = fragment_id
      method fragment_id_forced =
        Option.value fragment_id ~default:(Filename.basename r1)
      method escaped_sample_name =
        (* Anything outside [0-9a-zA-Z_-] becomes '_'. *)
        String.map self#sample_name ~f:(function
          | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '-' | '_' as c -> c
          | _ -> '_')
    end

  (** Make new reads at paths [r1]/[r2_opt], reusing the host of
      [fq_reads#r1] and, unless overridden, the sample name and fragment id
      of [fq_reads]. Note that [?fragment_id], when given, is itself a
      [string option]. *)
  let transform_fastq_reads
      ?name ?fragment_id
      (fq_reads: fastq_reads) r1 r2_opt
    : fastq_reads
    =
    fastq_reads
      ~host:fq_reads#r1#host
      ~name:(match name with Some n -> n | None -> fq_reads#sample_name)
      ?fragment_id:(
        match fragment_id with
        | Some fi -> fi
        | None -> fq_reads#fragment_id)
      r1 r2_opt

  (** Node whose product is the first read file of [fq]; depends on [fq]. *)
  let read_1_file_node (fq : fastq_reads workflow_node) =
    let product = fq#product#r1 in
    workflow_node product
      ~name:(sprintf "READ1 of %s-%s"
               fq#product#sample_name
               fq#product#fragment_id_forced)
      ~equivalence:`None
      ~edges:[depends_on fq]

  (** Same as {!read_1_file_node} for the second read file; [None] when
      [fq] has no second file. *)
  let read_2_file_node (fq : fastq_reads workflow_node) =
    Option.map fq#product#r2 ~f:(fun product ->
        workflow_node product
          ~name:(sprintf "READ2 of %s-%s"
                   fq#product#sample_name
                   fq#product#fragment_id_forced)
          ~equivalence:`None
          ~edges:[depends_on fq]
      )

  (** Create a fastq_reads workflow_node from one or two single_file workflow_node(s). *)
  let fastq_node_of_single_file_nodes
      ~host ~name ?fragment_id fastq_r1 fastq_r2 =
    let product =
      let r2 = Option.map fastq_r2 ~f:(fun r -> r#product#path) in
      fastq_reads ~host ~name ?fragment_id fastq_r1#product#path r2
    in
    let edges =
      match fastq_r2 with
      | Some r2 -> [depends_on fastq_r1; depends_on r2]
      | None -> [depends_on fastq_r1]
    in
    workflow_node product
      ~equivalence:`None
      ~name:(sprintf "Assembled-fastq: %s (%s)"
               name (Option.value fragment_id
                       ~default:(Filename.basename fastq_r1#product#path)))
      ~edges

  (** A single file at [path] on the same host as [f]. *)
  let transform_single_file ~path f =
      single_file ~host:f#host path

  (** A BAM file with its sample name, sorting and reference build. *)
  type bam_file = <
    is_done: Ketrew_pure.Target.Condition.t option;
    host: Host.t;
    path : string;
    sample_name: string;
    escaped_sample_name: string;
    sorting: [ `Coordinate | `Read_name ] option;
    reference_build: string;
  >

  (** Build a {!bam_file} product for [path] on [host]. Without [?name] the
      sample name is the base name of [path] minus its extension (or the
      whole base name when it has no extension). *)
  let bam_file ~host ?name ?sorting ~reference_build path : bam_file =
    object (self)
      val file = single_file ~host path
      method host = host
      method sample_name =
        (* The default is computed only when needed: [chop_extension]
           raises [Invalid_argument] on extension-less names, which must not
           break callers that did provide [~name]. *)
        match name with
        | Some n -> n
        | None ->
          let base = Filename.basename path in
          (try Filename.chop_extension base with Invalid_argument _ -> base)
      method escaped_sample_name =
        (* Anything outside [0-9a-zA-Z_-] becomes '_'. *)
        String.map self#sample_name ~f:(function
          | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '-' | '_' as c -> c
          | _ -> '_')
      method path = file#path
      method is_done = file#is_done
      method sorting = sorting
      method reference_build = reference_build
    end

  (** Make a new bam sharing most of the metadata. *)
  let transform_bam ?change_sorting (bam : bam_file) ~path : bam_file =
    (* Host and reference build are copied; the sample name is not passed
       on, so it is derived again from the new [path]. *)
    bam_file
      ~host:bam#host
      ?sorting:(
        match change_sorting with
        | Some new_sorting -> Some new_sorting
        | None -> bam#sorting
      )
      ~reference_build:bam#reference_build
      path

  (** A list of BAM files seen as one product. *)
  type bam_list = <
    is_done:  Ketrew_pure.Target.Condition.t option;
    bams: bam_file list;
  >

  (** Group [bams]; the condition is the conjunction of every BAM's
      condition. Computing [is_done] fails (through [Option.value_exn]) if
      one of the BAMs has no condition. *)
  let bam_list (bams : bam_file list) : bam_list =
    object
      method bams = bams
      method is_done =
        Some (
          `And (List.map bams
                  ~f:(fun b ->
                      b#is_done
                      |> Option.value_exn ~msg:"Bams should have a Condition.t"))
        )
    end

  (** One node per BAM of the list, each depending on [bln]. *)
  let explode_bam_list_node (bln : bam_list workflow_node) =
    List.map bln#product#bams ~f:(fun bam ->
        workflow_node bam
          ~name:(Filename.basename bam#path)
          ~tags:["expolode_bam_list_node"]
          ~edges:[depends_on bln]
          ~equivalence:`None)

  (* this may be overkill: *)
  type _ bam_or_bams =
    | Single_bam: bam_file workflow_node -> bam_file workflow_node bam_or_bams
    | Bam_workflow_list: bam_file workflow_node list -> bam_list workflow_node bam_or_bams

  (** A VCF file with its host and reference build. *)
  type vcf_file = <
    is_done: Ketrew_pure.Target.Condition.t option;
    host: Host.t;
    path : string;
    reference_build: string;
    as_single_file: single_file product;
  >

  (** Build a {!vcf_file} product for [path] on [host]. *)
  let vcf_file ~host ~reference_build path : vcf_file =
    object
      val file = single_file ~host path
      method host = host
      method path = file#path
      method is_done = file#is_done
      method reference_build = reference_build
      method as_single_file = file
    end

  (** A VCF at [path] with the host and reference build of [vcf]. *)
  let transform_vcf vcf ~path =
    vcf_file ~host:vcf#host ~reference_build:vcf#reference_build path

  (** Submit the workflow [w] through [Ketrew.Client.submit_workflow]. *)
  let submit w = Ketrew.Client.submit_workflow w

end
(** An attempt at standardizing “tags.” *)
module Target_tags = struct
  (* Tag strings attached to workflow nodes. *)
  let aligner = "aligner"
  let variant_caller = "variant-caller"
  let clean_up = "clean-up"
end

end
module Hla_utilities  = struct
(* Hassle-free HLA/MHC handling *)
open Common

(* The epitope-binding predictors known to this module. *)
type predictor_type = [
  | `NetMHC
  | `NetMHCpan
  | `NetMHCIIpan
  | `NetMHCcons
  | `Random
  | `SMM
  | `SMM_PMBEC
  | `NetMHCpan_IEDB
  | `NetMHCcons_IEDB
  | `SMM_IEDB
  | `SMM_PMBEC_IEDB
]

(* Lower-case, dash-separated name of a predictor. *)
let predictor_to_string predictor =
  match predictor with
  | `NetMHC -> "netmhc"
  | `NetMHCpan -> "netmhcpan"
  | `NetMHCIIpan -> "netmhciipan"
  | `NetMHCcons -> "netmhccons"
  | `Random -> "random"
  | `SMM -> "smm"
  | `SMM_PMBEC -> "smm-pmbec"
  | `NetMHCpan_IEDB -> "netmhcpan-iedb"
  | `NetMHCcons_IEDB -> "netmhccons-iedb"
  | `SMM_IEDB -> "smm-iedb"
  | `SMM_PMBEC_IEDB -> "smm-pmbec-iedb"

(* For the four predictors backed by a tool of the machine [run_with],
   return [Some (ensure, init)] for that tool; [None] for all the others. *)
let predictor_to_tool ~run_with predictor =
  let ensure_and_init tool_name =
    let definition = Machine.Tool.Definition.create tool_name in
    let tool = Machine.get_tool run_with definition in
    (Machine.Tool.ensure tool, Machine.Tool.init tool)
  in
  let tool_name =
    match predictor with
    | `NetMHC -> Some "netMHC"
    | `NetMHCpan -> Some "netMHCpan"
    | `NetMHCIIpan -> Some "netMHCIIpan"
    | `NetMHCcons -> Some "netMHCcons"
    | _ -> None
  in
  (* The tool is only looked up when the predictor actually needs one. *)
  Option.map tool_name ~f:ensure_and_init

(*
   Example input (all in):
     1,A*03:01,,0.026478,samplename
     1,B*37:01',,0.000000,samplename
     1, C*06:02,,0.000086,samplename
     2,DQA1*01:02 ,,0.000000,samplename

   Example output (type-I predictions out as a plain list):
     A*03:01
     B*37:01
     C*06:02
     C*07:02

   So the following "one-liner"
     - extracts the second column
     - trims the allele names from white-spaces around them
     - gets rid of 's at the end of the allele names
     - removes type-II predictions (since we don't support
       type-II predictions)
   to be able to feed the file into `mhctools` based utilities,
   including topiary, vaxrank, and netmhctools itself.
*)
let sanitize_hlarp_out_for_mhctools ~run_with ~hlarp_result ~output_path =
  let open KEDSL in
  let input_path = hlarp_result#product#path in
  (* The pipeline writes to a temporary file which is then moved in place. *)
  let tmp_path = output_path ^ ".tmp" in
  let program =
    Program.shf "cat %s | grep -v '^2' | awk -F , '{ gsub(/^[ \t]+|[ \t]+$/,\"\", $2); print $2}' | tail -n +2 | sed \"s/'//\" > %s && mv %s %s"
      input_path tmp_path tmp_path output_path
  in
  let make =
    Machine.run_program run_with ~requirements:[ `Quick_run ] program
  in
  workflow_node
    (single_file ~host:(Machine.as_host run_with) output_path)
    ~name:(sprintf "Extract and sanitize alleles: %s" hlarp_result#render#name)
    ~make
    ~edges:[ depends_on hlarp_result ]

end
module Machine  = struct
(**************************************************************************)
(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *) (*                                                                        *) (*  Licensed under the Apache License, Version 2.0 (the "License");       *) (*  you may not use this file except in compliance with the License.      *) (*  You may obtain a copy of the License at                               *) (*                                                                        *) (*      http://www.apache.org/licenses/LICENSE-2.0                        *) (*                                                                        *) (*  Unless required by applicable law or agreed to in writing, software   *) (*  distributed under the License is distributed on an "AS IS" BASIS,     *) (*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *) (*  implied.  See the License for the specific language governing         *) (*  permissions and limitations under the License.                        *)
(**************************************************************************)
open Common
open KEDSL

module Tool = struct

  module Definition = struct
    (* A tool is identified by its name and an optional version. *)
    type t = {name: string; version: string option}

    let create ?version name  = {name; version}

    (* "<name>.<version>", with "NOVERSION" standing for a missing version. *)
    let to_opam_name t =
      let version_string =
        match t.version with
        | Some v -> v
        | None -> "NOVERSION"
      in
      sprintf "%s.%s" t.name version_string

    let to_string = to_opam_name
    let to_directory_name = to_opam_name
    let get_version {version; _} = version
    let get_name {name; _} = name
  end

  (* Definitions of the tools, with the versions used by default. *)
  module Default = struct
    open Definition
    let bwa = create ~version:"0.7.10" "bwa"
    let freebayes = create ~version:"1.1.0" "freebayes"
    let sambamba = create ~version:"0.6.5" "sambamba"
    let samtools = create ~version:"1.4" "samtools"
    let bcftools = create ~version:"1.4" "bcftools"
    let vcftools = create ~version:"0.1.12b" "vcftools"
    let bedtools = create ~version:"2.23.0" "bedtools"
    let somaticsniper = create ~version:"1.0.3" "somaticsniper"
    let varscan = create ~version:"2.3.5" "varscan"
    let mutect = create "mutect" (* We don't know the versions of the users' GATKs *)
    let gatk = create "gatk" (* idem, because of their non-open-source licenses *)
    let strelka = create ~version:"1.0.14" "strelka"
    let virmid = create ~version:"1.1.1" "virmid"
    let muse = create ~version:"1.0b" "muse"
    let star = create ~version:"2.4.1d" "star"
    let stringtie = create ~version:"1.2.2" "stringtie"
    let cufflinks = create ~version:"2.2.1" "cufflinks"
    let hisat = create ~version:"0.1.6-beta" "hisat"
    let hisat2 = create ~version:"2.0.2-beta" "hisat"
    let mosaik = create ~version:"2.2.3" "mosaik"
    let kallisto = create ~version:"0.42.3" "kallisto"
    let bowtie = create ~version:"1.1.2" "bowtie"
    let fastqc = create ~version:"0.11.5" "fastqc"
    let igvxml = create ~version:"0.1.0" "igvxml"
    let hlarp = create ~version:"biokepi-branch" "hlarp"
    let samblaster = create ~version:"v.0.1.22" "samblaster"
    let delly2 = create ~version:"0.7.7" "delly2"
    (* Bioconda *)
    let optitype = create ~version:"1.2.1-0" "optitype"
    let seqtk = create ~version:"1.2" "seqtk"
    let seq2hla = create ~version:"2.2" "seq2hla"
    let picard = create ~version:"2.9.2" "picard"
    let snpeff = create ~version:"4.3.1m-0" "snpeff"
    (* PyPI packages *)
    let pyensembl = create ~version:"1.1.0" "pyensembl"
    let vcfannotatepolyphen = create ~version:"0.1.2" "vcf-annotate-polyphen"
    let topiary = create ~version:"1.2.1" "topiary"
    let vaxrank = create ~version:"0.6.0" "vaxrank"
    let isovar = create ~version:"0.7.0" "isovar"
  end

  (* An installed tool: its definition, the program that initialises it and
     the workflow node that ensures it is available. *)
  type t = {
    definition: Definition.t;
    init: Program.t;
    ensure: phony_workflow;
  }

  (* Without [?init] the tool gets an "echo" program; without [?ensure] it
     gets a node named "<tool>-ensured" carrying the [nothing] product. Both
     defaults are built whether or not they end up being used. *)
  let create ?init ?ensure definition =
    let tool_name = Definition.to_string definition in
    let default_init = Program.shf "echo 'Tool %s: default init'" tool_name in
    let default_ensure =
      workflow_node nothing ~name:(sprintf "%s-ensured" tool_name)
    in
    {
      definition;
      init = Option.value init ~default:default_init;
      ensure =
        Option.value_map ensure ~f:KEDSL.forget_product ~default:default_ensure;
    }

  let init {init; _} = init
  let ensure {ensure; _} = ensure

  module Kit = struct
    type tool = t

    (* A toolkit maps a definition to the tool providing it, if any. *)
    type t = Definition.t -> tool option

    (* The first kit of the list that knows the definition wins. *)
    let concat : t list -> t =
      fun kits def -> List.find_map kits ~f:(fun kit -> kit def)

    (* Toolkit made of the given tools, looked up by structural equality of
       their definitions. *)
    let of_list l : t =
      fun def -> List.find l ~f:(fun {definition; _} -> definition = def)

    (* Like applying the kit, but fails with [failwithf] when the tool is
       missing. *)
    let get_exn t tool =
      match t tool with
      | None ->
        failwithf "Toolkit cannot provide the tool %s"
          (Definition.to_string tool)
      | Some found -> found
  end
end
(** Jobs in Biokepi ask the computing environment (defined below in Machine) for resources.

The implementation of the Make_fun.t function defined by the user is free to interpret those requirements according to the user's computing infrastructure. *)

module Make_fun = struct

  module Requirement = struct
    type t = [
      | `Processors of int
          (** A number of cores on a shared-memory setting. *)
      | `Internet_access
        (** Able to access public HTTP(S) or FTP URLs. *)
      | `Memory of [
          | `GB of float
              (** Ask for a specific amount of memory. *)
          | `Small
            (** Tell that the program does not expect HPC-like memory usage (i.e. not more than 2 GB or your usual laptop). *)
          | `Big
            (** Tell that the program may ask for a lot of memory but you don't know how much precisely. *)
        ]
      | `Quick_run
        (** Programs that run fast, with little resources. Usually, you can interpret this as "OK to run on the login node of my cluster." *)
      | `Spark of string list
           (** Ask for a Spark (on-YARN) environment with custom parameters (not in use for now, "#WIP"). *)
      | `Custom of string
          (** Pass arbitrary data (useful for temporary extensions/experiments outside of Biokepi). *)
      | `Self_identification of string list
        (** Set of names or tags for a workflow-node program to identify itself to the Machine.t. This is useful for quickly bypassing incorrect requirements set in the library (please also report an issue if you need this). *)
    ] [@@deriving yojson, show]
  end

  type t =
    ?name: string ->
    ?requirements: Requirement.t list ->
    Program.t ->
    KEDSL.Build_process.t
  (** The type of the “run function” used across the library. *)

  (** A stream processor, for this purpose, is a program that runs on one core and does not grow in memory arbitrarily. *)
  let stream_processor requirements =
    [`Processors 1; `Memory `Small] @ requirements

  (* Prepend the [`Quick_run] requirement. *)
  let quick requirements = `Quick_run :: requirements

  (* A download is a stream processor that also needs internet access. *)
  let downloading requirements =
    let base = stream_processor requirements in
    `Internet_access :: base

  (* Prepend a [`Self_identification] requirement when [?self_ids] is given;
     return the list untouched otherwise. *)
  let with_self_ids ?self_ids l =
    Option.value_map self_ids
      ~default:l
      ~f:(fun tags -> `Self_identification tags :: l)

  (* Wrap the run function [f] so that the requirements [l] are always put
     in front of the ones given at call time. *)
  let with_requirements : t -> Requirement.t list -> t =
    fun f l ?name ?(requirements = []) prog ->
      f ?name ~requirements:(l @ requirements) prog
end

(* Description of a computing environment: where it is, which tools and
   reference genomes it provides, and how programs are run on it. *)
type t = {
  name: string;
  host: Host.t;
  pyensembl_cache_dir: string option;
  get_reference_genome: string -> Reference_genome.t;
  toolkit: Tool.Kit.t;
  run_program: Make_fun.t;
  work_dir: string;
  max_processors: int;
}

let create
    ~host ?pyensembl_cache_dir ~get_reference_genome ~toolkit
    ~run_program ~work_dir ~max_processors  name =
  {
    name;
    host;
    pyensembl_cache_dir;
    get_reference_genome;
    toolkit;
    run_program;
    work_dir;
    max_processors;
  }

let name {name; _} = name

(* The host of the machine. With [?with_shell], the host URI is rebuilt with
   its "shell" query parameter replaced by "<shell>,-c" and parsed again. *)
let as_host ?with_shell t =
  match with_shell with
  | Some shell ->
    let open Ketrew_pure in
    let shell_key = "shell" in
    let shell_value = sprintf "%s,-c" shell in (* as in `bash -c` *)
    let uri =
      Host.to_uri t.host
      |> (fun u -> Uri.remove_query_param u shell_key)
      |> (fun u -> Uri.add_query_param u (shell_key, [shell_value]))
    in
    KEDSL.Host.parse (Uri.to_string uri)
  | None -> t.host

let get_pyensembl_cache_dir {pyensembl_cache_dir; _} = pyensembl_cache_dir

let get_reference_genome {get_reference_genome; _} = get_reference_genome

(* Look the tool up in the machine's toolkit; fails with [failwithf] when
   the toolkit does not provide it. *)
let get_tool t tool =
  match t.toolkit tool with
  | None ->
    failwithf "Machine %S cannot provide the tool %s"
      t.name (Tool.Definition.to_string tool)
  | Some found -> found

let run_program {run_program; _} = run_program

let max_processors {max_processors; _} = max_processors
(** Get the maximum number of processors that a single job can use in the Machine.t (i.e. usually the “number-of-threads” parameters of most tools) *)
let quick_run_program t : Make_fun.t =
  (* The machine's run function with [`Quick_run] always requested. *)
  let requirements = Make_fun.quick [] in
  Make_fun.with_requirements t.run_program requirements
(** Run a program that does not use much memory and runs on one core. *)
let run_stream_processor ?self_ids t : Make_fun.t =
  (* Stream-processor requirements, plus the self-identification tags when
     [?self_ids] is provided. *)
  let requirements =
    Make_fun.with_self_ids ?self_ids (Make_fun.stream_processor [])
  in
  Make_fun.with_requirements t.run_program requirements
(** Run a program that does not use much memory, runs on one core, and needs the internet. *)
let run_download_program t : Make_fun.t =
  Make_fun.with_requirements t.run_program (Make_fun.downloading [])

(* Run a program that may need a lot of memory ([`Memory `Big]) on
   [?processors] cores (default: 1), optionally identifying itself with
   [?self_ids]. The two requirements are separate elements of the list. *)
let run_big_program t :
  ?processors: int -> ?self_ids : string list -> Make_fun.t =
  fun ?(processors = 1) ?self_ids ->
    Make_fun.with_requirements
      t.run_program
      (Make_fun.with_self_ids ?self_ids
         [`Memory `Big; `Processors processors])

(* The working directory configured for the machine. *)
let work_dir t = t.work_dir

end
module Metadata  = struct
(** Metadata Module Generated by the Build System *)
(** Official version string of the current build *)
let version : string Lazy.t = lazy "0.0.0+dev"
(** Current Git commit (if available at build-time) *)
let git_commit : string option =
  Some "c07982e85f581fc4972b5a6f8d601649e56a4f34"
(** Current result of "git describe" (if available at build-time) *)
let git_description : string option =
  Some "biokepi.0.0.0-794-gc07982e"

end
module Reference_genome  : sig
(**************************************************************************)
(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *) (*                                                                        *) (*  Licensed under the Apache License, Version 2.0 (the "License");       *) (*  you may not use this file except in compliance with the License.      *) (*  You may obtain a copy of the License at                               *) (*                                                                        *) (*      http://www.apache.org/licenses/LICENSE-2.0                        *) (*                                                                        *) (*  Unless required by applicable law or agreed to in writing, software   *) (*  distributed under the License is distributed on an "AS IS" BASIS,     *) (*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *) (*  implied.  See the License for the specific language governing         *) (*  permissions and limitations under the License.                        *)
(**************************************************************************)
(** Representation of Reference Genomes *)
open Common

type name = string

module Specification : sig

  module Location : sig
    (* Where to get a file from, and how to unpack or assemble it. *)
    type t = [
      | `Url of string
      | `Vcf_concat of (string * t) list (* name × location *)
      | `Concat of t list
      | `Gunzip of t (* Should this be guessed from the URL extension? *)
      | `Bunzip2 of t
      | `Untar of t
    ]
    (* One constructor function per variant of [t]. *)
    val url : 'a -> [> `Url of 'a ]
    val vcf_concat : 'a -> [> `Vcf_concat of 'a ]
    val concat : 'a -> [> `Concat of 'a ]
    val gunzip : 'a -> [> `Gunzip of 'a ]
    val bunzip2 : 'a -> [> `Bunzip2 of 'a ]
    val untar : 'a -> [> `Untar of 'a ]
  end

  (* Static description of a reference genome; values can only be built
     with [create]. *)
  type t = private {
    name : name;
    ensembl : int;
    species : string;
    metadata : string option;
    fasta : Location.t;
    dbsnp : Location.t option;
    known_indels : Location.t option;
    cosmic : Location.t option;
    exome_gtf : Location.t option;
    cdna : Location.t option;
    whess : Location.t option;
    major_contigs : string list option;
    snpeff_name : string option;
  }

  val create :
    ?metadata:string ->
    fasta:Location.t ->
    ensembl:int ->
    species:string ->
    ?dbsnp:Location.t ->
    ?known_indels:Location.t ->
    ?cosmic:Location.t ->
    ?exome_gtf:Location.t ->
    ?cdna:Location.t ->
    ?whess:Location.t ->
    ?major_contigs:string list ->
    ?snpeff_name:string ->
    string ->
    t

  module Default :
  sig
    module Name : sig
      
      (** The “names” of the default genomes; the values are provided to simplify code and make it less typo-error-prone but the string can be used directly (e.g. b37 is just "b37"). *)
      val b37 : name
      val b37decoy : name
      val b38 : name
      val hg38: name
      val hg18 : name
      val hg19 : name
      val mm10 : name
    end
    val b37 : t
    val b37decoy : t
    val b38 : t
    val hg38 : t
    val hg18 : t
    val hg19 : t
    val mm10 : t
  end
end

type t = private {
  specification: Specification.t;
  location : KEDSL.file_workflow;
  cosmic :  KEDSL.file_workflow option;
  dbsnp :  KEDSL.file_workflow option;
  known_indels : KEDSL.file_workflow option;
  gtf : KEDSL.file_workflow option;
  cdna : KEDSL.file_workflow option;
  whess : KEDSL.file_workflow option;
}
(** A reference genome has a name (for display/matching) and a cluster-dependent path. Corresponding Cosmic and dbSNP databases (VCFs) can be added to the mix. *)
val create :
  ?cosmic:KEDSL.file_workflow ->
  ?dbsnp:KEDSL.file_workflow ->
  ?known_indels:KEDSL.file_workflow ->
  ?gtf:KEDSL.file_workflow ->
  ?cdna:KEDSL.file_workflow ->
  ?whess:KEDSL.file_workflow ->
  Specification.t ->
  KEDSL.file_workflow ->
  t
(** Build a Reference_genome.t record. *)
(**
Usual Accessors
*)
val name : t -> name
val ensembl : t -> int
val species : t -> string
val path : t -> string

(* NOTE(review): going by the [_exn] suffix these presumably raise when the
   corresponding optional resource is missing; confirm in the implementation. *)
val cosmic_path_exn : t -> string
val dbsnp_path_exn : t -> string
val known_indels_path_exn : t -> string
val gtf_path_exn : t -> string
val cdna_path_exn : t -> string
val whess_path_exn : t -> string
val snpeff_name_exn: t -> string

val major_contigs : t -> Region.t list
(**
Targets
*)
val fasta: t -> KEDSL.file_workflow
val cosmic_exn: t -> KEDSL.file_workflow
val dbsnp_exn: t -> KEDSL.file_workflow
val known_indels_exn: t -> KEDSL.file_workflow
val gtf_exn: t -> KEDSL.file_workflow
val gtf: t -> KEDSL.file_workflow option
val cdna_exn: t -> KEDSL.file_workflow
val whess_exn: t -> KEDSL.file_workflow
end  = struct
open Common

type name = string

module Specification = struct

  module Location = struct
    (* Where to get a file from, and how to unpack or assemble it. *)
    type t = [
      | `Url of string
      | `Vcf_concat of (string * t) list (* name × location *)
      | `Concat of t list
      | `Gunzip of t (* Should this be guessed from the URL extension? *)
      | `Bunzip2 of t
      | `Untar of t
    ]
    (* One constructor function per variant. *)
    let url u = `Url u
    let vcf_concat l = `Vcf_concat l
    let concat l = `Concat l
    let gunzip l = `Gunzip l
    let bunzip2 l = `Bunzip2 l
    let untar l = `Untar l
  end

  (* Static description of a reference genome: its name, Ensembl release,
     species, and the locations of the FASTA and optional companion files. *)
  type t = {
    name: string;
    ensembl: int;
    species: string;
    metadata: string option;
    fasta: Location.t;
    dbsnp: Location.t option;
    known_indels: Location.t option;
    cosmic: Location.t option;
    exome_gtf: Location.t option; (* maybe deserves a better name? *)
    cdna: Location.t option;
    whess: Location.t option;
    major_contigs: string list option;
    snpeff_name: string option;
  }

  (* Build a specification; every optional argument left out is [None]. *)
  let create
      ?metadata
      ~fasta
      ~ensembl
      ~species
      ?dbsnp
      ?known_indels
      ?cosmic
      ?exome_gtf
      ?cdna
      ?whess
      ?major_contigs
      ?snpeff_name
      name = {
    name;
    ensembl;
    species;
    metadata;
    fasta;
    dbsnp;
    known_indels;
    cosmic;
    exome_gtf;
    cdna;
    whess;
    major_contigs;
    snpeff_name;
  }

module Default = struct

  (* "1" .. "22" followed by "X", "Y" and "MT". *)
  let major_contigs_b37 =
    List.init 22 (fun i -> sprintf "%d" (i + 1))
    @ ["X"; "Y"; "MT"]

  (* "chr1" .. "chr22" followed by "chrX", "chrY" and "chrM". *)
  let major_contigs_hg_family =
    List.init 22 (fun i -> sprintf "chr%d" (i + 1))
    @ [
      "chrX";
      "chrY";
      "chrM";
    ]

  (* "1" .. "19" followed by "X" and "Y". *)
  let major_contigs_mm10 =
    List.init 19 (fun i -> sprintf "%d" (i + 1))
    @ ["X"; "Y"]

  module Name = struct
    let b37 = "b37"
    let b37decoy = "b37decoy"
    let b38 = "b38"
    let hg38 = "hg38"
    let hg18 = "hg18"
    let hg19 = "hg19"
    let mm10 = "mm10"
  end

  (* Used by both B37 and B37decoy *)
  let b37_dbsnp_url =
    "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/dbsnp_138.b37.vcf.gz"
  let b37_cosmic_url =
    "http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutect/b37_cosmic_v54_120711.vcf"
  let b37_exome_gtf_url =
    "http://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz"
  let b37_cdna_url =
    "http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
  let b37_whess_url =
    "ftp://genetics.bwh.harvard.edu/pph2/whess/polyphen-2.2.2-whess-2011_12.sqlite.bz2"
  let b37_known_indels_url =
    "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/Mills_and_1000G_gold_standard.indels.b37.vcf.gz"

  let human = "homo sapiens"
  let mouse = "mus musculus"

  let b37 =
    create Name.b37
      ~species:human
      ~ensembl:75
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_b37
      ~fasta:Location.(
          url "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/b37/human_g1k_v37.fasta.gz"
          |> gunzip)
      ~dbsnp:Location.(url b37_dbsnp_url |> gunzip)
      (* Alternate?
         "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/v4.0/00-All.vcf.gz"
      *)
      ~known_indels:Location.(url b37_known_indels_url |> gunzip)
      ~cosmic:Location.(url b37_cosmic_url)
      ~exome_gtf:Location.(url b37_exome_gtf_url |> gunzip)
      ~cdna:Location.(url b37_cdna_url |> gunzip)
      ~whess:Location.(url b37_whess_url |> bunzip2)
      ~snpeff_name:"GRCh37.75"

  let b37decoy =
    create Name.b37decoy
      ~species:human
      ~ensembl:75
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_b37
      ~fasta:Location.(
          url
            "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
          |> gunzip)
      ~dbsnp:Location.(url b37_dbsnp_url |> gunzip)
      ~known_indels:Location.(url b37_known_indels_url |> gunzip)
      ~exome_gtf:Location.(url b37_exome_gtf_url |> gunzip)
      ~cosmic:Location.(url b37_cosmic_url)
      ~cdna:Location.(url b37_cdna_url |> gunzip)
      ~whess:Location.(url b37_whess_url |> bunzip2)
      ~snpeff_name:"GRCh37.75"

  let hg38 =
    (* Release 87 *)
    let hg38_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.fasta.gz" in
    let dbsnp_hg38 =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.dbsnp.vcf.gz" in
    let known_indels_hg38 =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.known_indels.vcf.gz" in
    create Name.hg38
      ~species:human
      ~ensembl:87
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_hg_family
      ~fasta:Location.(url hg38_url|> gunzip)
      ~dbsnp:Location.(url dbsnp_hg38 |> gunzip)
      ~known_indels:Location.(url known_indels_hg38 |> gunzip)
      ~snpeff_name:"GRCh38.86"

  let b38 =
    (* Release 87 *)
    let b38_url =
      "http://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" in
    let gtf_b38_url =
      "http://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.gtf.gz" in
    let cdna_b38_url =
      "http://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" in
    let dbsnp_url =
      "http://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh38p7/VCF/common_all_20170710.vcf.gz" in
    create Name.b38
      ~species:human
      ~ensembl:87
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_b37
      ~fasta:Location.(url b38_url |> gunzip)
      ~exome_gtf:Location.(url gtf_b38_url |> gunzip)
      ~dbsnp:Location.(url dbsnp_url |> gunzip)
      ~cdna:Location.(url cdna_b38_url |> gunzip)
      ~snpeff_name:"GRCh38.86"

  let hg18 =
    let hg18_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/Homo_sapiens_assembly18.fasta.gz" in
    let dbsnp_hg18_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/dbsnp_138.hg18.vcf.gz" in
    create Name.hg18
      ~ensembl:54
      ~species:human
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_hg_family
      ~fasta:Location.(url hg18_url|> gunzip)
      ~dbsnp:Location.(url dbsnp_hg18_url |> gunzip)

  let hg19 =
    let hg19_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/ucsc.hg19.fasta.gz" in
    let dbsnp_hg19_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/dbsnp_138.hg19.vcf.gz" in
    let known_indels_hg19_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz" in
    create Name.hg19
      ~ensembl:75
      ~species:human
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_hg_family
      ~fasta:Location.(url hg19_url|> gunzip)
      ~dbsnp:Location.(url dbsnp_hg19_url |> gunzip)
      ~known_indels:Location.(url known_indels_hg19_url |> gunzip)
      ~whess:Location.(url b37_whess_url |> bunzip2)
      ~snpeff_name:"hg19"

  let mm10 =
    (* NOTE(review): this URL does not end in ".gz" although the location
       below is wrapped in [gunzip]; confirm which one is intended. *)
    let mm10_url =
      "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/mm10.GRCm38.dna_sm.fa" in
    let dbsnp_mm10_snps_url =
      "ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.snps.rsIDdbSNPv137.vcf.gz" in
    let dbsnp_mm10_indels_url =
      "ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.indels.rsIDdbSNPv137.vcf.gz" in
    let gene_annotations_gtf =
      "ftp://ftp.ensembl.org/pub/release-84/gtf/mus_musculus/Mus_musculus.GRCm38.84.gtf.gz" in
    let cdna_mm10_url =
      "ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" in
    create Name.mm10
      ~ensembl:87
      ~species:mouse
      ~metadata:"Provided by the Biokepi Library"
      ~major_contigs:major_contigs_mm10
      ~fasta:Location.(url mm10_url |> gunzip)
      ~dbsnp:Location.(
          vcf_concat ["db_snps.vcf", url dbsnp_mm10_snps_url |> gunzip;
                      "db_indels.vcf", url dbsnp_mm10_indels_url |> gunzip]
        )
      ~exome_gtf:Location.(url gene_annotations_gtf |> gunzip)
      ~cdna:Location.(url cdna_mm10_url |> gunzip)
      ~snpeff_name:"mm10"

end
end
(** A reference genome has a name (for display/matching) and a cluster-dependent path. Corresponding Cosmic and dbSNP databases (VCFs) can be added to the mix. *)
type t = {
  specification: Specification.t;
  location: KEDSL.file_workflow;
  cosmic: KEDSL.file_workflow option;
  dbsnp: KEDSL.file_workflow option;
  known_indels: KEDSL.file_workflow option;
  gtf: KEDSL.file_workflow option;
  cdna: KEDSL.file_workflow option;
  whess: KEDSL.file_workflow option;
}

(** [create specification location] packs a specification with the workflow
    node of its main file; the optional companion nodes default to [None]. *)
let create ?cosmic ?dbsnp ?known_indels ?gtf ?cdna ?whess
    specification location =
  {specification; location; cosmic; dbsnp; known_indels; gtf; cdna; whess}

(** {3 Fields of the underlying specification} *)

let name t = t.specification.Specification.name
let ensembl t = t.specification.Specification.ensembl
let species t = t.specification.Specification.species

(** The snpEff name of the specification; goes through [Option.value_exn]
    with an explanatory message when the specification has none. *)
let snpeff_name_exn t =
  Option.value_exn
    ~msg:(sprintf "%s: no snpEff name" (name t))
    t.specification.Specification.snpeff_name

(** {3 Paths of the products}

    Every [*_path_exn] function goes through [Option.value_exn] (with a
    message naming the function and the genome) when the corresponding node
    is [None]. *)

let path t = t.location#product#path

let cosmic_path_exn t =
  let msg = sprintf "cosmic_path_exn of %s" (name t) in
  let node = Option.value_exn ~msg t.cosmic in
  node#product#path

let dbsnp_path_exn t =
  let msg = sprintf "dbsnp_path_exn of %s" (name t) in
  let node = Option.value_exn ~msg t.dbsnp in
  node#product#path

let known_indels_path_exn t =
  let msg = sprintf "known_indels_path_exn of %s" (name t) in
  let node = Option.value_exn ~msg t.known_indels in
  node#product#path

let gtf_path_exn t =
  let msg = sprintf "gtf_path_exn of %s" (name t) in
  let node = Option.value_exn ~msg t.gtf in
  node#product#path

let cdna_path_exn t =
  let msg = sprintf "cdna_path_exn of %s" (name t) in
  let node = Option.value_exn ~msg t.cdna in
  node#product#path

let whess_path_exn t =
  let msg = sprintf "whess_path_exn of %s" (name t) in
  let node = Option.value_exn ~msg t.whess in
  node#product#path

(** {3 Workflow nodes} *)

let fasta: t -> KEDSL.file_workflow = fun t -> t.location

let cosmic_exn t =
  Option.value_exn ~msg:(sprintf "%s: no COSMIC" (name t)) t.cosmic

let dbsnp_exn t =
  Option.value_exn ~msg:(sprintf "%s: no DBSNP" (name t)) t.dbsnp

let known_indels_exn t =
  Option.value_exn ~msg:(sprintf "%s: no Known Indels" (name t)) t.known_indels

let gtf_exn t =
  Option.value_exn ~msg:(sprintf "%s: no GTF" (name t)) t.gtf

let gtf t = t.gtf

let cdna_exn t =
  Option.value_exn ~msg:(sprintf "%s: no cDNA fasta file" (name t)) t.cdna

let whess_exn t =
  Option.value_exn ~msg:(sprintf "%s: no WHESS file" (name t)) t.whess

(** The major contigs of the specification, each wrapped as a [`Chromosome]
    region.
    @raise Failure when the specification defines no major contigs. *)
let major_contigs t : Region.t list =
  match t.specification.Specification.major_contigs with
  | None ->
    (* The message used to read "does have", the opposite of the condition
       being reported. *)
    failwithf
      "Reference %S does not have major-contigs/chromosomes defined" (name t)
  | Some l -> List.map l ~f:(fun s -> `Chromosome s)
end
module Region  = struct
(**************************************************************************)
(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *) (*                                                                        *) (*  Licensed under the Apache License, Version 2.0 (the "License");       *) (*  you may not use this file except in compliance with the License.      *) (*  You may obtain a copy of the License at                               *) (*                                                                        *) (*      http://www.apache.org/licenses/LICENSE-2.0                        *) (*                                                                        *) (*  Unless required by applicable law or agreed to in writing, software   *) (*  distributed under the License is distributed on an "AS IS" BASIS,     *) (*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *) (*  implied.  See the License for the specific language governing         *) (*  permissions and limitations under the License.                        *)
(**************************************************************************)
open Common
(** Positions are 1-based *)
(* [`Chromosome name] designates a whole chromosome/contig and
   [`Chromosome_interval (name, b, e)] the positions [b] to [e] on it; the
   functions of this module render them as [name] and [name:b-e]
   respectively. [`Full] carries no restriction: it is rendered as no
   samtools specification at all. *)
type t = [   | `Chromosome of string   | `Chromosome_interval of string * int * int   | `Full ]
(** Make a filename-compliant string out of a region specification. *)
let to_filename = function
| `Full -> "Full"
| `Chromosome s -> s
| `Chromosome_interval (s, b, e) -> sprintf "%s_%d-%d" s b e

(** The region in [chromosome:begin-end] notation, or [None] for [`Full]. *)
let to_samtools_specification = function
| `Full -> None
| `Chromosome s -> Some s
| `Chromosome_interval (s, b, e) -> Some (sprintf "%s:%d-%d" s b e)

(** A [-r <region>] command line fragment; the empty string for [`Full]. *)
let to_samtools_option r =
  match to_samtools_specification r with
  | Some s -> sprintf "-r %s" s
  | None -> ""

(** An [--intervals <region>] command line fragment; the empty string for
    [`Full]. *)
let to_gatk_option r =
  match to_samtools_specification r with
  | Some s -> sprintf "--intervals %s" s
  | None -> ""

(** Parse [chromosome] or [chromosome:begin-end] into a region.
    @raise Failure when there is more than one colon, when the part after
    the colon does not contain exactly one dash, or when one of the two
    bounds is not an integer. *)
let parse_samtools s =
  match String.split ~on:(`Character ':') s with
  | [] -> assert false
  | [one] -> `Chromosome one
  | [one; two] ->
    begin match String.split ~on:(`Character '-') two with
    | [left; right] ->
      begin match Int.of_string left, Int.of_string right with
      | Some b, Some e -> `Chromosome_interval (one, b, e)
      | _ -> failwithf "Cannot parse %S into 2 loci" two
      end
    | _ -> failwithf "Not one '-' in %S" two
    end
  | _ -> failwithf "Not one or zero ':' in %S" s

(** Cmdliner term for an optional region argument: [`Full] when the option
    is absent, otherwise the result of {!parse_samtools}. *)
let cmdliner_term () =
  let open Cmdliner in
  Term.(
    pure (function
      | None -> `Full
      | Some s -> parse_samtools s)
    $ Arg.(
        value & opt (some string) None
        (* Two option names, registered as separate list items. *)
        & info ["R"; "region"] ~docv:"REGION"
          ~doc:"Specify a region; using samtools' format"
      )
  )
end
module Tool_parameters  = struct
open Common

(** A named list of (option, value) pairs for a tool. *)
type t = {
  name: string;
  parameters: (string * string) list;
}

(** JSON object with a [name] string field and a [parameters] object
    mapping each option to its value as a string. *)
let to_json t: Yojson.Basic.json =
  let {name; parameters} = t in
  `Assoc [
    "name", `String name;
    "parameters",
    `Assoc (List.map parameters ~f:(fun (a, b) -> a, `String b));
  ]

(** Flatten the parameters into [option1; value1; option2; value2; ...]. *)
let render {parameters; _} =
  List.concat_map parameters ~f:(fun (a,b) -> [a; b])
end
module Workflow_utilities  = struct
(** Small/useful workflow-nodes. *)
open Common

(** Workflow nodes that delete files or directories. *)
module Remove = struct

  (** Node running [rm -f path] through [run_with]. Its condition is that
      [ls path] exits with code 2 (presumably the status [ls] gives for a
      missing path -- confirm for the target platform). *)
  let file ~run_with path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rm-%s" (Filename.basename path))
      ~ensures:(`Is_verified (`Command_returns (
          Command.shell ~host:Machine.(as_host run_with)
            (sprintf "ls %s" path),
          2)))
      ~make:(Machine.quick_run_program
               run_with Program.(exec ["rm"; "-f"; path]))
      ~tags:[Target_tags.clean_up]

  (** Same as {!file} but runs [rm -rf path]. *)
  let directory ~run_with path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rmdir-%s" (Filename.basename path))
      ~ensures:(`Is_verified (`Command_returns (
          Command.shell ~host:Machine.(as_host run_with)
            (sprintf "ls %s" path),
          2
        )))
      ~make:(Machine.quick_run_program
               run_with Program.(exec ["rm"; "-rf"; path]))
      ~tags:[Target_tags.clean_up]

  (* This one is dirtier, it does not check its result and uses the `Host.t`
     directly, it should be used only when the `Machine.t` is not available
     (i.e. while defining a `Machine.t`). *)
  let path_on_host ~host path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rm-%s" (Filename.basename path))
      ~make:(daemonize ~using:`Python_daemon ~host
               Program.(exec ["rm"; "-rf"; path]))
end
module Gunzip = struct
  (** Example: call "gunzip -c <list of fastq.gz files> > some_name_cat.fastq". *)
  let concat ~(run_with : Machine.t) bunch_of_dot_gzs ~result_path =     let open KEDSL in     let program =       Program.(         exec ["mkdir""-p"Filename.dirname result_path]         && shf "gunzip -c  %s > %s"           (List.map bunch_of_dot_gzs              ~f:(fun o -> Filename.quote o#product#path)            |> String.concat ~sep:" ") result_path       ) in     let name =       sprintf "gunzipcat-%s" (Filename.basename result_path) in     workflow_node       (single_file result_path ~host:Machine.(as_host run_with))       ~name       ~make:(Machine.run_stream_processor ~name run_with  program)       ~edges:(         on_failure_activate Remove.(file ~run_with result_path)         :: List.map ~f:depends_on bunch_of_dot_gzs) end module Cat = struct   let concat ~(run_with : Machine.t) bunch_of_files ~result_path =     let open KEDSL in     let program =       Program.(         exec ["mkdir""-p"Filename.dirname result_path]         && shf "cat %s > %s"           (List.map bunch_of_files              ~f:(fun o -> Filename.quote o#product#path)            |> String.concat ~sep:" ") result_path       ) in     let name =       sprintf "concat-all-%s" (Filename.basename result_path) in     workflow_node       (single_file result_path ~host:Machine.(as_host run_with))       ~name       ~edges:(         on_failure_activate Remove.(file ~run_with result_path)         :: List.map ~f:depends_on bunch_of_files)       ~make:(Machine.run_stream_processor run_with ~name  program)   let cat_folder ~host       ~(run_program : Machine.Make_fun.t)       ?(depends_on=[]) ~files_gzipped ~folder ~destination =      let deps = depends_on in     let open KEDSL in     let name = "cat-folder-" ^ Filename.quote folder in     let edges =       on_failure_activate (Remove.path_on_host ~host destination)       :: List.map ~f:depends_on deps in     if files_gzipped then (       workflow_node (single_file destination ~host)         ~edges ~name         ~make:(           run_program ~name  
           Program.(               shf "gunzip -c %s/* > %s" (Filename.quote folder)                 (Filename.quote destination)))     ) else (       workflow_node         (single_file destination ~host)         ~edges ~name         ~make:(           run_program ~name             Program.(               shf "cat %s/* > %s" (Filename.quote folder) (Filename.quote destination)))     ) end module Download = struct   let wget_program ?output_filename url =     KEDSL.Program.exec [       "wget";       "-O"Option.value output_filename ~default:Filename.(basename url);       url     ]   let wget_to_folder       ~host ~(run_program : Machine.Make_fun.t)       ~test_file ~destination url  =     let open KEDSL in     let name = "wget-" ^ Filename.basename destination in     let test_target = destination // test_file in     workflow_node (single_file test_target ~host) ~name       ~make:(         run_program ~name           ~requirements:(Machine.Make_fun.downloading [])           Program.(             exec ["mkdir""-p"; destination]             && shf "wget %s -P %s"               (Filename.quote url)               (Filename.quote destination)))       ~edges:[         on_failure_activate (Remove.path_on_host ~host destination);       ]   let wget       ~host ~(run_program : Machine.Make_fun.t)       url destination =     let open KEDSL in     let name = "wget-" ^ Filename.basename destination in     workflow_node       (single_file destination ~host) ~name       ~make:(         run_program ~name           ~requirements:(Machine.Make_fun.downloading [])           Program.(             exec ["mkdir""-p"Filename.dirname destination]             && shf "wget %s -O %s"               (Filename.quote url) (Filename.quote destination)))       ~edges:[         on_failure_activate (Remove.path_on_host ~host destination);       ]   let wget_gunzip       ~host ~(run_program : Machine.Make_fun.t)       ~destination url =     let open KEDSL in     let is_gz = Filename.check_suffix url 
".gz" in     if is_gz then (       let name = "gunzip-" ^ Filename.basename (destination ^ ".gz"in       let wgot = wget ~host ~run_program url (destination ^ ".gz"in       workflow_node         (single_file destination ~host)         ~edges:[           depends_on (wgot);           on_failure_activate (Remove.path_on_host ~host destination);         ]         ~name         ~make:(           run_program ~name             ~requirements:(Machine.Make_fun.stream_processor [])             Program.(shf "gunzip -c %s > %s"                        (Filename.quote wgot#product#path)                        (Filename.quote destination)))     ) else (       wget ~host ~run_program url destination     )   let wget_bunzip2       ~host ~(run_program : Machine.Make_fun.t)       ~destination url =     let open KEDSL in     let is_bz2 = Filename.check_suffix url ".bz2" in     if is_bz2 then (       let name = "bunzip2-" ^ Filename.basename (destination ^ ".bz2"in       let wgot = wget ~host ~run_program url (destination ^ ".bz2"in       workflow_node         (single_file destination ~host)         ~edges:[           depends_on (wgot);           on_failure_activate (Remove.path_on_host ~host destination);         ]         ~name         ~make:(           run_program ~name             ~requirements:(Machine.Make_fun.stream_processor [])             Program.(shf "bunzip2 -c %s > %s"                        (Filename.quote wgot#product#path)                        (Filename.quote destination)))     ) else (       wget ~host ~run_program url destination     )   let wget_untar       ~host ~(run_program : Machine.Make_fun.t)       ~destination_folder ~tar_contains url =     let open KEDSL in     let zip_flags =       let is_gz = Filename.check_suffix url ".gz" in       let is_bzip = Filename.check_suffix url ".bz2" in       if is_gz then "z" else if is_bzip then "j" else ""     in     let tar_filename = (destination_folder // "archive.tar"in     let name = "untar-" ^ tar_filename in     let 
wgot = wget ~host ~run_program url tar_filename in     let file_in_tar = (destination_folder // tar_contains) in     workflow_node       (single_file file_in_tar ~host)       ~edges:[         depends_on (wgot);         on_failure_activate (Remove.path_on_host ~host destination_folder);       ]       ~name       ~make:(         run_program ~name           ~requirements:(Machine.Make_fun.stream_processor [])           Program.(             exec ["mkdir""-p"; destination_folder]             && shf "tar -x%s -f %s -C %s"               zip_flags               (Filename.quote wgot#product#path)               (Filename.quote destination_folder)))          type tool_file_location = [     | `Scp of string     | `Wget of string     | `Fail of string   ]   let get_tool_file       ~identifier       ~(run_program : Machine.Make_fun.t)       ~host ~install_path       loc =     let open KEDSL in     let rm_path = Remove.path_on_host in     let jar_name =       match loc with       | `Fail s -> sprintf "cannot-get-%s.file" identifier       | `Scp s -> Filename.basename s       | `Wget s -> Filename.basename s in     let local_box_path = install_path // jar_name in     workflow_node (single_file local_box_path ~host)       ~name:(sprintf "get-%s" jar_name)       ~edges:[         on_failure_activate (rm_path ~host local_box_path)       ]       ~make:(         run_program           ~requirements:[             `Internet_access;             `Self_identification [identifier ^ "-instalation"; jar_name];           ]           Program.(             shf "mkdir -p %s" install_path             && begin match loc with             | `Fail msg ->               shf "echo 'Cannot download file for %s: %s'" identifier msg               && sh "exit 4"             | `Scp s ->               shf "scp %s %s"                 (Filename.quote s) (Filename.quote local_box_path)             | `Wget s ->               shf "wget %s -O %s"                 (Filename.quote s) (Filename.quote local_box_path)       
      end))   let gsutil_cp       ~(run_program : Machine.Make_fun.t)       ~host ~url ~local_path =     let open KEDSL in     workflow_node (single_file ~host local_path)       ~name:(sprintf "GSUtil-CP: %s" (Filename.basename local_path))       ~edges:[         on_failure_activate (Remove.path_on_host ~host local_path)       ]       ~make:(         run_program           ~requirements:[             `Internet_access;             `Self_identification ["gsutil-cp"; url];           ]           Program.(             shf "mkdir -p %s" (Filename.dirname local_path)             && exec ["gsutil""cp"; url; local_path]           )       ) end module Vcftools = struct   
  (** Call a command on a list of ~vcfs to produce a given ~final_vcf (hence the n-to-1 naming). *)
  let vcf_process_n_to_1_no_machine       ~host       ~vcftools       ~(run_program : Machine.Make_fun.t)       ?(more_edges = [])       ~vcfs       ~make_product       ~final_vcf       command_prefix     =     let open KEDSL in     let name = sprintf "%s-%s" command_prefix (Filename.basename final_vcf) in     let make =       run_program ~name         Program.(           Machine.Tool.(init vcftools)           && shf "%s %s > %s"             command_prefix             (String.concat ~sep:" "                (List.map vcfs ~f:(fun t -> Filename.quote t#product#path)))             final_vcf         ) in     workflow_node ~name       (make_product final_vcf)       ~make       ~edges:(         on_failure_activate           (Remove.path_on_host ~host final_vcf)         :: depends_on Machine.Tool.(ensure vcftools)         :: List.map ~f:depends_on vcfs         @ more_edges)   
  (** Concatenate VCF files.

We use this version where we don't yet have a Machine.t, as in "download_reference_genome.ml". *)

  let vcf_concat_no_machine       ~host       ~vcftools       ~(run_program : Machine.Make_fun.t)       ?more_edges       ~make_product       vcfs       ~final_vcf =     vcf_process_n_to_1_no_machine       ~make_product       ~host ~vcftools ~run_program ?more_edges ~vcfs ~final_vcf       "vcf-concat"   
  (** Sort a VCF file by chromosome position (it uses "vcf-sort" which itself relies on the "sort" unix tool having the "--version-sort" option).

We use this version where we don't yet have a Machine.t, as in "download_reference_genome.ml". *)

  let vcf_sort_no_machine       ~host       ~vcftools       ~(run_program : Machine.Make_fun.t)       ?more_edges       ~make_product       ~src ~dest () =     let run_program =       Machine.Make_fun.with_requirements run_program [`Memory `Bigin      vcf_process_n_to_1_no_machine       ~make_product       ~host ~vcftools ~run_program ?more_edges ~vcfs:[src] ~final_vcf:dest       "vcf-sort -c" end module Variable_tool_paths = struct   let single_file ~run_with ~tool path =     let open KEDSL in     let condition =       let init = Machine.Tool.init tool in       let host = Machine.as_host ~with_shell:"bash" run_with in       let condition_cmd =         Ketrew_pure.Program.to_single_shell_command           Program.(init && shf "test -e %s" path)       in KEDSL.Command.shell ~host condition_cmd     in     object       method is_done = Some (`Command_returns (condition, 0))     end end end