Biokepi API : Biokepi_run

(* code generated with [./tools/build-doc.sh ketrew,ppx_deriving.std] *)
module Common 
= struct

(**************************************************************************)


(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *)
(*                                                                        *)
(*  Licensed under the Apache License, Version 2.0 (the "License");       *)
(*  you may not use this file except in compliance with the License.      *)
(*  You may obtain a copy of the License at                               *)
(*                                                                        *)
(*      http://www.apache.org/licenses/LICENSE-2.0                        *)
(*                                                                        *)
(*  Unless required by applicable law or agreed to in writing, software   *)
(*  distributed under the License is distributed on an "AS IS" BASIS,     *)
(*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *)
(*  implied.  See the License for the specific language governing         *)
(*  permissions and limitations under the License.                        *)

(**************************************************************************)

(** Module opened by default (like Pervasives) for our library. *)

(** A Non-standard mini library. *)


include Nonstd

(** A String module with more capabilities *)


(* Rebind [String] to Sosa's native-string implementation for the rest of
   this module and for everything that opens it. *)
module String = struct include Sosa.Native_string end

(* [dir // name] is [Filename.concat dir name]. *)
let (//) dir name = Filename.concat dir name

(** path // filename will concat filename to the end of path. *)



(* Global debug switch, initialised once when this module is loaded: it is
   [true] only when the BIOKEPI_DEBUG environment variable is set to exactly
   "true". Being a [ref], it can also be flipped at runtime.
   Only [Not_found] (variable unset) is handled; nothing else is swallowed. *)
let debug_mode =
  ref (try Sys.getenv "BIOKEPI_DEBUG" = "true" with Not_found -> false)
(* [dbg fmt args...] formats a message printf-style and, only when
   [!debug_mode] is set, prints it on stderr (prefixed and flushed).
   The message is formatted in either case. *)
let dbg fmt =
  let report message =
    if !debug_mode then eprintf "biokepi-debug: %s\n%!" message in
  ksprintf report fmt

(** A consistent debugging mechanism. *)



(* [failwithf fmt args...] formats a message printf-style and raises
   [Failure] with it. *)
let failwithf fmt = ksprintf (fun message -> failwith message) fmt

(** A formatted failwith. *)



(* Re-export of Ketrew's [Unique_id] module under this module's namespace. *)
module Unique_id = struct
  include Ketrew_pure.Internal_pervasives.Unique_id
end

(**

Generate unique filenames for a given set of uniquely identifying properties.

The module generates names that remain (way) under 255 bytes because that is the most common file-system limit in 2016.

*)

module Name_file = struct
  (* Additional safety: we check that there are no hash-duplicates
     within a given execution of a program linking with Biokepi. *)
  (* [db] maps each generated file name to the components it was built
     from. *)
  let db : (string, string list) Hashtbl.t = Hashtbl.create 42

  (* [path ~readable_suffix ~from high_level_components] returns a file path
     whose base name is an MD5 hex digest, followed by as many components as
     fit, followed by [readable_suffix].

     - [from] is either [`Path p] (the result lives in the directory of [p]
       and the extension-less base name of [p] becomes the first component)
       or [`In_dir d] (the result lives in [d]).
     - [high_level_components] are sanitized: every character outside
       [a-zA-Z0-9-] becomes an underscore.

     Raises [Failure] when the name exceeds 220 bytes, or when the same name
     was already produced in this process from a different set of
     components. *)
  let path ~readable_suffix ~from high_level_components =
    let sanitize =
      String.map
        ~f:(function
          | ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-') as c -> c
          | _ -> '_') in
    let components =
      begin match from with
      | `Path p ->
        let b = Filename.basename p in
        (* [chop_extension] raises [Invalid_argument] when [b] has no
           extension; in that case the base name is used unchanged. *)
        (try [Filename.chop_extension b] with Invalid_argument _ -> [b])
      | `In_dir _ -> []
      end
      @
      List.map high_level_components ~f:sanitize
    in
    let hash =
      String.concat ~sep:"-" (readable_suffix :: components)
      |> Digest.string |> Digest.to_hex
    in
    let max_length = 220 in
    let buf = Buffer.create max_length in
    Buffer.add_string buf hash;
    (* Append components in order and stop at the first one that would not
       leave room for the suffix. *)
    let rec append_components =
      function
      | [] -> ()
      | one :: more ->
        if
          Buffer.length buf + String.length readable_suffix
          + String.length one < max_length
        then (Buffer.add_string buf one; append_components more)
        else ()
    in
    append_components components;
    Buffer.add_string buf readable_suffix;
    let name = Buffer.contents buf in
    begin if String.length name > max_length then
        ksprintf failwith "Name_file: filename too long %s (max: %d)"
          name max_length
    end;
    (* Only a missing entry ([Not_found]) means "first time we see this
       name"; any other exception is left to propagate. *)
    begin match Hashtbl.find db name with
    | previous
      when List.sort ~cmp:String.compare previous
           = List.sort ~cmp:String.compare components -> ()
    | previous ->
      ksprintf failwith "Duplicate filename for different components\nFilename: %s\nPrevious: [%s]\nNew: [%s]\n"
        name (String.concat ~sep:", " previous) (String.concat ~sep:", " components)
    | exception Not_found ->
      Hashtbl.add db name components
    end;
    begin match from with
    | `In_dir s -> s // name
    | `Path p -> Filename.dirname p // name
    end

  (* Same as [path] with [~from:(`Path p)]. *)
  let from_path ~readable_suffix p c =
    path ~readable_suffix ~from:(`Path p) c

  (* Same as [path] with [~from:(`In_dir p)]. *)
  let in_directory ~readable_suffix p c =
    path ~readable_suffix ~from:(`In_dir p) c

end

(**

This is an experimental extension of Ketrew's EDSL. If we're happy with it we'll push it upstream.

The idea is to carry around a type parameter to have arbitrary products.

*)

module KEDSL = struct

  (* Start from the stock Ketrew EDSL and extend it below. *)
  include Ketrew.EDSL
  module Command = Ketrew_pure.Target.Command

  (* Product carrying no condition: its [is_done] is always [None]. *)
  type nothing = < is_done : Condition.t option >
  let nothing  = object method is_done = None end

  (* Both functions ignore their argument and return the tag
     [`Please_KEDSL_workflow].
     NOTE(review): presumably they shadow same-named functions coming from
     [Ketrew.EDSL] so that stale call sites stop type-checking; confirm. *)
  let target _ = `Please_KEDSL_workflow
  let file_target _ = `Please_KEDSL_workflow


  (* Shorthands for nodes producing a single file and nodes producing
     [nothing]. *)
  type file_workflow = single_file workflow_node
  type phony_workflow = nothing workflow_node

  type fastq_reads = <
    is_done: Ketrew_pure.Target.Condition.t option;
    paths : string * (string option);
    r1 : single_file;
    r2 : single_file option;
    sample_name: string;
    escaped_sample_name: string;
    fragment_id: string option;
    fragment_id_forced: string;
  >
  let fastq_reads ?host ?name ?fragment_id r1 r2_opt : fastq_reads =
    object (self)
      val r1_file = single_file ?host r1
      val r2_file_opt = Option.map r2_opt ~f:(single_file ?host)
      method r1 = r1_file
        (* workflow_node r1_file
          ~name:(sprintf "File: %s" (Filename.basename r1_file#path)) *)
      method r2 =  r2_file_opt
        (* Option.map r2_file_opt ~f:(fun r2_file ->
            workflow_node r2_file
              ~name:(sprintf "File: %s" (Filename.basename r2_file#path))
          ) *)
      method paths = (r1, r2_opt)
      method is_done =
        Some (match r2_file_opt with
          | Some r2 -> `And [r1_file#exists; r2#exists]
          | None -> `And [r1_file#exists; r1_file#exists;])
      method sample_name =
        Option.value name ~default:(Filename.basename r1)
      method fragment_id = fragment_id
      method fragment_id_forced =
        Option.value fragment_id ~default:(Filename.basename r1)
      method escaped_sample_name =
        String.map self#sample_name ~f:(function
          | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '-' | '_' as c -> c
          | other -> '_')
    end

  let transform_fastq_reads
      ?name ?fragment_id
      (fq_reads: fastq_reads) r1 r2_opt
    : fastq_reads
    = 
    fastq_reads 
      ~host:fq_reads#r1#host
      ~name:(match name with Some n -> n | None -> fq_reads#sample_name)
      ?fragment_id:(
        match fragment_id with
        | Some fi -> fi
        | None -> fq_reads#fragment_id)
      r1 r2_opt

  let read_1_file_node (fq : fastq_reads workflow_node) =
    let product = fq#product#r1 in
    workflow_node product
      ~name:(sprintf "READ1 of %s-%s"
               fq#product#sample_name
               fq#product#fragment_id_forced)
      ~equivalence:`None
      ~edges:[depends_on fq]

  let read_2_file_node (fq : fastq_reads workflow_node) =
    Option.map fq#product#r2 ~f:(fun product ->
        workflow_node product
          ~name:(sprintf "READ2 of %s-%s"
                   fq#product#sample_name
                   fq#product#fragment_id_forced)
          ~equivalence:`None
          ~edges:[depends_on fq]
      )

(** Create a fastq_reads workflow_node from one or two single_file workflow_node(s). *)


  let fastq_node_of_single_file_nodes
      ~host ~name ?fragment_id fastq_r1 fastq_r2 =
    let product =
      let r2 = Option.map fastq_r2 ~f:(fun r -> r#product#path) in
      fastq_reads ~host ~name ?fragment_id fastq_r1#product#path r2
    in
    let edges =
      match fastq_r2 with
      | Some r2 -> [depends_on fastq_r1; depends_on r2]
      | None -> [depends_on fastq_r1]
    in
    workflow_node product
      ~equivalence:`None
      ~name:(sprintf "Assembled-fastq: %s (%s)"
               name (Option.value fragment_id
                       ~default:(Filename.basename fastq_r1#product#path)))
      ~edges

  let transform_single_file ~path f =
      single_file ~host:f#host path

  type bam_file = <
    is_done: Ketrew_pure.Target.Condition.t option;
    host: Host.t;
    path : string;
    sample_name: string;
    escaped_sample_name: string;
    sorting: [ `Coordinate | `Read_name ] option;
    reference_build: string;
  >
  let bam_file ~host ?name ?sorting ~reference_build path : bam_file =
    object (self)
      val file = single_file ~host path
      method host = host
      method sample_name =
        Option.value name ~default:(Filename.chop_extension (Filename.basename path))
      method escaped_sample_name =
        String.map self#sample_name ~f:(function
          | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '-' | '_' as c -> c
          | other -> '_')
      method path = file#path
      method is_done = file#is_done
      method sorting = sorting
      method reference_build = reference_build
    end

(** Make a new bam sharing most of the metadata. *)


  let transform_bam ?change_sorting (bam : bam_file) ~path : bam_file =
    bam_file
      ~host:bam#host
      ?sorting:(
        match change_sorting with
        | Some new_sorting -> Some new_sorting
        | None -> bam#sorting
      )
      ~reference_build:bam#reference_build
      path


  type bam_list = <
    is_done:  Ketrew_pure.Target.Condition.t option;
    bams: bam_file list;
  >
  let bam_list (bams : bam_file list) : bam_list =
    object
      method bams = bams
      method is_done =
        Some (
          `And (List.map bams
                  ~f:(fun b ->
                      b#is_done
                      |> Option.value_exn ~msg:"Bams should have a Condition.t"))
        )
    end

  let explode_bam_list_node (bln : bam_list workflow_node) =
    List.map bln#product#bams ~f:(fun bam ->
        workflow_node bam
          ~name:(Filename.basename bam#path)
          ~tags:["expolode_bam_list_node"]
          ~edges:[depends_on bln]
          ~equivalence:`None)

  (* this may be overkill: *)
  (* GADT holding either one BAM node or a list of BAM nodes. The type index
     differs per case: [bam_file workflow_node] for [Single_bam] and
     [bam_list workflow_node] for [Bam_workflow_list]. *)
  type _ bam_or_bams =
    | Single_bam: bam_file workflow_node -> bam_file workflow_node bam_or_bams
    | Bam_workflow_list: bam_file workflow_node list -> bam_list workflow_node bam_or_bams

  type vcf_file = <
    is_done: Ketrew_pure.Target.Condition.t option;
    host: Host.t;
    path : string;
    reference_build: string;
    as_single_file: single_file product;
  >
  let vcf_file ~host ~reference_build path : vcf_file =
    object (self)
      val file = single_file ~host path
      method host = host
      method path = file#path
      method is_done = file#is_done
      method reference_build = reference_build
      method as_single_file = file
    end
  let transform_vcf vcf ~path =
    vcf_file ~host:vcf#host ~reference_build:vcf#reference_build path

  let submit w = Ketrew.Client.submit_workflow w

end

(** An attempt at standardizing “tags.” *)


(* String constants to use as tags, so that the same spelling is shared by
   every user. *)
module Target_tags = struct
  let aligner = "aligner"
  let variant_caller = "variant-caller"
  let clean_up = "clean-up"
end
end
module Hla_utilities 
= struct
(* Hassle-free HLA/MHC handling *)
open Common

(* The predictors known to this module. [predictor_to_string] gives each one
   a lowercase name; [predictor_to_tool] maps the first four to a tool of the
   machine and every other one to [None]. *)
type predictor_type = [
  | `NetMHC
  | `NetMHCpan
  | `NetMHCIIpan
  | `NetMHCcons
  | `Random
  | `SMM
  | `SMM_PMBEC
  | `NetMHCpan_IEDB
  | `NetMHCcons_IEDB
  | `SMM_IEDB
  | `SMM_PMBEC_IEDB
]

(* [predictor_to_string predictor] is the lowercase, dash-separated name of
   [predictor]. *)
let predictor_to_string predictor =
  match predictor with
  | `NetMHC -> "netmhc"
  | `NetMHCpan -> "netmhcpan"
  | `NetMHCIIpan -> "netmhciipan"
  | `NetMHCcons -> "netmhccons"
  | `Random -> "random"
  | `SMM -> "smm"
  | `SMM_PMBEC -> "smm-pmbec"
  | `NetMHCpan_IEDB -> "netmhcpan-iedb"
  | `NetMHCcons_IEDB -> "netmhccons-iedb"
  | `SMM_IEDB -> "smm-iedb"
  | `SMM_PMBEC_IEDB -> "smm-pmbec-iedb"

(* [predictor_to_tool ~run_with predictor] is [Some (ensure, init)] for the
   four predictors backed by a tool of the machine [run_with] (the tool is
   looked up by name, without a version), and [None] for any other
   predictor. *)
let predictor_to_tool ~run_with predictor =
  let ensure_and_init tool_name =
    let definition = Machine.Tool.Definition.create tool_name in
    let tool = Machine.get_tool run_with definition in
    let ensure = Machine.Tool.ensure tool in
    let init = Machine.Tool.init tool in
    (ensure, init)
  in
  match predictor with
  | `NetMHC -> Some (ensure_and_init "netMHC")
  | `NetMHCpan -> Some (ensure_and_init "netMHCpan")
  | `NetMHCIIpan -> Some (ensure_and_init "netMHCIIpan")
  | `NetMHCcons -> Some (ensure_and_init "netMHCcons")
  | _ -> None


(* 
  Example input (all in):
    1,A*03:01,,0.026478,samplename
    1,B*37:01',,0.000000,samplename
    1, C*06:02,,0.000086,samplename
    2,DQA1*01:02 ,,0.000000,samplename
  Example output (type-I predictions out as a plain list):
    A*03:01
    B*37:01
    C*06:02
    C*07:02

  So the following "one-liner"
    - extracts the second column
    - trims the allele names from white-spaces around them
    - gets rid of 's at the end of the allele names
    - removes type-II predictions (since we don't support
      type-II predictions)
  to be able to feed the file into `mhctools` based utilities,
  including topiary, vaxrank, and netmhctools itself.
*)

(* [sanitize_hlarp_out_for_mhctools ~run_with ~hlarp_result ~output_path]
   is a workflow node, depending on [hlarp_result], that produces
   [output_path] on the host of [run_with].

   The shell pipeline drops the lines starting with 2, prints the trimmed
   second comma-separated column, skips the first line and removes the first
   single quote of each line. It writes to [output_path ^ ".tmp"] and renames
   the result to [output_path] once the pipeline is done.

   Every path is passed through [Filename.quote], so spaces or shell
   metacharacters in a path cannot break (or alter) the command. *)
let sanitize_hlarp_out_for_mhctools ~run_with ~hlarp_result ~output_path = 
  let open KEDSL in
  let input_path = hlarp_result#product#path in
  let name = 
    sprintf
      "Extract and sanitize alleles: %s"
      (hlarp_result#render#name)
  in
  let edges = [ depends_on hlarp_result; ] in
  let product = single_file ~host:(Machine.as_host run_with) output_path in
  let tmp_path = output_path ^ ".tmp" in
  (* Quote before entering the local opens below, so that [Filename] is
     certain to be the standard-library module. *)
  let quoted_input = Filename.quote input_path in
  let quoted_tmp = Filename.quote tmp_path in
  let quoted_output = Filename.quote output_path in
  let make = Machine.(
    run_program run_with
      ~requirements:[ `Quick_run; ]
      Program.(
        shf "cat %s | grep -v '^2' | awk -F , '{ gsub(/^[ \t]+|[ \t]+$/,\"\", $2); print $2}' | tail -n +2 | sed \"s/'//\" > %s && mv %s %s"
           quoted_input quoted_tmp quoted_tmp quoted_output
      )
    )
  in
  workflow_node product ~name ~make ~edges 
end
module Machine 
= struct

(**************************************************************************)


(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *)
(*                                                                        *)
(*  Licensed under the Apache License, Version 2.0 (the "License");       *)
(*  you may not use this file except in compliance with the License.      *)
(*  You may obtain a copy of the License at                               *)
(*                                                                        *)
(*      http://www.apache.org/licenses/LICENSE-2.0                        *)
(*                                                                        *)
(*  Unless required by applicable law or agreed to in writing, software   *)
(*  distributed under the License is distributed on an "AS IS" BASIS,     *)
(*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *)
(*  implied.  See the License for the specific language governing         *)
(*  permissions and limitations under the License.                        *)

(**************************************************************************)




open Common

open KEDSL


module Tool = struct
  module Definition = struct
    type t = {name: string; version: string option}
    let create ?version name  = {name; version}
    let to_opam_name {name; version} =
      sprintf "%s.%s" name (Option.value ~default:"NOVERSION" version)
    let to_string = to_opam_name
    let to_directory_name = to_opam_name
    let get_version t = t.version
    let get_name t = t.name
  end
  (* Default tool definitions (name and, where known, version).
     NOTE(review): [hisat2] is registered under the name hisat with a 2.x
     version; confirm this is intentional before changing it. *)
  module Default = struct
    open Definition
    let bwa = create "bwa" ~version:"0.7.10"
    let freebayes = create "freebayes" ~version:"1.1.0"
    let sambamba = create "sambamba" ~version:"0.6.5"
    let samtools = create "samtools" ~version:"1.4"
    let bcftools = create "bcftools" ~version:"1.4"
    let vcftools = create "vcftools" ~version:"0.1.12b"
    let bedtools = create "bedtools" ~version:"2.23.0"
    let somaticsniper = create "somaticsniper" ~version:"1.0.3"
    let varscan = create "varscan" ~version:"2.3.5"
    let mutect = create "mutect" (* We don't know the versions of the users' GATKs *)
    let gatk = create "gatk" (* idem, because of their non-open-source licenses *)
    let strelka = create "strelka" ~version:"1.0.14"
    let virmid = create "virmid" ~version:"1.1.1"
    let muse = create "muse" ~version:"1.0b"
    let star = create "star" ~version:"2.4.1d"
    let stringtie = create "stringtie" ~version:"1.2.2"
    let cufflinks = create "cufflinks" ~version:"2.2.1"
    let hisat = create "hisat" ~version:"0.1.6-beta"
    let hisat2 = create "hisat" ~version:"2.0.2-beta"
    let mosaik = create "mosaik" ~version:"2.2.3"
    let kallisto = create "kallisto" ~version:"0.42.3"
    let bowtie = create "bowtie" ~version:"1.1.2"
    let fastqc = create "fastqc" ~version:"0.11.5"
    let igvxml = create "igvxml" ~version:"0.1.0"
    let hlarp = create "hlarp" ~version:"biokepi-branch"
    let samblaster = create "samblaster" ~version:"v.0.1.22"
    let delly2 = create "delly2" ~version:"0.7.7"
    (* Bioconda *)
    let optitype = create "optitype" ~version:"1.2.1-0"
    let seqtk = create "seqtk" ~version:"1.2"
    let seq2hla = create "seq2hla" ~version:"2.2"
    let picard = create "picard" ~version:"2.9.2"
    let snpeff = create "snpeff" ~version:"4.3.1m-0"
    (* PyPI packages *)
    let pyensembl = create "pyensembl" ~version:"1.1.0"
    let vcfannotatepolyphen = create "vcf-annotate-polyphen" ~version:"0.1.2"
    let topiary = create "topiary" ~version:"1.2.1"
    let vaxrank = create "vaxrank" ~version:"0.6.0"
    let isovar = create "isovar" ~version:"0.7.0"
  end

  type t = {
    definition: Definition.t;
    init: Program.t;
    ensure: phony_workflow;
  }
  let create ?init ?ensure definition = {
    definition;
    init =
      Option.value init
        ~default:(Program.shf "echo 'Tool %s: default init'"
                    (Definition.to_string definition));
    ensure =
      Option.value_map
        ensure
        ~f:KEDSL.forget_product
        ~default:(workflow_node nothing
                    ~name:(sprintf "%s-ensured"
                             (Definition.to_string definition)));
  }
  let init t = t.init
  let ensure t = t.ensure

  module Kit = struct
    type tool = t
    type t = Definition.t -> tool option

    let concat : t list -> t =
      fun l ->
      fun def ->
        List.find_map l ~f:(fun kit -> kit def)

    let of_list l : t =
      fun def ->
        List.find l ~f:(fun {definition; _} -> definition = def)

    let get_exn t tool =
      match t tool with
      | Some s -> s
      | None ->
        failwithf "Toolkit cannot provide the tool %s"
          (Definition.to_string tool)
  end
end

(** Jobs in Biokepi ask the computing environment (defined below in Machine) for resources.

The implementation of the Make_fun.t function defined by the user is free to interpret those requirements according to the user's computing infrastructure. *)


module Make_fun = struct
  (* The requirements a job can attach to the program it asks the machine to
     run; each constructor is described right after its declaration. *)
  module Requirement = struct
    type t = [
      | `Processors of int

(** A number of cores on a shared-memory setting. *)


      | `Internet_access

(** Able to access public HTTP(S) or FTP URLs. *)


      | `Memory of [
          | `GB of float

(** Ask for a specific amount of memory. *)


          | `Small

(** Tell that the program does not expect HPC-like memory usage (i.e. not more than 2 GB or your usual laptop). *)


          | `Big

(** Tell that the program may ask for a lot of memory but you don't know how much precisely. *)


        ]
      | `Quick_run

(** Programs that run fast, with little resources. Usually, you can interpret this as "OK to run on the login node of my cluster." *)


      | `Spark of string list

(** Ask for a Spark (on-YARN) environment with custom parameters (not in use for now, "#WIP"). *)


      | `Custom of string

(** Pass arbitrary data (useful for temporary extensions/experiments outside of Biokepi). *)


      | `Self_identification of string list

(** Set of names or tags for a workflow-node program to identify itself to the Machine.t. This is useful for quickly bypassing incorrect requirements set in the library (please also report an issue if you need this). *)


    ] [@@deriving yojson, show]
  end

  (* A run function takes an optional [name], an optional list of
     [requirements] and a [Program.t], and returns a
     [KEDSL.Build_process.t]. *)
  type t =
    ?name: string ->
    ?requirements: Requirement.t list ->
    Program.t ->
    KEDSL.Build_process.t

(** The type of the “run function” used across the library. *)

(** A stream processor, for this purpose, is a program that runs on one core and does not grow in memory arbitrarily. *)


  let stream_processor requirements =
    `Processors 1 :: `Memory `Small :: requirements

  let quick requirements = `Quick_run :: requirements

  let downloading requirements =
    `Internet_access :: stream_processor requirements

  let with_self_ids ?self_ids l =
    match self_ids with
    | Some tags -> `Self_identification tags :: l
    | None -> l

  let with_requirements : t -> Requirement.t list -> t = fun f l ->
    fun ?name ?(requirements = []) prog ->
      f ?name ~requirements:(l @ requirements) prog
end

(* Description of a computing environment: where it is, which tools and
   reference genomes it provides, how it runs programs, and its limits. *)
type t = {
  name: string;
  host: Host.t;
  pyensembl_cache_dir: string option;
  get_reference_genome: string -> Reference_genome.t;
  toolkit: Tool.Kit.t;
  run_program: Make_fun.t;
  work_dir: string;
  max_processors: int;
}

(* [create ... name] gathers its arguments into a machine record; only
   [pyensembl_cache_dir] is optional. *)
let create
    ~host ?pyensembl_cache_dir ~get_reference_genome ~toolkit
    ~run_program ~work_dir ~max_processors  name =
  {
    name;
    host;
    pyensembl_cache_dir;
    get_reference_genome;
    toolkit;
    run_program;
    work_dir;
    max_processors;
  }

let name {name; _} = name

(* [as_host ?with_shell t] is the host of [t]. With [~with_shell:shell], the
   host is rebuilt from its URI after replacing the shell query parameter
   with [shell ^ ",-c"]. *)
let as_host ?with_shell t =
  match with_shell with
  | None -> t.host
  | Some shell ->
    let open Ketrew_pure in
    let shell_key = "shell" in
    (* as in `bash -c` *)
    let shell_value = sprintf "%s,-c" shell in
    let uri =
      Host.to_uri t.host
      |> (fun uri -> Uri.remove_query_param uri shell_key)
      |> (fun uri -> Uri.add_query_param uri (shell_key, [shell_value;]))
    in
    KEDSL.Host.parse (Uri.to_string uri)

(* Plain field accessors. *)
let get_pyensembl_cache_dir {pyensembl_cache_dir; _} = pyensembl_cache_dir
let get_reference_genome {get_reference_genome; _} = get_reference_genome

(* [get_tool t definition] is the tool the toolkit of [t] provides for
   [definition]; it raises [Failure] when there is none. *)
let get_tool {toolkit; name; _} definition =
  match toolkit definition with
  | None ->
    failwithf "Machine %S cannot provide the tool %s"
      name (Tool.Definition.to_string definition)
  | Some found -> found

let run_program {run_program; _} = run_program

let max_processors {max_processors; _} = max_processors

(** Get the maximum number of processors that a single job can use in the Machine.t (i.e. usually the “number-of-threads” parameters of most tools) *)



(* The run function of [t] with [`Quick_run] always added to the
   requirements. *)
let quick_run_program t : Make_fun.t =
  let requirements = Make_fun.quick [] in
  Make_fun.with_requirements t.run_program requirements

(** Run a program that does not use much memory and runs on one core. *)


(* The run function of [t] with the stream-processor requirements always
   added, preceded by the self-identification tags when [self_ids] is
   given. *)
let run_stream_processor ?self_ids t : Make_fun.t =
  let requirements =
    Make_fun.with_self_ids ?self_ids (Make_fun.stream_processor []) in
  Make_fun.with_requirements t.run_program requirements

(** Run a program that does not use much memory, runs on one core, and needs the internet. *)


(* The run function of [t] with the downloading requirements always
   added. *)
let run_download_program t : Make_fun.t =
  let requirements = Make_fun.downloading [] in
  Make_fun.with_requirements t.run_program requirements

(* The run function of [t] with big memory and [processors] cores (1 by
   default) always added to the requirements, preceded by the
   self-identification tags when [self_ids] is given. *)
let run_big_program t :
  ?processors: int -> ?self_ids : string list -> Make_fun.t =
  fun ?(processors = 1) ?self_ids ->
    let resources = [`Memory `Big; `Processors processors] in
    let requirements = Make_fun.with_self_ids ?self_ids resources in
    Make_fun.with_requirements t.run_program requirements

(* Plain accessor for the [work_dir] field. *)
let work_dir {work_dir; _} = work_dir
end
module Metadata 
= struct

(** Metadata Module Generated by the Build System *)

(** Official version string of the current build *)


let version = lazy "0.0.0+dev"

(** Current Git commit (if available at build-time) *)


let git_commit = Some "c07982e85f581fc4972b5a6f8d601649e56a4f34"

(** Current result of "git describe" (if available at build-time) *)


let git_description = Some "biokepi.0.0.0-794-gc07982e"
end
module Reference_genome 
: sig

(**************************************************************************)


(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *)
(*                                                                        *)
(*  Licensed under the Apache License, Version 2.0 (the "License");       *)
(*  you may not use this file except in compliance with the License.      *)
(*  You may obtain a copy of the License at                               *)
(*                                                                        *)
(*      http://www.apache.org/licenses/LICENSE-2.0                        *)
(*                                                                        *)
(*  Unless required by applicable law or agreed to in writing, software   *)
(*  distributed under the License is distributed on an "AS IS" BASIS,     *)
(*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *)
(*  implied.  See the License for the specific language governing         *)
(*  permissions and limitations under the License.                        *)

(**************************************************************************)

(** Representation of Reference Genomes *)




open Common

type name = string

module Specification : sig
  module Location : sig
    type t = [
      | `Url of string
      | `Vcf_concat of (string * t) list (* name × location *)
      | `Concat of t list
      | `Gunzip of t (* Should this be guessed from the URL extension? *)
      | `Bunzip2 of t
      | `Untar of t
    ]
    val url : 'a -> [> `Url of 'a ]
    val vcf_concat : 'a -> [> `Vcf_concat of 'a ]
    val concat : 'a -> [> `Concat of 'a ]
    val gunzip : 'a -> [> `Gunzip of 'a ]
    val bunzip2 : 'a -> [> `Bunzip2 of 'a ]
    val untar : 'a -> [> `Untar of 'a ]
  end
  type t = private {
    name : name;
    ensembl : int;
    species : string;
    metadata : string option;
    fasta : Location.t;
    dbsnp : Location.t option;
    known_indels : Location.t option;
    cosmic : Location.t option;
    exome_gtf : Location.t option;
    cdna : Location.t option;
    whess : Location.t option;
    major_contigs : string list option;
    snpeff_name : string option;
  }
  val create :
    ?metadata:string ->
    fasta:Location.t ->
    ensembl:int ->
    species:string ->
    ?dbsnp:Location.t ->
    ?known_indels:Location.t ->
    ?cosmic:Location.t ->
    ?exome_gtf:Location.t ->
    ?cdna:Location.t ->
    ?whess:Location.t ->
    ?major_contigs:string list ->
    ?snpeff_name:string ->
    string ->
    t
  module Default :
  sig
    module Name : sig

(** The “names” of the default genomes; the values are provided to simplify code and make it less typo-error-prone but the string can be used directly (e.g. b37 is just "b37"). *)


      val b37 : name
      val b37decoy : name
      val b38 : name
      val hg38: name
      val hg18 : name
      val hg19 : name
      val mm10 : name
    end
    val b37 : t
    val b37decoy : t
    val b38 : t
    val hg38 : t
    val hg18 : t
    val hg19 : t
    val mm10 : t
  end
end

type t = private {
  specification: Specification.t;
  location : KEDSL.file_workflow;
  cosmic :  KEDSL.file_workflow option;
  dbsnp :  KEDSL.file_workflow option;
  known_indels : KEDSL.file_workflow option;
  gtf : KEDSL.file_workflow option;
  cdna : KEDSL.file_workflow option;
  whess : KEDSL.file_workflow option;
}

(** A reference genome has a name (for display/matching) and a cluster-dependent path. Corresponding Cosmic and dbSNP databases (VCFs) can be added to the mix. *)




val create :
  ?cosmic:KEDSL.file_workflow ->
  ?dbsnp:KEDSL.file_workflow ->
  ?known_indels:KEDSL.file_workflow ->
  ?gtf:KEDSL.file_workflow ->
  ?cdna:KEDSL.file_workflow ->
  ?whess:KEDSL.file_workflow ->
  Specification.t -> KEDSL.file_workflow -> t

(** Build a Reference_genome.t record. *)

(** {2 Usual Accessors} *)




val name : t -> name
val ensembl : t -> int
val species : t -> string
val path : t -> string
val cosmic_path_exn : t -> string
val dbsnp_path_exn : t -> string
val known_indels_path_exn : t -> string
val gtf_path_exn : t -> string
val cdna_path_exn : t -> string
val whess_path_exn : t -> string
val snpeff_name_exn: t -> string

val major_contigs : t -> Region.t list

(** {2 Targets} *)




val fasta: t -> KEDSL.file_workflow
val cosmic_exn: t -> KEDSL.file_workflow
val dbsnp_exn: t -> KEDSL.file_workflow
val known_indels_exn: t -> KEDSL.file_workflow
val gtf_exn: t -> KEDSL.file_workflow
val gtf: t -> KEDSL.file_workflow option
val cdna_exn: t -> KEDSL.file_workflow
val whess_exn: t -> KEDSL.file_workflow
end 
= struct
open Common

type name = string

module Specification = struct
  module Location = struct
    type t = [
      | `Url of string
      | `Vcf_concat of (string * t) list (* name × location *)
      | `Concat of t list
      | `Gunzip of t (* Should this be guessed from the URL extension? *)
      | `Bunzip2 of t
      | `Untar of t
    ]
    let url u = `Url u
    let vcf_concat l = `Vcf_concat l
    let concat l = `Concat l
    let gunzip l = `Gunzip l
    let bunzip2 l = `Bunzip2 l
    let untar l = `Untar l
  end

  type t = {
    name: string;
    ensembl: int;
    species: string;
    metadata: string option;
    fasta: Location.t;
    dbsnp: Location.t option;
    known_indels: Location.t option;
    cosmic: Location.t option;
    exome_gtf: Location.t option; (* maybe desrves a better name? *)
    cdna: Location.t option;
    whess: Location.t option;
    major_contigs: string list option;
    snpeff_name: string option;
  }

  let create
      ?metadata
      ~fasta
      ~ensembl
      ~species
      ?dbsnp
      ?known_indels
      ?cosmic
      ?exome_gtf
      ?cdna
      ?whess
      ?major_contigs
      ?snpeff_name
      name = {
    name;
    ensembl;
    species;
    metadata;
    fasta;
    dbsnp;
    known_indels;
    cosmic;
    exome_gtf;
    cdna;
    whess;
    major_contigs;
    snpeff_name;
  }

module Default = struct

  let major_contigs_b37 =
    List.init 22 (fun i -> sprintf "%d" (i + 1))
    @ ["X"; "Y"; "MT";]
  let major_contigs_hg_family =
    List.init 22 (fun i -> sprintf "chr%d" (i + 1))
    @ [
      "chrX";
      "chrY";
      "chrM";
    ]
  let major_contigs_mm10 =
    List.init 19 (fun i -> sprintf "%d" (i + 1))
    @ [ "X"; "Y" ]

  (* The names of the default genomes, as plain strings. *)
  module Name = struct
    let b37 = "b37"
    let b37decoy = "b37decoy"
    let b38 = "b38"
    let hg38 = "hg38"
    let hg18 = "hg18"
    let hg19 = "hg19"
    let mm10 = "mm10"
  end


  (* Used by both B37 and B37decoy *)
  (* Download locations of the resources attached to those two genomes:
     dbSNP, COSMIC, the Ensembl GTF and cDNA files, the WHESS database and
     the known indels. *)
  let b37_dbsnp_url =
    "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/dbsnp_138.b37.vcf.gz"
  let b37_cosmic_url =
    "http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutect/b37_cosmic_v54_120711.vcf"
  let b37_exome_gtf_url =
    "http://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz"
  let b37_cdna_url =
    "http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
  let b37_whess_url =
    "ftp://genetics.bwh.harvard.edu/pph2/whess/polyphen-2.2.2-whess-2011_12.sqlite.bz2"
  let b37_known_indels_url =
    "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/Mills_and_1000G_gold_standard.indels.b37.vcf.gz"

  (* Species names used by the default specifications. *)
  let human = "homo sapiens"
  let mouse = "mus musculus"

  let b37 =
    create Name.b37
      ~species:human
      ~ensembl:75
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_b37
      ~fasta:Location.(
          url "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/b37/human_g1k_v37.fasta.gz"
          |> gunzip)
      ~dbsnp:Location.(url b37_dbsnp_url |> gunzip)
      (* Alternate?
         "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/v4.0/00-All.vcf.gz"
      *)
      ~known_indels:Location.(url b37_known_indels_url |> gunzip)
      ~cosmic:Location.(url b37_cosmic_url)
      ~exome_gtf:Location.(url b37_exome_gtf_url |> gunzip)
      ~cdna:Location.(url b37_cdna_url |> gunzip)
      ~whess:Location.(url b37_whess_url |> bunzip2)
      ~snpeff_name:"GRCh37.75"

  (* Specification named [Name.b37decoy]: same species, Ensembl release,
     contig list, annotation resources and snpEff name as [b37]; only the
     FASTA differs (the [hs37d5.fa.gz] file from the 1000 Genomes FTP). *)
  let b37decoy =
    create Name.b37decoy
      ~species:human
      ~ensembl:75
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_b37
      ~fasta:Location.(
          url
            "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
          |> gunzip)
      ~dbsnp:Location.(url b37_dbsnp_url |> gunzip)
      ~known_indels:Location.(url b37_known_indels_url |> gunzip)
      ~exome_gtf:Location.(url b37_exome_gtf_url |> gunzip)
      ~cosmic:Location.(url b37_cosmic_url)
      ~cdna:Location.(url b37_cdna_url |> gunzip)
      ~whess:Location.(url b37_whess_url |> bunzip2)
      ~snpeff_name:"GRCh37.75"

  (* Specification named [Name.hg38]: human, Ensembl release 87, using the
     hg-family contig list.  FASTA, dbSNP and known indels all come from the
     Broad "hg38bundle" FTP directory; no COSMIC, GTF, cDNA or WHESS
     location is given here. *)
  let hg38 =
    (* Release 87 *)
    let hg38_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.fasta.gz" in
    let dbsnp_hg38 =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.dbsnp.vcf.gz" in
    let known_indels_hg38 =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.known_indels.vcf.gz" in
    create Name.hg38
      ~species:human
      ~ensembl:87
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_hg_family
      ~fasta:Location.(url hg38_url|> gunzip)
      ~dbsnp:Location.(url dbsnp_hg38 |> gunzip)
      ~known_indels:Location.(url known_indels_hg38 |> gunzip)
      ~snpeff_name:"GRCh38.86"

  (* Specification named [Name.b38]: human, Ensembl release 87.  FASTA, GTF
     and cDNA come from the Ensembl release-87 FTP tree, dbSNP from the NCBI
     "common_all" VCF.  No COSMIC, known-indels or WHESS location is given.
     NOTE(review): the contig list is [major_contigs_b37], i.e. shared with
     the b37 builds -- presumably because the naming scheme is the same;
     confirm. *)
  let b38 =
    (* Release 87 *)
    let b38_url =
      "http://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" in
    let gtf_b38_url =
      "http://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.gtf.gz" in
    let cdna_b38_url =
      "http://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" in
    let dbsnp_url =
      "http://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh38p7/VCF/common_all_20170710.vcf.gz" in
    create Name.b38
      ~species:human
      ~ensembl:87
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_b37
      ~fasta:Location.(url b38_url |> gunzip)
      ~exome_gtf:Location.(url gtf_b38_url |> gunzip)
      ~dbsnp:Location.(url dbsnp_url |> gunzip)
      ~cdna:Location.(url cdna_b38_url |> gunzip)
      ~snpeff_name:"GRCh38.86"

  (* Specification named [Name.hg18]: human, Ensembl release 54, hg-family
     contig list.  Only the FASTA and dbSNP locations (Broad bundle 2.8) are
     provided; in particular no [~snpeff_name] is passed. *)
  let hg18 =
    let hg18_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/Homo_sapiens_assembly18.fasta.gz" in
    let dbsnp_hg18_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/dbsnp_138.hg18.vcf.gz" in
    create Name.hg18
      ~ensembl:54
      ~species:human
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_hg_family
      ~fasta:Location.(url hg18_url|> gunzip)
      ~dbsnp:Location.(url dbsnp_hg18_url |> gunzip)

  (* Specification named [Name.hg19]: human, Ensembl release 75, hg-family
     contig list.  FASTA, dbSNP and known indels come from the Broad bundle
     2.8 "hg19" directory; the WHESS database is the same [b37_whess_url]
     used by the b37 builds. *)
  let hg19 =
    let hg19_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/ucsc.hg19.fasta.gz" in
    let dbsnp_hg19_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/dbsnp_138.hg19.vcf.gz" in
    let known_indels_hg19_url =
      "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz" in
    create Name.hg19
      ~ensembl:75
      ~species:human
      ~metadata:"Provided by the Biokepi library"
      ~major_contigs:major_contigs_hg_family
      ~fasta:Location.(url hg19_url|> gunzip)
      ~dbsnp:Location.(url dbsnp_hg19_url |> gunzip)
      ~known_indels:Location.(url known_indels_hg19_url |> gunzip)
      ~whess:Location.(url b37_whess_url |> bunzip2)
      ~snpeff_name:"hg19"

  (* Specification named [Name.mm10]: the only mouse entry of this module.
     dbSNP is built with [Location.vcf_concat] from two Sanger VCFs (SNPs
     and indels); GTF and cDNA come from the Ensembl FTP.
     NOTE(review): [~ensembl:87] is passed while the GTF/cDNA URLs point at
     "release-84" -- confirm which release is intended.
     NOTE(review): [mm10_url] has no ".gz" suffix but still goes through
     [Location.gunzip] -- presumably tolerated for uncompressed URLs;
     verify. *)
  let mm10 =
    let mm10_url =
      "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/mm10.GRCm38.dna_sm.fa" in
    let dbsnp_mm10_snps_url =
      "ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.snps.rsIDdbSNPv137.vcf.gz" in
    let dbsnp_mm10_indels_url =
      "ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.indels.rsIDdbSNPv137.vcf.gz" in
    let gene_annotations_gtf =
      "ftp://ftp.ensembl.org/pub/release-84/gtf/mus_musculus/Mus_musculus.GRCm38.84.gtf.gz" in
    let cdna_mm10_url =
      "ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" in
    create Name.mm10
      ~ensembl:87
      ~species:mouse
      ~metadata:"Provided by the Biokepi Library"
      ~major_contigs:major_contigs_mm10
      ~fasta:Location.(url mm10_url |> gunzip)
      ~dbsnp:Location.(
          (* Each pair is (file name, location) as given to [vcf_concat]. *)
          vcf_concat ["db_snps.vcf", url dbsnp_mm10_snps_url |> gunzip;
                      "db_indels.vcf", url dbsnp_mm10_indels_url |> gunzip]
        )
      ~exome_gtf:Location.(url gene_annotations_gtf |> gunzip)
      ~cdna:Location.(url cdna_mm10_url |> gunzip)
      ~snpeff_name:"mm10"

end

end

(** A reference genome: a {!Specification.t} (which carries the name used for
    display/matching) together with the workflow nodes producing its files.
    The main [location] node is mandatory; the companion databases
    (COSMIC, dbSNP, known indels, GTF, cDNA, WHESS) are all optional. *)


type t = {
  specification: Specification.t; (** Static description (name, species, …). *)
  location: KEDSL.file_workflow; (** Main file node; returned by [fasta]. *)
  cosmic:  KEDSL.file_workflow option; (** COSMIC file node, if any. *)
  dbsnp:  KEDSL.file_workflow option; (** dbSNP file node, if any. *)
  known_indels:  KEDSL.file_workflow option; (** Known-indels file node, if any. *)
  gtf:  KEDSL.file_workflow option; (** GTF annotation file node, if any. *)
  cdna: KEDSL.file_workflow option; (** cDNA fasta file node, if any. *)
  whess: KEDSL.file_workflow option; (** WHESS file node, if any. *)
}

(** [create ?cosmic ?dbsnp ?known_indels ?gtf ?cdna ?whess specification
    location] builds a reference-genome value; every omitted optional
    argument is stored as [None]. *)
let create
    ?cosmic ?dbsnp ?known_indels ?gtf ?cdna ?whess
    specification location =
  {
    specification = specification;
    location = location;
    cosmic = cosmic;
    dbsnp = dbsnp;
    known_indels = known_indels;
    gtf = gtf;
    cdna = cdna;
    whess = whess;
  }

(** Name recorded in the specification of the reference. *)
let name { specification; _ } = specification.Specification.name

(** Ensembl release recorded in the specification. *)
let ensembl { specification; _ } = specification.Specification.ensembl

(** Species recorded in the specification. *)
let species { specification; _ } = specification.Specification.species

(** snpEff database name of the specification; fails through
    [Option.value_exn] with a ["<name>: no snpEff name"] message when the
    specification has none. *)
let snpeff_name_exn t =
  let msg = sprintf "%s: no snpEff name" (name t) in
  Option.value_exn ~msg t.specification.Specification.snpeff_name

(** Path of the product of the main [location] node. *)
let path { location; _ } = location#product#path
(* The [*_path_exn] functions return the product path of the corresponding
   optional node; when the node is absent they fail through
   [Option.value_exn] with the message "<function name> of <reference name>". *)

(** Path of the COSMIC file. *)
let cosmic_path_exn t =
  let node =
    Option.value_exn t.cosmic
      ~msg:(sprintf "cosmic_path_exn of %s" (name t)) in
  node#product#path

(** Path of the dbSNP file. *)
let dbsnp_path_exn t =
  let node =
    Option.value_exn t.dbsnp
      ~msg:(sprintf "dbsnp_path_exn of %s" (name t)) in
  node#product#path

(** Path of the known-indels file. *)
let known_indels_path_exn t =
  let node =
    Option.value_exn t.known_indels
      ~msg:(sprintf "known_indels_path_exn of %s" (name t)) in
  node#product#path

(** Path of the GTF file. *)
let gtf_path_exn t =
  let node =
    Option.value_exn t.gtf
      ~msg:(sprintf "gtf_path_exn of %s" (name t)) in
  node#product#path

(** Path of the cDNA file. *)
let cdna_path_exn t =
  let node =
    Option.value_exn t.cdna
      ~msg:(sprintf "cdna_path_exn of %s" (name t)) in
  node#product#path

(** Path of the WHESS file. *)
let whess_path_exn t =
  let node =
    Option.value_exn t.whess
      ~msg:(sprintf "whess_path_exn of %s" (name t)) in
  node#product#path

(** Workflow node of the main file of the reference (the [location] field). *)
let fasta (t : t) : KEDSL.file_workflow = t.location

(* The [*_exn] functions below return the optional workflow node itself and
   fail through [Option.value_exn], with a "<name>: no ..." message, when the
   reference does not provide it. *)

(** COSMIC node. *)
let cosmic_exn t =
  let msg = sprintf "%s: no COSMIC" (name t) in
  Option.value_exn ~msg t.cosmic

(** dbSNP node. *)
let dbsnp_exn t =
  let msg = sprintf "%s: no DBSNP" (name t) in
  Option.value_exn ~msg t.dbsnp

(** Known-indels node. *)
let known_indels_exn t =
  let msg = sprintf "%s: no Known Indels" (name t) in
  Option.value_exn ~msg t.known_indels

(** GTF node. *)
let gtf_exn t =
  let msg = sprintf "%s: no GTF" (name t) in
  Option.value_exn ~msg t.gtf

(** GTF node as an option (never fails). *)
let gtf { gtf = gtf_node; _ } = gtf_node

(** cDNA node. *)
let cdna_exn t =
  let msg = sprintf "%s: no cDNA fasta file" (name t) in
  Option.value_exn ~msg t.cdna

(** WHESS node. *)
let whess_exn t =
  let msg = sprintf "%s: no WHESS file" (name t) in
  Option.value_exn ~msg t.whess

(** [major_contigs t] returns the major contigs listed in the specification
    of [t], each wrapped as a [`Chromosome] region.

    @raise Failure (through [failwithf]) when the specification does not
    define any major contigs. *)
let major_contigs t : Region.t list =
  match t.specification.Specification.major_contigs with
  | None ->
    (* The message used to read "does have", i.e. the opposite of the error
       being reported. *)
    failwithf "Reference %S does not have major-contigs/chromosomes defined"
      (name t)
  | Some l -> List.map l ~f:(fun s -> `Chromosome s)
end
module Region 
= struct

(**************************************************************************)


(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *)
(*                                                                        *)
(*  Licensed under the Apache License, Version 2.0 (the "License");       *)
(*  you may not use this file except in compliance with the License.      *)
(*  You may obtain a copy of the License at                               *)
(*                                                                        *)
(*      http://www.apache.org/licenses/LICENSE-2.0                        *)
(*                                                                        *)
(*  Unless required by applicable law or agreed to in writing, software   *)
(*  distributed under the License is distributed on an "AS IS" BASIS,     *)
(*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *)
(*  implied.  See the License for the specific language governing         *)
(*  permissions and limitations under the License.                        *)

(**************************************************************************)




open Common

(** A region of a genome. Positions are 1-based.

    - [`Chromosome name]: a whole chromosome/contig.
    - [`Chromosome_interval (name, b, e)]: an interval on a chromosome; it is
      rendered as ["name:b-e"] by [to_samtools_specification].
    - [`Full]: no restriction ([to_samtools_specification] returns [None]). *)


type t = [
  | `Chromosome of string
  | `Chromosome_interval of string * int * int
  | `Full
]

(** Turn a region into a string usable inside a file name: ["Full"], the
    chromosome name, or ["<chromosome>_<begin>-<end>"] for an interval. *)


let to_filename region =
  match region with
  | `Chromosome_interval (chromosome, start, stop) ->
    sprintf "%s_%d-%d" chromosome start stop
  | `Chromosome chromosome -> chromosome
  | `Full -> "Full"

(** Render a region in samtools' ["chr"] / ["chr:begin-end"] notation;
    [`Full] has no such notation and gives [None]. *)
let to_samtools_specification region =
  match region with
  | `Chromosome_interval (chromosome, start, stop) ->
    Some (sprintf "%s:%d-%d" chromosome start stop)
  | `Chromosome chromosome -> Some chromosome
  | `Full -> None

(** Command-line fragment ["-r <region>"] for the region, or the empty string
    when the region is [`Full]. *)
let to_samtools_option region =
  match to_samtools_specification region with
  | None -> ""
  | Some specification -> "-r " ^ specification

(** Command-line fragment ["--intervals <region>"] (the region is written in
    the same notation as [to_samtools_specification]), or the empty string
    when the region is [`Full]. *)
let to_gatk_option region =
  match to_samtools_specification region with
  | None -> ""
  | Some specification -> "--intervals " ^ specification

(** Parse a region written as ["chr"] or ["chr:begin-end"].

    A string without [':'] becomes [`Chromosome]; with exactly one [':'] the
    right-hand side must be two integers separated by a single ['-'] and the
    result is a [`Chromosome_interval].  Any other shape fails through
    [failwithf]. *)
let parse_samtools s =
  (* Parses the "begin-end" part that follows the ':'. *)
  let parse_interval chromosome interval =
    match String.split ~on:(`Character '-') interval with
    | [left; right] ->
      begin match Int.of_string left, Int.of_string right with
      | Some b, Some e -> `Chromosome_interval (chromosome, b, e)
      | _ -> failwithf "Cannot parse %S into 2 loci" interval
      end
    | _ -> failwithf "Not one '-' in %S" interval
  in
  match String.split ~on:(`Character ':') s with
  | [chromosome] -> `Chromosome chromosome
  | [chromosome; interval] -> parse_interval chromosome interval
  | [] -> assert false
  | _ -> failwithf "Not one or zero ':' in %S" s


(** Cmdliner term for an optional [-R]/[--region] argument: when the option
    is absent the term evaluates to [`Full], otherwise the given string is
    parsed with [parse_samtools]. *)
let cmdliner_term () =
  let open Cmdliner in
  let region_of_argument = function
    | Some s -> parse_samtools s
    | None -> `Full in
  let region_argument =
    Arg.(
      value & opt (some string) None
      & info ["R"; "region"] ~docv:"REGION"
        ~doc:"Specify a region; using samtools' format"
    ) in
  Term.(pure region_of_argument $ region_argument)
end
module Tool_parameters 
= struct
open Common


(** A named set of tool parameters. *)
type t = {
  name: string; (** Emitted under the ["name"] key by [to_json]; not used by [render]. *)
  parameters: (string * string) list;
  (** Pairs of strings; [render] flattens each pair into two consecutive
      list items, [to_json] emits them as an object. *)
}

(** JSON view of the parameters: an object with the ["name"] string and a
    ["parameters"] object mapping the first component of each pair to the
    second one (as a JSON string). *)
let to_json t: Yojson.Basic.json =
  let json_parameters =
    List.map t.parameters ~f:(fun (key, value) -> (key, `String value)) in
  `Assoc [
    "name", `String t.name;
    "parameters", `Assoc json_parameters;
  ]

(** Flatten the parameters into a single list of strings: each
    [(a, b)] pair contributes [a] then [b], in order. *)
let render t =
  let both_components (first, second) = [first; second] in
  List.concat_map t.parameters ~f:both_components
end
module Workflow_utilities 
= struct

(** Small/useful workflow-nodes. *)


open Common


module Remove = struct
  let file ~run_with path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rm-%s" (Filename.basename path))
      ~ensures:(`Is_verified (`Command_returns (
          Command.shell ~host:Machine.(as_host run_with)
            (sprintf "ls %s" path),
          2)))
      ~make:(Machine.quick_run_program
               run_with Program.(exec ["rm"; "-f"; path]))
      ~tags:[Target_tags.clean_up]

  let directory ~run_with path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rmdir-%s" (Filename.basename path))
      ~ensures:(`Is_verified (`Command_returns (
          Command.shell ~host:Machine.(as_host run_with)
            (sprintf "ls %s" path),
          2
        )))
      ~make:(Machine.quick_run_program
               run_with Program.(exec ["rm"; "-rf"; path]))
      ~tags:[Target_tags.clean_up]

  (* This one is dirtier, it does not check its result and uses the `Host.t`
     directly, it should be used only when the `Machine.t` is not available
     (i.e. while defining a `Machine.t`). *)
  let path_on_host ~host path =
    let open KEDSL in
    workflow_node nothing
      ~name:(sprintf "rm-%s" (Filename.basename path))
      ~make:(daemonize ~using:`Python_daemon ~host
               Program.(exec ["rm"; "-rf"; path]))
end


module Gunzip = struct

(** [concat ~run_with nodes ~result_path] runs
    "gunzip -c <paths of nodes> > result_path" on [run_with], after creating
    the parent directory of [result_path].  The returned node depends on
    every input node and activates the removal of [result_path] on failure.
    Example: gunzip a list of [fastq.gz] files into one [some_name_cat.fastq]. *)


  let concat ~(run_with : Machine.t) bunch_of_dot_gzs ~result_path =
    let open KEDSL in
    let program =
      Program.(
        exec ["mkdir"; "-p"; Filename.dirname result_path]
        && shf "gunzip -c  %s > %s"
          (List.map bunch_of_dot_gzs
             ~f:(fun o -> Filename.quote o#product#path)
           |> String.concat ~sep:" ")
          (* The redirect target goes through the shell too: quote it like
             the inputs so that the file written is exactly [result_path]. *)
          (Filename.quote result_path)
      ) in
    let name =
      sprintf "gunzipcat-%s" (Filename.basename result_path) in
    workflow_node
      (single_file result_path ~host:Machine.(as_host run_with))
      ~name
      ~make:(Machine.run_stream_processor ~name run_with  program)
      ~edges:(
        on_failure_activate Remove.(file ~run_with result_path)
        :: List.map ~f:depends_on bunch_of_dot_gzs)
end


module Cat = struct
  let concat ~(run_with : Machine.t) bunch_of_files ~result_path =
    let open KEDSL in
    let program =
      Program.(
        exec ["mkdir"; "-p"; Filename.dirname result_path]
        && shf "cat %s > %s"
          (List.map bunch_of_files
             ~f:(fun o -> Filename.quote o#product#path)
           |> String.concat ~sep:" ") result_path
      ) in
    let name =
      sprintf "concat-all-%s" (Filename.basename result_path) in
    workflow_node
      (single_file result_path ~host:Machine.(as_host run_with))
      ~name
      ~edges:(
        on_failure_activate Remove.(file ~run_with result_path)
        :: List.map ~f:depends_on bunch_of_files)
      ~make:(Machine.run_stream_processor run_with ~name  program)

  let cat_folder ~host
      ~(run_program : Machine.Make_fun.t)
      ?(depends_on=[]) ~files_gzipped ~folder ~destination = 
    let deps = depends_on in
    let open KEDSL in
    let name = "cat-folder-" ^ Filename.quote folder in
    let edges =
      on_failure_activate (Remove.path_on_host ~host destination)
      :: List.map ~f:depends_on deps in
    if files_gzipped then (
      workflow_node (single_file destination ~host)
        ~edges ~name
        ~make:(
          run_program ~name
            Program.(
              shf "gunzip -c %s/* > %s" (Filename.quote folder)
                (Filename.quote destination)))
    ) else (
      workflow_node
        (single_file destination ~host)
        ~edges ~name
        ~make:(
          run_program ~name
            Program.(
              shf "cat %s/* > %s" (Filename.quote folder) (Filename.quote destination)))
    )

end

module Download = struct

  let wget_program ?output_filename url =
    KEDSL.Program.exec [
      "wget";
      "-O"; Option.value output_filename ~default:Filename.(basename url);
      url
    ]

  let wget_to_folder
      ~host ~(run_program : Machine.Make_fun.t)
      ~test_file ~destination url  =
    let open KEDSL in
    let name = "wget-" ^ Filename.basename destination in
    let test_target = destination // test_file in
    workflow_node (single_file test_target ~host) ~name
      ~make:(
        run_program ~name
          ~requirements:(Machine.Make_fun.downloading [])
          Program.(
            exec ["mkdir"; "-p"; destination]
            && shf "wget %s -P %s"
              (Filename.quote url)
              (Filename.quote destination)))
      ~edges:[
        on_failure_activate (Remove.path_on_host ~host destination);
      ]

  let wget
      ~host ~(run_program : Machine.Make_fun.t)
      url destination =
    let open KEDSL in
    let name = "wget-" ^ Filename.basename destination in
    workflow_node
      (single_file destination ~host) ~name
      ~make:(
        run_program ~name
          ~requirements:(Machine.Make_fun.downloading [])
          Program.(
            exec ["mkdir"; "-p"; Filename.dirname destination]
            && shf "wget %s -O %s"
              (Filename.quote url) (Filename.quote destination)))
      ~edges:[
        on_failure_activate (Remove.path_on_host ~host destination);
      ]

  let wget_gunzip
      ~host ~(run_program : Machine.Make_fun.t)
      ~destination url =
    let open KEDSL in
    let is_gz = Filename.check_suffix url ".gz" in
    if is_gz then (
      let name = "gunzip-" ^ Filename.basename (destination ^ ".gz") in
      let wgot = wget ~host ~run_program url (destination ^ ".gz") in
      workflow_node
        (single_file destination ~host)
        ~edges:[
          depends_on (wgot);
          on_failure_activate (Remove.path_on_host ~host destination);
        ]
        ~name
        ~make:(
          run_program ~name
            ~requirements:(Machine.Make_fun.stream_processor [])
            Program.(shf "gunzip -c %s > %s"
                       (Filename.quote wgot#product#path)
                       (Filename.quote destination)))
    ) else (
      wget ~host ~run_program url destination
    )

  let wget_bunzip2
      ~host ~(run_program : Machine.Make_fun.t)
      ~destination url =
    let open KEDSL in
    let is_bz2 = Filename.check_suffix url ".bz2" in
    if is_bz2 then (
      let name = "bunzip2-" ^ Filename.basename (destination ^ ".bz2") in
      let wgot = wget ~host ~run_program url (destination ^ ".bz2") in
      workflow_node
        (single_file destination ~host)
        ~edges:[
          depends_on (wgot);
          on_failure_activate (Remove.path_on_host ~host destination);
        ]
        ~name
        ~make:(
          run_program ~name
            ~requirements:(Machine.Make_fun.stream_processor [])
            Program.(shf "bunzip2 -c %s > %s"
                       (Filename.quote wgot#product#path)
                       (Filename.quote destination)))
    ) else (
      wget ~host ~run_program url destination
    )

  let wget_untar
      ~host ~(run_program : Machine.Make_fun.t)
      ~destination_folder ~tar_contains url =
    let open KEDSL in
    let zip_flags =
      let is_gz = Filename.check_suffix url ".gz" in
      let is_bzip = Filename.check_suffix url ".bz2" in
      if is_gz then "z" else if is_bzip then "j" else ""
    in
    let tar_filename = (destination_folder // "archive.tar") in
    let name = "untar-" ^ tar_filename in
    let wgot = wget ~host ~run_program url tar_filename in
    let file_in_tar = (destination_folder // tar_contains) in
    workflow_node
      (single_file file_in_tar ~host)
      ~edges:[
        depends_on (wgot);
        on_failure_activate (Remove.path_on_host ~host destination_folder);
      ]
      ~name
      ~make:(
        run_program ~name
          ~requirements:(Machine.Make_fun.stream_processor [])
          Program.(
            exec ["mkdir"; "-p"; destination_folder]
            && shf "tar -x%s -f %s -C %s"
              zip_flags
              (Filename.quote wgot#product#path)
              (Filename.quote destination_folder)))
      
  (* Where [get_tool_file] fetches a tool file from:
     - [`Scp s]: [s] is copied with "scp";
     - [`Wget s]: [s] is downloaded with "wget";
     - [`Fail msg]: nothing can be fetched; the generated program prints
       [msg] and exits with status 4. *)
  type tool_file_location = [
    | `Scp of string
    | `Wget of string
    | `Fail of string
  ]

  let get_tool_file
      ~identifier
      ~(run_program : Machine.Make_fun.t)
      ~host ~install_path
      loc =
    let open KEDSL in
    let rm_path = Remove.path_on_host in
    let jar_name =
      match loc with
      | `Fail s -> sprintf "cannot-get-%s.file" identifier
      | `Scp s -> Filename.basename s
      | `Wget s -> Filename.basename s in
    let local_box_path = install_path // jar_name in
    workflow_node (single_file local_box_path ~host)
      ~name:(sprintf "get-%s" jar_name)
      ~edges:[
        on_failure_activate (rm_path ~host local_box_path)
      ]
      ~make:(
        run_program
          ~requirements:[
            `Internet_access;
            `Self_identification [identifier ^ "-instalation"; jar_name];
          ]
          Program.(
            shf "mkdir -p %s" install_path
            && begin match loc with
            | `Fail msg ->
              shf "echo 'Cannot download file for %s: %s'" identifier msg
              && sh "exit 4"
            | `Scp s ->
              shf "scp %s %s"
                (Filename.quote s) (Filename.quote local_box_path)
            | `Wget s ->
              shf "wget %s -O %s"
                (Filename.quote s) (Filename.quote local_box_path)
            end))

  let gsutil_cp
      ~(run_program : Machine.Make_fun.t)
      ~host ~url ~local_path =
    let open KEDSL in
    workflow_node (single_file ~host local_path)
      ~name:(sprintf "GSUtil-CP: %s" (Filename.basename local_path))
      ~edges:[
        on_failure_activate (Remove.path_on_host ~host local_path)
      ]
      ~make:(
        run_program
          ~requirements:[
            `Internet_access;
            `Self_identification ["gsutil-cp"; url];
          ]
          Program.(
            shf "mkdir -p %s" (Filename.dirname local_path)
            && exec ["gsutil"; "cp"; url; local_path]
          )
      )
end

module Vcftools = struct

(** Call a command on a list of [~vcfs] to produce a given [~final_vcf]
    (hence the n-to-1 naming).

    The program initializes the [vcftools] tool and then runs
    "<command_prefix> <quoted paths of vcfs> > <final_vcf>".
    [command_prefix] is inserted verbatim (it may contain options).  The
    node's product is [make_product final_vcf]; it depends on the tool, on
    every input VCF and on [more_edges] (default: none), and [final_vcf] is
    removed on failure. *)


  let vcf_process_n_to_1_no_machine
      ~host
      ~vcftools
      ~(run_program : Machine.Make_fun.t)
      ?(more_edges = [])
      ~vcfs
      ~make_product
      ~final_vcf
      command_prefix
    =
    let open KEDSL in
    let name = sprintf "%s-%s" command_prefix (Filename.basename final_vcf) in
    let make =
      run_program ~name
        Program.(
          Machine.Tool.(init vcftools)
          && shf "%s %s > %s"
            command_prefix
            (String.concat ~sep:" "
               (List.map vcfs ~f:(fun t -> Filename.quote t#product#path)))
            (* Quote the redirect target like the inputs, so that the shell
               writes exactly to [final_vcf]. *)
            (Filename.quote final_vcf)
        ) in
    workflow_node ~name
      (make_product final_vcf)
      ~make
      ~edges:(
        on_failure_activate
          (Remove.path_on_host ~host final_vcf)
        :: depends_on Machine.Tool.(ensure vcftools)
        :: List.map ~f:depends_on vcfs
        @ more_edges)

(** Concatenate VCF files with "vcf-concat".

This is the variant for the places where no Machine.t is available yet, as in "download_reference_genome.ml". *)


  let vcf_concat_no_machine
      ~host
      ~vcftools
      ~(run_program : Machine.Make_fun.t)
      ?more_edges
      ~make_product
      vcfs
      ~final_vcf =
    let command = "vcf-concat" in
    vcf_process_n_to_1_no_machine
      ~host ~vcftools ~run_program
      ?more_edges
      ~vcfs ~final_vcf
      ~make_product
      command

(** Sort a VCF file by chromosome position (it uses "vcf-sort" which itself relies on the "sort" unix tool having the "--version-sort" option).  The program is run with an additional big-memory requirement.

This is the variant for the places where no Machine.t is available yet, as in "download_reference_genome.ml". *)


  let vcf_sort_no_machine
      ~host
      ~vcftools
      ~(run_program : Machine.Make_fun.t)
      ?more_edges
      ~make_product
      ~src ~dest () =
    let run_with_big_memory =
      Machine.Make_fun.with_requirements run_program [`Memory `Big] in 
    vcf_process_n_to_1_no_machine
      ~host ~vcftools
      ~run_program:run_with_big_memory
      ?more_edges
      ~vcfs:[src] ~final_vcf:dest
      ~make_product
      "vcf-sort -c"
end

module Variable_tool_paths = struct
  (* [single_file ~run_with ~tool path] is a product object whose [is_done]
     condition runs, on [run_with] with a "bash" shell, the initialization
     of [tool] followed by "test -e <path>", and expects exit status 0.
     [path] is inserted unquoted in the command. *)
  let single_file ~run_with ~tool path =
    let open KEDSL in
    let tool_init = Machine.Tool.init tool in
    let host = Machine.as_host ~with_shell:"bash" run_with in
    let existence_test =
      Program.(tool_init && shf "test -e %s" path)
      |> Ketrew_pure.Program.to_single_shell_command in
    let condition = KEDSL.Command.shell ~host existence_test in
    object
      method is_done = Some (`Command_returns (condition, 0))
    end
end


end