struct
(**************************************************************************)
(*  Copyright 2014, Sebastien Mondet <seb@mondet.org>                     *) (*                                                                        *) (*  Licensed under the Apache License, Version 2.0 (the "License");       *) (*  you may not use this file except in compliance with the License.      *) (*  You may obtain a copy of the License at                               *) (*                                                                        *) (*      http://www.apache.org/licenses/LICENSE-2.0                        *) (*                                                                        *) (*  Unless required by applicable law or agreed to in writing, software   *) (*  distributed under the License is distributed on an "AS IS" BASIS,     *) (*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *) (*  implied.  See the License for the specific language governing         *) (*  permissions and limitations under the License.                        *)
(**************************************************************************)
open Common
open KEDSL

(** Descriptions of external tools and of the kits that provide them. *)
module Tool = struct

  (** Identity of a tool: a name and an optional version. *)
  module Definition = struct
    type t = {name: string; version: string option}

    (** [create ?version name] builds a definition for [name]. *)
    let create ?version name = {name; version}

    (** Render a definition as [name.version]; the version part is
        the literal [NOVERSION] when no version was given. *)
    let to_opam_name t =
      let version_string =
        match t.version with
        | Some v -> v
        | None -> "NOVERSION"
      in
      sprintf "%s.%s" t.name version_string

    (** Same rendering as {!to_opam_name}. *)
    let to_string t = to_opam_name t

    (** Same rendering as {!to_opam_name}. *)
    let to_directory_name t = to_opam_name t

    let get_version {version; _} = version

    let get_name {name; _} = name
  end

  (** Default definitions (name and, when known, version). *)
  module Default = struct
    open Definition

    let bwa = create "bwa" ~version:"0.7.10"
    let freebayes = create "freebayes" ~version:"1.1.0"
    let sambamba = create "sambamba" ~version:"0.6.5"
    let samtools = create "samtools" ~version:"1.4"
    let bcftools = create "bcftools" ~version:"1.4"
    let vcftools = create "vcftools" ~version:"0.1.12b"
    let bedtools = create "bedtools" ~version:"2.23.0"
    let somaticsniper = create "somaticsniper" ~version:"1.0.3"
    let varscan = create "varscan" ~version:"2.3.5"
    let mutect = create "mutect" (* We don't know the versions of the users' GATKs *)
    let gatk = create "gatk" (* idem, because of their non-open-source licenses *)
    let strelka = create "strelka" ~version:"1.0.14"
    let virmid = create "virmid" ~version:"1.1.1"
    let muse = create "muse" ~version:"1.0b"
    let star = create "star" ~version:"2.4.1d"
    let stringtie = create "stringtie" ~version:"1.2.2"
    let cufflinks = create "cufflinks" ~version:"2.2.1"
    let hisat = create "hisat" ~version:"0.1.6-beta"
    (* NOTE(review): hisat2 reuses the package name "hisat" with a 2.x
       version; presumably intentional, confirm before changing. *)
    let hisat2 = create "hisat" ~version:"2.0.2-beta"
    let mosaik = create "mosaik" ~version:"2.2.3"
    let kallisto = create "kallisto" ~version:"0.42.3"
    let bowtie = create "bowtie" ~version:"1.1.2"
    let fastqc = create "fastqc" ~version:"0.11.5"
    let igvxml = create "igvxml" ~version:"0.1.0"
    let hlarp = create "hlarp" ~version:"biokepi-branch"
    let samblaster = create "samblaster" ~version:"v.0.1.22"
    let delly2 = create "delly2" ~version:"0.7.7"

    (* Bioconda *)
    let optitype = create "optitype" ~version:"1.2.1-0"
    let seqtk = create "seqtk" ~version:"1.2"
    let seq2hla = create "seq2hla" ~version:"2.2"
    let picard = create "picard" ~version:"2.9.2"
    let snpeff = create "snpeff" ~version:"4.3.1m-0"

    (* PyPI packages *)
    let pyensembl = create "pyensembl" ~version:"1.1.0"
    let vcfannotatepolyphen = create "vcf-annotate-polyphen" ~version:"0.1.2"
    let topiary = create "topiary" ~version:"1.2.1"
    let vaxrank = create "vaxrank" ~version:"0.6.0"
    let isovar = create "isovar" ~version:"0.7.0"
  end

  (** A tool: its definition, a program stored as [init], and a workflow
      node stored as [ensure]. *)
  type t = {
    definition: Definition.t;
    init: Program.t;
    ensure: phony_workflow;
  }

  (** [create ?init ?ensure definition] builds a tool.

      - [init] defaults to a shell program echoing a default-init message
        that mentions the tool.
      - [ensure], when given, is passed through [KEDSL.forget_product];
        otherwise a workflow node over [nothing] named after the tool is
        used. *)
  let create ?init ?ensure definition =
    let tool_name = Definition.to_string definition in
    (* Both defaults are built unconditionally, before looking at the
       optional arguments. *)
    let default_init =
      Program.shf "echo 'Tool %s: default init'" tool_name in
    let default_ensure =
      workflow_node nothing ~name:(sprintf "%s-ensured" tool_name) in
    {
      definition;
      init = Option.value init ~default:default_init;
      ensure =
        Option.value_map ensure
          ~f:KEDSL.forget_product
          ~default:default_ensure;
    }

  let init {init; _} = init

  let ensure {ensure; _} = ensure

  (** A kit maps tool definitions to tools it is able to provide. *)
  module Kit = struct
    type tool = t
    type t = Definition.t -> tool option

    (** Query the kits in list order and return the first answer. *)
    let concat : t list -> t =
      fun kits ->
      fun def ->
        List.find_map kits ~f:(fun kit -> kit def)

    (** Kit backed by a list of tools, looked up by definition equality. *)
    let of_list tools : t =
      fun def ->
        List.find tools ~f:(fun candidate -> candidate.definition = def)

    (** Like applying the kit directly, but fails (through [failwithf])
        when the kit has no answer for [tool]. *)
    let get_exn t tool =
      match t tool with
      | None ->
        failwithf "Toolkit cannot provide the tool %s"
          (Definition.to_string tool)
      | Some found -> found
  end
end
(** Jobs in Biokepi ask the computing environment (defined below in Machine) for resources.

The implementation of the Make_fun.t function defined by the user is free to interpret those requirements according to the user's computing infrastructure. *)

module Make_fun = struct
  (* Resource requirements that a job can attach to a program; the
     serializers/printers are generated by the deriving attribute below. *)
  module Requirement = struct
    type t = [
      | `Processors of int
          (** A number of cores on a shared-memory setting. *)
      | `Internet_access
        (** Able to access public HTTP(S) or FTP URLs. *)
      | `Memory of [
          | `GB of float
              (** Ask for a specific amount of memory. *)
          | `Small
            (** Tell that the program does not expect HPC-like memory usage (i.e. not more than 2 GB or your usual laptop). *)
          | `Big
            (** Tell that the program may ask for a lot of memory but you don't know how much precisely. *)
        ]
      | `Quick_run
        (** Programs that run fast, with little resources. Usually, you can interpret this as "OK to run on the login node of my cluster." *)
      | `Spark of string list
           (** Ask for a Spark (on-YARN) environment with custom parameters (not in use for now, "#WIP"). *)
      | `Custom of string
          (** Pass arbitrary data (useful for temporary extensions/experiments outside of Biokepi). *)
      | `Self_identification of string list
        (** Set of names or tags for a workflow-node program to identify itself to the Machine.t. This is useful for quickly bypassing incorrect requirements set in the library (please also report an issue if you need this). *)
    ] [@@deriving yojson, show]
  end
  (* A run function takes an optional name, an optional list of
     requirements and a program, and returns a build process. *)
  type t =
    ?name: string ->
    ?requirements: Requirement.t list ->
    Program.t ->
    KEDSL.Build_process.t
  (** The type of the “run function” used across the library. *)
  
  (** A stream processor, for this purpose, is a program that runs on one core and does not grow in memory arbitrarily. *)
let stream_processor requirements =
  [`Processors 1; `Memory `Small] @ requirements

(** Prepend the quick-run requirement to [requirements]. *)
let quick requirements = `Quick_run :: requirements

(** Stream-processor requirements plus internet access. *)
let downloading requirements =
  let base = stream_processor requirements in
  `Internet_access :: base

(** Prepend a self-identification requirement when [self_ids] is given;
    return the list unchanged otherwise. *)
let with_self_ids ?self_ids l =
  match self_ids with
  | None -> l
  | Some tags -> `Self_identification tags :: l

(** [with_requirements f l] is a run function that calls [f] with [l]
    prepended to the requirements supplied by the caller (none by
    default). *)
let with_requirements : t -> Requirement.t list -> t = fun run extra ->
  fun ?name ?(requirements = []) prog ->
    let all_requirements = extra @ requirements in
    run ?name ~requirements:all_requirements prog
end (* Make_fun *)

(** Description of a computing environment. *)
type t = {
  name: string;
  host: Host.t;
  pyensembl_cache_dir: string option;
  get_reference_genome: string -> Reference_genome.t;
  toolkit: Tool.Kit.t;
  run_program: Make_fun.t;
  work_dir: string;
  max_processors: int;
}

(** Build a machine from its components; [name] is the only positional
    argument. *)
let create
    ~host ?pyensembl_cache_dir ~get_reference_genome ~toolkit
    ~run_program ~work_dir ~max_processors name =
  {
    name;
    host;
    pyensembl_cache_dir;
    get_reference_genome;
    toolkit;
    run_program;
    work_dir;
    max_processors;
  }

let name machine = machine.name

(** Host of the machine. When [with_shell] is given, the host is rebuilt
    from its URI with the [shell] query parameter replaced by the given
    shell followed by [,-c]. *)
let as_host ?with_shell t =
  match with_shell with
  | None -> t.host
  | Some shell ->
    let open Ketrew_pure in
    let shell_key = "shell" in
    let shell_str = sprintf "%s,-c" shell in (* as in `bash -c` *)
    let uri_with_shell =
      Host.to_uri t.host
      |> (fun uri -> Uri.remove_query_param uri shell_key)
      |> (fun uri -> Uri.add_query_param uri (shell_key, [shell_str]))
    in
    KEDSL.Host.parse (Uri.to_string uri_with_shell)

let get_pyensembl_cache_dir machine = machine.pyensembl_cache_dir

let get_reference_genome machine = machine.get_reference_genome

(** Look [tool] up in the toolkit of the machine; fails (through
    [failwithf]) when the toolkit does not provide it. *)
let get_tool t tool =
  match t.toolkit tool with
  | None ->
    failwithf "Machine %S cannot provide the tool %s"
      t.name (Tool.Definition.to_string tool)
  | Some found -> found

let run_program machine = machine.run_program

let max_processors t = t.max_processors
(** Get the maximum number of processors that a single job can use in the Machine.t (i.e. usually the “number-of-threads” parameters of most tools) *)
let quick_run_program t : Make_fun.t =
  (* Run function of the machine with the quick-run requirement
     prepended to whatever the caller asks for. *)
  let requirements = Make_fun.quick [] in
  Make_fun.with_requirements t.run_program requirements
(** Run a program that does not use much memory and runs on one core. *)
let run_stream_processor ?self_ids t : Make_fun.t =
  (* Stream-processor requirements, preceded by the self-identification
     tags when some are given. *)
  let requirements =
    Make_fun.with_self_ids ?self_ids (Make_fun.stream_processor []) in
  Make_fun.with_requirements t.run_program requirements
(** Run a program that does not use much memory, runs on one core, and needs the internet. *)
let run_download_program t : Make_fun.t =
  Make_fun.with_requirements t.run_program (Make_fun.downloading [])

(** Run a program that may need a lot of memory.

    The resulting run function asks for big memory and for [processors]
    processors (1 by default), preceded by the self-identification tags
    when [self_ids] is given. *)
let run_big_program t :
  ?processors: int -> ?self_ids : string list -> Make_fun.t =
  fun ?(processors = 1) ?self_ids ->
    Make_fun.with_requirements
      t.run_program
      (* Two separate requirements: the [;] between them is required for
         this to be a two-element list. *)
      (Make_fun.with_self_ids ?self_ids
         [`Memory `Big; `Processors processors])

(** Return the [work_dir] field of the machine. *)
let work_dir t = t.work_dir
end