: sig
open Common
(* Name of a reference genome, represented as a plain string (the
   predefined names are the values of [Specification.Default.Name]). *)
type name = string
module Specification : sig
(* Description of where a piece of reference data comes from: either a
   URL, or a tag wrapped around one or several other locations. *)
module Location : sig
(* Recursive, closed polymorphic variant. This module only builds such
   values; how each tag is acted upon is decided by the code consuming
   them, which is not part of this module. *)
type t = [
| `Url of string
(* The string paired with each location is presumably a file name for
   that part (the mm10 default uses names ending in .vcf) - TODO confirm
   against the consumer. *)
| `Vcf_concat of (string * t) list
| `Concat of t list
| `Gunzip of t
| `Bunzip2 of t
| `Untar of t
]
(* Smart constructors: each one wraps its argument in the tag of the same
   name. Their types are deliberately left polymorphic (any payload type,
   open variant result), so the results unify with [t] whenever the
   argument has the payload type that [t] expects. *)
val url : 'a -> [> `Url of 'a ]
val vcf_concat : 'a -> [> `Vcf_concat of 'a ]
val concat : 'a -> [> `Concat of 'a ]
val gunzip : 'a -> [> `Gunzip of 'a ]
val bunzip2 : 'a -> [> `Bunzip2 of 'a ]
val untar : 'a -> [> `Untar of 'a ]
end
(* Declarative description of a reference genome: identification fields
   plus the [Location.t] of each associated data file. The type is
   [private]: its fields can be read from outside this module, but values
   can only be built through [create]. *)
type t = private {
name : name;
(* Integer version number; the predefined genomes use values such as 54,
   75 or 87 - presumably the Ensembl release, TODO confirm. *)
ensembl : int;
species : string;
(* Free-form text; [None] when not provided. *)
metadata : string option;
(* The only mandatory location. *)
fasta : Location.t;
(* Optional locations: [None] means that the genome has no such file
   attached. *)
dbsnp : Location.t option;
known_indels : Location.t option;
cosmic : Location.t option;
exome_gtf : Location.t option;
cdna : Location.t option;
whess : Location.t option;
(* Optional list of contig names, kept as plain strings. *)
major_contigs : string list option;
(* Optional name; presumably the snpEff database identifier - TODO
   confirm against the snpEff tool wrapper. *)
snpeff_name : string option;
}
(* [create ~fasta ~ensembl ~species name] builds a specification.
   [fasta], [ensembl] and [species] are mandatory; every optional
   argument that is left out is stored as [None] in the corresponding
   field. The final positional string is the genome name. *)
val create :
?metadata:string ->
fasta:Location.t ->
ensembl:int ->
species:string ->
?dbsnp:Location.t ->
?known_indels:Location.t ->
?cosmic:Location.t ->
?exome_gtf:Location.t ->
?cdna:Location.t ->
?whess:Location.t ->
?major_contigs:string list ->
?snpeff_name:string ->
string ->
t
(* Predefined specifications shipped with the library. *)
module Default :
sig
(* Names of the predefined genomes; each value of [Name] has a
   specification of the same identifier below. *)
module Name : sig
val b37 : name
val b37decoy : name
val b38 : name
val hg38: name
val hg18 : name
val hg19 : name
val mm10 : name
end
(* The predefined specifications themselves (species, version number,
   contigs and download locations are fixed in the implementation). *)
val b37 : t
val b37decoy : t
val b38 : t
val hg38 : t
val hg18 : t
val hg19 : t
val mm10 : t
end
end
(* A reference genome whose files are represented by workflow nodes: the
   specification it was built from, the mandatory [location] workflow
   (the one returned by [fasta]) and one optional workflow per
   additional file. The type is [private]: fields are readable from
   outside, values are built with [create]. *)
type t = private {
specification: Specification.t;
location : KEDSL.file_workflow;
(* [None] for each of the following means that no such file is attached
   to this genome. *)
cosmic : KEDSL.file_workflow option;
dbsnp : KEDSL.file_workflow option;
known_indels : KEDSL.file_workflow option;
gtf : KEDSL.file_workflow option;
cdna : KEDSL.file_workflow option;
whess : KEDSL.file_workflow option;
}
(* [create spec location] builds a genome from its specification and its
   mandatory [location] workflow. Each optional workflow that is left out
   is stored as [None]. No consistency check between the specification
   and the workflows is performed here. *)
val create :
?cosmic:KEDSL.file_workflow ->
?dbsnp:KEDSL.file_workflow ->
?known_indels:KEDSL.file_workflow ->
?gtf:KEDSL.file_workflow ->
?cdna:KEDSL.file_workflow ->
?whess:KEDSL.file_workflow ->
Specification.t -> KEDSL.file_workflow -> t
(* Identification fields, read from the underlying specification. *)
val name : t -> name
val ensembl : t -> int
val species : t -> string
(* Path of the product of the [location] workflow. *)
val path : t -> string
(* Path of the product of the corresponding optional workflow. These
   fail, through [Option.value_exn] with a message naming the function
   and the genome, when the workflow is [None]. *)
val cosmic_path_exn : t -> string
val dbsnp_path_exn : t -> string
val known_indels_path_exn : t -> string
val gtf_path_exn : t -> string
val cdna_path_exn : t -> string
val whess_path_exn : t -> string
(* The [snpeff_name] of the specification; fails through
   [Option.value_exn] when it is [None]. *)
val snpeff_name_exn: t -> string
(* The major contigs of the specification, each wrapped in a
   [`Chromosome] region; fails through [failwithf] when the specification
   defines none. *)
val major_contigs : t -> Region.t list
(* The [location] workflow itself. *)
val fasta: t -> KEDSL.file_workflow
(* The corresponding optional workflow; these fail through
   [Option.value_exn] when it is [None]. *)
val cosmic_exn: t -> KEDSL.file_workflow
val dbsnp_exn: t -> KEDSL.file_workflow
val known_indels_exn: t -> KEDSL.file_workflow
val gtf_exn: t -> KEDSL.file_workflow
(* Non-failing variant: the [gtf] field as it is stored. *)
val gtf: t -> KEDSL.file_workflow option
(* Same failing behaviour as the other [_exn] workflow accessors. *)
val cdna_exn: t -> KEDSL.file_workflow
val whess_exn: t -> KEDSL.file_workflow
end
= struct
open Common
(* Name of a reference genome, represented as a plain string. *)
type name = string
module Specification = struct
module Location = struct
  (* Recursive description of where a file comes from: a URL, or a tag
     wrapped around one or several other locations. Nothing is downloaded
     or decompressed here; this module only builds the description. *)
  type t = [
    | `Url of string
    | `Vcf_concat of (string * t) list
    | `Concat of t list
    | `Gunzip of t
    | `Bunzip2 of t
    | `Untar of t
  ]

  (* Smart constructors: each wraps its argument in the tag of the same
     name and leaves the resulting variant type open. *)
  let url address = `Url address
  let vcf_concat named_parts = `Vcf_concat named_parts
  let concat parts = `Concat parts
  let gunzip compressed = `Gunzip compressed
  let bunzip2 compressed = `Bunzip2 compressed
  let untar archive = `Untar archive
end
(* Declarative description of a reference genome: identification fields
   plus the [Location.t] of each associated data file. Values are built
   with [create] just below. *)
type t = {
name: string;
(* Integer version number; the predefined genomes use values such as 54,
   75 or 87 - presumably the Ensembl release, TODO confirm. *)
ensembl: int;
species: string;
(* Free-form text; [None] when not provided. *)
metadata: string option;
(* The only mandatory location. *)
fasta: Location.t;
(* Optional locations: [None] means that no such file is attached. *)
dbsnp: Location.t option;
known_indels: Location.t option;
cosmic: Location.t option;
exome_gtf: Location.t option;
cdna: Location.t option;
whess: Location.t option;
(* Optional list of contig names, kept as plain strings. *)
major_contigs: string list option;
(* Optional name; presumably the snpEff database identifier - TODO
   confirm. *)
snpeff_name: string option;
}
(* Build a specification from its parts. [fasta], [ensembl], [species]
   and the final positional name are mandatory; every optional argument
   that is left out ends up as [None] in the record. *)
let create
    ?metadata ~fasta ~ensembl ~species
    ?dbsnp ?known_indels ?cosmic ?exome_gtf ?cdna ?whess
    ?major_contigs ?snpeff_name
    name =
  {
    (* identification *)
    name; species; ensembl; metadata;
    (* contigs and tool-specific naming *)
    major_contigs; snpeff_name;
    (* locations of the data files *)
    fasta; dbsnp; known_indels; cosmic;
    exome_gtf; cdna; whess;
  }
module Default = struct
(* Contig names 1 to 22 followed by X, Y and MT, without any prefix. *)
let major_contigs_b37 =
  let autosomes = List.init 22 (fun index -> sprintf "%d" (index + 1)) in
  autosomes @ ["X"; "Y"; "MT"]

(* Contig names chr1 to chr22 followed by chrX, chrY and chrM. *)
let major_contigs_hg_family =
  let autosomes = List.init 22 (fun index -> sprintf "chr%d" (index + 1)) in
  autosomes @ ["chrX"; "chrY"; "chrM"]

(* Contig names 1 to 19 followed by X and Y, without any prefix. *)
let major_contigs_mm10 =
  let autosomes = List.init 19 (fun index -> sprintf "%d" (index + 1)) in
  autosomes @ ["X"; "Y"]
(* Names of the predefined genomes, listed alphabetically. Each one is
   used as the name of the specification with the same identifier. *)
module Name = struct
  let b37      = "b37"
  let b37decoy = "b37decoy"
  let b38      = "b38"
  let hg18     = "hg18"
  let hg19     = "hg19"
  let hg38     = "hg38"
  let mm10     = "mm10"
end
(* Species labels used by the predefined specifications. *)
let human = "homo sapiens"
let mouse = "mus musculus"

(* Download addresses shared by several of the predefined
   specifications. Variant collections first. *)
let b37_dbsnp_url =
  "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/dbsnp_138.b37.vcf.gz"
let b37_known_indels_url =
  "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/Mills_and_1000G_gold_standard.indels.b37.vcf.gz"
let b37_cosmic_url =
  "http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutect/b37_cosmic_v54_120711.vcf"

(* Then the annotation files and the WHESS database. *)
let b37_exome_gtf_url =
  "http://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz"
let b37_cdna_url =
  "http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
let b37_whess_url =
  "ftp://genetics.bwh.harvard.edu/pph2/whess/polyphen-2.2.2-whess-2011_12.sqlite.bz2"
(* Predefined specification named [Name.b37]: human, with every optional
   location filled in from the shared b37 addresses. *)
let b37 =
  let fasta_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/b37/human_g1k_v37.fasta.gz" in
  (* A URL whose content is gzip-compressed. *)
  let gunzipped address = Location.gunzip (Location.url address) in
  create
    ~metadata:"Provided by the Biokepi library"
    ~species:human
    ~ensembl:75
    ~snpeff_name:"GRCh37.75"
    ~major_contigs:major_contigs_b37
    ~fasta:(gunzipped fasta_url)
    ~dbsnp:(gunzipped b37_dbsnp_url)
    ~known_indels:(gunzipped b37_known_indels_url)
    (* the COSMIC file is referenced as is, without decompression *)
    ~cosmic:(Location.url b37_cosmic_url)
    ~exome_gtf:(gunzipped b37_exome_gtf_url)
    ~cdna:(gunzipped b37_cdna_url)
    ~whess:(Location.bunzip2 (Location.url b37_whess_url))
    Name.b37
(* Predefined specification named [Name.b37decoy]: same settings and
   auxiliary files as [b37], with a different FASTA address. *)
let b37decoy =
  let fasta_url =
    "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz" in
  (* A URL whose content is gzip-compressed. *)
  let gunzipped address = Location.gunzip (Location.url address) in
  create
    ~metadata:"Provided by the Biokepi library"
    ~species:human
    ~ensembl:75
    ~snpeff_name:"GRCh37.75"
    ~major_contigs:major_contigs_b37
    ~fasta:(gunzipped fasta_url)
    ~dbsnp:(gunzipped b37_dbsnp_url)
    ~known_indels:(gunzipped b37_known_indels_url)
    (* the COSMIC file is referenced as is, without decompression *)
    ~cosmic:(Location.url b37_cosmic_url)
    ~exome_gtf:(gunzipped b37_exome_gtf_url)
    ~cdna:(gunzipped b37_cdna_url)
    ~whess:(Location.bunzip2 (Location.url b37_whess_url))
    Name.b37decoy
(* Predefined specification named [Name.hg38]: human, chr-prefixed
   contigs; only FASTA, dbSNP and known-indels locations are set. *)
let hg38 =
  let fasta_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.fasta.gz" in
  let dbsnp_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.dbsnp.vcf.gz" in
  let known_indels_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/hg38bundle/Homo_sapiens_assembly38.known_indels.vcf.gz" in
  (* A URL whose content is gzip-compressed. *)
  let gunzipped address = Location.gunzip (Location.url address) in
  create
    ~metadata:"Provided by the Biokepi library"
    ~species:human
    ~ensembl:87
    ~snpeff_name:"GRCh38.86"
    ~major_contigs:major_contigs_hg_family
    ~fasta:(gunzipped fasta_url)
    ~dbsnp:(gunzipped dbsnp_url)
    ~known_indels:(gunzipped known_indels_url)
    Name.hg38
(* Predefined specification named [Name.b38]: human, unprefixed contig
   names (the same list as b37); FASTA, GTF, dbSNP and cDNA locations
   are set. *)
let b38 =
  let fasta_url =
    "http://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" in
  let gtf_url =
    "http://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.gtf.gz" in
  let cdna_url =
    "http://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" in
  let common_dbsnp_url =
    "http://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh38p7/VCF/common_all_20170710.vcf.gz" in
  (* A URL whose content is gzip-compressed. *)
  let gunzipped address = Location.gunzip (Location.url address) in
  create
    ~metadata:"Provided by the Biokepi library"
    ~species:human
    ~ensembl:87
    ~snpeff_name:"GRCh38.86"
    ~major_contigs:major_contigs_b37
    ~fasta:(gunzipped fasta_url)
    ~dbsnp:(gunzipped common_dbsnp_url)
    ~exome_gtf:(gunzipped gtf_url)
    ~cdna:(gunzipped cdna_url)
    Name.b38
(* Predefined specification named [Name.hg18]: human, chr-prefixed
   contigs; only the FASTA and dbSNP locations are set, and no snpEff
   name is given. *)
let hg18 =
  let fasta_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/Homo_sapiens_assembly18.fasta.gz" in
  let dbsnp_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg18/dbsnp_138.hg18.vcf.gz" in
  (* A URL whose content is gzip-compressed. *)
  let gunzipped address = Location.gunzip (Location.url address) in
  create
    ~metadata:"Provided by the Biokepi library"
    ~species:human
    ~ensembl:54
    ~major_contigs:major_contigs_hg_family
    ~fasta:(gunzipped fasta_url)
    ~dbsnp:(gunzipped dbsnp_url)
    Name.hg18
(* Predefined specification named [Name.hg19]: human, chr-prefixed
   contigs; FASTA, dbSNP, known-indels and WHESS locations are set. The
   WHESS address is the one shared with the b37 specifications. *)
let hg19 =
  let fasta_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/ucsc.hg19.fasta.gz" in
  let dbsnp_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/dbsnp_138.hg19.vcf.gz" in
  let known_indels_url =
    "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz" in
  (* A URL whose content is gzip-compressed. *)
  let gunzipped address = Location.gunzip (Location.url address) in
  create
    ~metadata:"Provided by the Biokepi library"
    ~species:human
    ~ensembl:75
    ~snpeff_name:"hg19"
    ~major_contigs:major_contigs_hg_family
    ~fasta:(gunzipped fasta_url)
    ~dbsnp:(gunzipped dbsnp_url)
    ~known_indels:(gunzipped known_indels_url)
    ~whess:(Location.bunzip2 (Location.url b37_whess_url))
    Name.hg19
(* Predefined specification named [Name.mm10]: mouse; FASTA, dbSNP (two
   VCF files to be concatenated), GTF and cDNA locations are set. *)
let mm10 =
  (* NOTE(review): this address ends in .fa (no .gz suffix) yet it is
     wrapped in gunzip below - confirm that the remote file really is
     gzip-compressed. *)
  let fasta_url =
    "https://storage.googleapis.com/hammerlab-biokepi-data/raw_data/mm10.GRCm38.dna_sm.fa" in
  let snps_url =
    "ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.snps.rsIDdbSNPv137.vcf.gz" in
  let indels_url =
    "ftp://ftp-mouse.sanger.ac.uk/REL-1303-SNPs_Indels-GRCm38/mgp.v3.indels.rsIDdbSNPv137.vcf.gz" in
  let gtf_url =
    "ftp://ftp.ensembl.org/pub/release-84/gtf/mus_musculus/Mus_musculus.GRCm38.84.gtf.gz" in
  let cdna_url =
    "ftp://ftp.ensembl.org/pub/release-84/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" in
  (* A URL whose content is gzip-compressed. *)
  let gunzipped address = Location.gunzip (Location.url address) in
  (* Two separately downloaded VCF files, each paired with a string
     (presumably the local file name - TODO confirm). *)
  let dbsnp =
    Location.vcf_concat [
      ("db_snps.vcf", gunzipped snps_url);
      ("db_indels.vcf", gunzipped indels_url);
    ] in
  create
    ~metadata:"Provided by the Biokepi Library"
    ~species:mouse
    (* NOTE(review): 87 here, while the GTF and cDNA addresses above point
       at release-84 - confirm which one is intended. *)
    ~ensembl:87
    ~snpeff_name:"mm10"
    ~major_contigs:major_contigs_mm10
    ~fasta:(gunzipped fasta_url)
    ~dbsnp
    ~exome_gtf:(gunzipped gtf_url)
    ~cdna:(gunzipped cdna_url)
    Name.mm10
end
end
(* A reference genome whose files are represented by workflow nodes: the
   specification it comes from, the mandatory [location] workflow (the
   one returned by [fasta]) and one optional workflow per additional
   file. *)
type t = {
specification: Specification.t;
location: KEDSL.file_workflow;
(* [None] for each of the following means that no such file is attached
   to this genome. *)
cosmic: KEDSL.file_workflow option;
dbsnp: KEDSL.file_workflow option;
known_indels: KEDSL.file_workflow option;
gtf: KEDSL.file_workflow option;
cdna: KEDSL.file_workflow option;
whess: KEDSL.file_workflow option;
}
(* Assemble a genome from its specification, its mandatory [location]
   workflow and whichever optional workflows were supplied; the ones left
   out are stored as [None]. *)
let create ?cosmic ?dbsnp ?known_indels ?gtf ?cdna ?whess specification location =
  {
    specification;
    location;
    dbsnp;
    known_indels;
    cosmic;
    gtf;
    cdna;
    whess;
  }
(* Identification fields, read from the specification of the genome. *)
let name t =
  let { Specification.name; _ } = t.specification in
  name
let ensembl t =
  let { Specification.ensembl; _ } = t.specification in
  ensembl
let species t =
  let { Specification.species; _ } = t.specification in
  species
(* The [snpeff_name] of the specification. When it is [None], fails
   through [Option.value_exn] with a message starting with the genome
   name. *)
let snpeff_name_exn t =
  let msg = sprintf "%s: no snpEff name" (name t) in
  Option.value_exn ~msg t.specification.Specification.snpeff_name
(* Path of the product of the mandatory [location] workflow. *)
let path t =
  let fasta_workflow = t.location in
  fasta_workflow#product#path
(* Path of the product of the [cosmic] workflow; fails through
   [Option.value_exn] when the genome has none. *)
let cosmic_path_exn t =
  let workflow =
    Option.value_exn ~msg:(sprintf "cosmic_path_exn of %s" (name t)) t.cosmic
  in
  workflow#product#path
(* Path of the product of the [dbsnp] workflow; fails through
   [Option.value_exn] when the genome has none. *)
let dbsnp_path_exn t =
  let workflow =
    Option.value_exn ~msg:(sprintf "dbsnp_path_exn of %s" (name t)) t.dbsnp
  in
  workflow#product#path
(* Path of the product of the [known_indels] workflow; fails through
   [Option.value_exn] when the genome has none. *)
let known_indels_path_exn t =
  let workflow =
    Option.value_exn
      ~msg:(sprintf "known_indels_path_exn of %s" (name t))
      t.known_indels
  in
  workflow#product#path
(* Path of the product of the [gtf] workflow; fails through
   [Option.value_exn] when the genome has none. *)
let gtf_path_exn t =
  let workflow =
    Option.value_exn ~msg:(sprintf "gtf_path_exn of %s" (name t)) t.gtf
  in
  workflow#product#path
(* Path of the product of the [cdna] workflow; fails through
   [Option.value_exn] when the genome has none. *)
let cdna_path_exn t =
  let workflow =
    Option.value_exn ~msg:(sprintf "cdna_path_exn of %s" (name t)) t.cdna
  in
  workflow#product#path
(* Path of the product of the [whess] workflow; fails through
   [Option.value_exn] when the genome has none. *)
let whess_path_exn t =
  let workflow =
    Option.value_exn ~msg:(sprintf "whess_path_exn of %s" (name t)) t.whess
  in
  workflow#product#path
(* The mandatory [location] workflow of the genome. *)
let fasta (genome : t) : KEDSL.file_workflow = genome.location
(* The [cosmic] workflow; fails through [Option.value_exn] when the
   genome has none. *)
let cosmic_exn t =
  let msg = sprintf "%s: no COSMIC" (name t) in
  Option.value_exn ~msg t.cosmic
(* The [dbsnp] workflow; fails through [Option.value_exn] when the
   genome has none. *)
let dbsnp_exn t =
  let msg = sprintf "%s: no DBSNP" (name t) in
  Option.value_exn ~msg t.dbsnp
(* The [known_indels] workflow; fails through [Option.value_exn] when
   the genome has none. *)
let known_indels_exn t =
  let msg = sprintf "%s: no Known Indels" (name t) in
  Option.value_exn ~msg t.known_indels
(* The [gtf] workflow; fails through [Option.value_exn] when the genome
   has none. *)
let gtf_exn t =
  let msg = sprintf "%s: no GTF" (name t) in
  Option.value_exn ~msg t.gtf
(* Non-failing accessor: the [gtf] field exactly as it is stored. *)
let gtf genome = genome.gtf
(* The [cdna] workflow; fails through [Option.value_exn] when the genome
   has none. *)
let cdna_exn t =
  let msg = sprintf "%s: no cDNA fasta file" (name t) in
  Option.value_exn ~msg t.cdna
(* The [whess] workflow; fails through [Option.value_exn] when the
   genome has none. *)
let whess_exn t =
  let msg = sprintf "%s: no WHESS file" (name t) in
  Option.value_exn ~msg t.whess
(* The major contigs listed in the specification, each wrapped in a
   [`Chromosome] region. Fails through [failwithf] when the specification
   does not define any (its [major_contigs] field is [None]). *)
let major_contigs t : Region.t list =
  match t.specification.Specification.major_contigs with
  | Some contig_names ->
    List.map contig_names ~f:(fun s -> `Chromosome s)
  | None ->
    (* The message used to read "does have", the opposite of the
       condition that is reported on this branch. *)
    failwithf "Reference %S does not have major-contigs/chromosomes defined" (name t)
end