From aba8787edb6c62a51191c0ff8619b4835f90bf31 Mon Sep 17 00:00:00 2001 From: chapmanb Date: Thu, 16 Jul 2015 07:38:10 -0400 Subject: [PATCH] v0.2.5: support for GRCh38 and better variant sorting - Do not remove reference calls or do any chromosome renaming when sorting by position with `variant-utils sort-vcf`. - Add support for GRCh38 by avoiding issues with large numbers of contigs. - Add ability to only fix sample name in input VCF without doing a full prep. --- HISTORY.md | 7 +++++ README.md | 2 +- project.clj | 2 +- src/bcbio/variation/combine.clj | 16 ++++++----- src/bcbio/variation/normalize.clj | 44 ++++++++++++++++++------------ src/bcbio/variation/utils/core.clj | 4 ++- test/bcbio/variation/test/api.clj | 2 +- 7 files changed, 49 insertions(+), 28 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 34e8be7..922396a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,10 @@ +## 0.2.5 (16 July 2015) + +- Do not remove reference calls or do any chromosome renaming when sorting + by position with `variant-utils sort-vcf`. +- Add support for GRCh38 by avoiding issues with large numbers of contigs. +- Add ability to only fix sample name in input VCF without doing a full prep. + ## 0.2.4 (25 March 2015) - De-prioritize multi-allele calls where PL and AD values do not match a final, diff --git a/README.md b/README.md index a195494..f350ac2 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ associated with different variant representations. ### Download -The latest release is 0.2.4 (25 March 2015): [bcbio.variation-0.2.4-standalone.jar][dl]. +The latest release is 0.2.5 (16 July 2015): [bcbio.variation-0.2.5-standalone.jar][dl]. Run from the command line: $ java -jar bcbio.variation-VERSION-standalone.jar [arguments] diff --git a/project.clj b/project.clj index 9ceb010..148f63a 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject bcbio.variation "0.2.4" +(defproject bcbio.variation "0.2.5" :description "Toolkit to analyze genomic variation data, built on the GATK with Clojure" :license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"} :dependencies [[org.clojure/clojure "1.5.1"] diff --git a/src/bcbio/variation/combine.clj b/src/bcbio/variation/combine.clj index a98390f..89f6d9f 100644 --- a/src/bcbio/variation/combine.clj +++ b/src/bcbio/variation/combine.clj @@ -13,7 +13,7 @@ [bcbio.variation.filter.intervals :only [vcf-sample-name select-by-sample]] [bcbio.variation.haploid :only [diploid-calls-to-haploid]] [bcbio.variation.multisample :only [get-out-basename multiple-samples?]] - [bcbio.variation.normalize :only [prep-vcf clean-problem-vcf]] + [bcbio.variation.normalize :only [prep-vcf clean-problem-vcf fix-vcf-sample]] [bcbio.variation.phasing :only [is-haploid?]] [bcbio.variation.structural :only [write-non-svs]] [bcbio.variation.variantcontext :only [get-vcf-header write-vcf-w-template @@ -143,12 +143,14 @@ (let [sample-file (if (and (multiple-samples? in-file) (:sample exp)) (run-sample-select in-file (get call :ref (:ref exp)) "") in-file) - prep-file (if (and (true? (:prep call)) - (not= (:ref exp) (:ref call))) - (prep-vcf sample-file (:ref exp) (:sample exp) :out-dir out-dir - :out-fname out-fname :orig-ref-file (:ref call) - :config call) - sample-file) + prep-file (cond (and (true? (:prep call)) + (not= (:ref exp) (:ref call))) + (prep-vcf sample-file (:ref exp) (:sample exp) :out-dir out-dir + :out-fname out-fname :orig-ref-file (:ref call) + :config call) + (true? (:fix-sample-header call)) (fix-vcf-sample sample-file (:sample exp) (:ref exp) + :out-dir out-dir :ext ".vcf") + :else sample-file) hap-file (if (true? (:make-haploid call)) (diploid-calls-to-haploid prep-file (:ref exp) :out-dir out-dir) prep-file) diff --git a/src/bcbio/variation/normalize.clj b/src/bcbio/variation/normalize.clj index 14d4d89..3acf44b 100644 --- a/src/bcbio/variation/normalize.clj +++ b/src/bcbio/variation/normalize.clj @@ -128,6 +128,10 @@ (assoc coll x x)))) rename-map vcf-chrs))) +(defmethod chr-name-remap :default + [map-key ref-file orig-ref-file] + {}) + ;; ## Resort and normalize variants (defn- fix-vc @@ -328,16 +332,17 @@ (defn fix-vcf-sample "Update a VCF file with one item to have the given sample name." - [in-file sample ref] - (let [out-file (fsp/add-file-part in-file "samplefix")] + [in-file sample ref & {:keys [out-dir ext]}] + (let [out-file (fsp/add-file-part in-file "samplefix" out-dir ext)] (when (itx/needs-run? out-file) (with-open [vcf-iter (gvc/get-vcf-iterator in-file ref)] (gvc/write-vcf-w-template in-file {:out out-file} (map #(:vc (fix-vc sample {} %)) (gvc/parse-vcf vcf-iter)) - ref :header-update-fn (update-header sample {})))) + ref :header-update-fn (update-header sample {:fix-sample-header true})))) out-file)) (defn- write-prepped-vcf + "Write VCF file with correctly ordered and cleaned variants." [vcf-file out-info ref-file orig-ref-file sample config] (itx/with-temp-dir [tmp-dir (fs/parent (:out out-info))] @@ -395,18 +400,21 @@ (defn- ref-base-getter "Given a VCF line, retrieve the reference base. Second optional function allows you adjust which base is retrieved to get previous - bases for indels missing reference padding." + bases for indels missing reference padding. + Skips getting a reference getter for large numbers of contigs (GRCh38) + which are crazy slow." [ref-file] - (let [chr-map (prep-rename-map :GRCh37 ref-file) - get-ref-chrom (fn [chrom] - (get chr-map chrom chrom))] - (fn [xs adjust-fn extra-bases] - (let [i (adjust-fn (Integer/parseInt (second xs)))] - (string/upper-case - (str - (or (extract-sequence ref-file (get-ref-chrom (first xs)) i (+ i extra-bases)) - (extract-sequence ref-file (first xs) i (+ i extra-bases)) - "N"))))))) + (when (< (count (get-seq-name-map ref-file)) 1000) + (let [chr-map (prep-rename-map :GRCh37 ref-file) + get-ref-chrom (fn [chrom] + (get chr-map chrom chrom))] + (fn [xs adjust-fn extra-bases] + (let [i (adjust-fn (Integer/parseInt (second xs)))] + (string/upper-case + (str + (or (extract-sequence ref-file (get-ref-chrom (first xs)) i (+ i extra-bases)) + (extract-sequence ref-file (first xs) i (+ i extra-bases)) + "N")))))))) (defn- maybe-add-indel-pad-base "Check reference and alt alleles for lack of a padding base on indels. @@ -432,9 +440,9 @@ (assoc 4 (string/join "," (map #(str base (subs % 1)) (:alts a))))))) (no-pad? [a] - (some #(and (not (is-alt-sv? %)) - (not= (first (:ref a)) (first %))) - (:alts a))) + (every? #(and (not (is-alt-sv? %)) + (not= (first (:ref a)) (first %))) + (:alts a))) (fix-nopad [xs a] (let [base (ref-base-get xs dec 0)] (-> xs @@ -445,6 +453,7 @@ (if (empty? xs) [] (let [alleles (get-ref-alts xs)] (cond + (nil? ref-base-get) xs (and (indel? alleles) (is-5pad-n? alleles)) (fix-5pad-n xs alleles) (and (indel? alleles) (no-pad? alleles)) (fix-nopad xs alleles) :else xs))))) @@ -461,6 +470,7 @@ (every? check-bases (string/upper-case real-ref)) (not= (string/upper-case vc-ref) (string/upper-case real-ref)))))] (cond + (nil? ref-base-get) xs (empty? xs) [] (is-bad-ref? xs) [] :else xs))) diff --git a/src/bcbio/variation/utils/core.clj b/src/bcbio/variation/utils/core.clj index 83d01bc..e7b8488 100644 --- a/src/bcbio/variation/utils/core.clj +++ b/src/bcbio/variation/utils/core.clj @@ -17,7 +17,9 @@ (cli args ["-s" "--sortpos" "Sort by position" :flag true])] (normalize/prep-vcf vcf-file ref-file nil - :config {:prep-sort-pos (:sortpos options)}))) + :config {:prep-sort-pos (:sortpos options) + :remove-refcalls false + :prep-org nil}))) (def ^{:private true} progs {:callsummary callsummary/annotate-with-callsummary diff --git a/test/bcbio/variation/test/api.clj b/test/bcbio/variation/test/api.clj index 2476165..e872e71 100644 --- a/test/bcbio/variation/test/api.clj +++ b/test/bcbio/variation/test/api.clj @@ -62,7 +62,7 @@ (facts "Index and retrieve metrics using Gemini." (set-config-from-file! web-yaml) - (let [raw-out {"in_public" #{"dbSNP"}, "zygosity" #{"homozygous"}, + (let [raw-out {"zygosity" #{"homozygous"}, "type" #{"transition" "snp"}, :id ["MT" 73 "G"]} raw-ids [:id]] (when-let [idx (gemini/index-variant-file vcf1 ref)]