Skip to content

Commit

Permalink
v0.2.5: support for GRCh38 and better variant sorting
Browse files Browse the repository at this point in the history
- Do not remove reference calls or do any chromosome renaming when sorting
  by position with `variant-utils sort-vcf`.
- Add support for GRCh38 by avoiding issues with large numbers of contigs.
- Add ability to only fix sample name in input VCF without doing a full prep.
  • Loading branch information
chapmanb committed Jul 16, 2015
1 parent 6ac30b4 commit aba8787
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 28 deletions.
7 changes: 7 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## 0.2.5 (16 July 2015)

- Do not remove reference calls or do any chromosome renaming when sorting
by position with `variant-utils sort-vcf`.
- Add support for GRCh38 by avoiding issues with large numbers of contigs.
- Add ability to only fix sample name in input VCF without doing a full prep.

## 0.2.4 (25 March 2015)

- De-prioritize multi-allele calls where PL and AD values do not match a final,
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ associated with different variant representations.

### Download

The latest release is 0.2.4 (25 March 2015): [bcbio.variation-0.2.4-standalone.jar][dl].
The latest release is 0.2.5 (16 July 2015): [bcbio.variation-0.2.5-standalone.jar][dl].
Run from the command line:

$ java -jar bcbio.variation-VERSION-standalone.jar [arguments]
Expand Down
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject bcbio.variation "0.2.4"
(defproject bcbio.variation "0.2.5"
:description "Toolkit to analyze genomic variation data, built on the GATK with Clojure"
:license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"}
:dependencies [[org.clojure/clojure "1.5.1"]
Expand Down
16 changes: 9 additions & 7 deletions src/bcbio/variation/combine.clj
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
[bcbio.variation.filter.intervals :only [vcf-sample-name select-by-sample]]
[bcbio.variation.haploid :only [diploid-calls-to-haploid]]
[bcbio.variation.multisample :only [get-out-basename multiple-samples?]]
[bcbio.variation.normalize :only [prep-vcf clean-problem-vcf]]
[bcbio.variation.normalize :only [prep-vcf clean-problem-vcf fix-vcf-sample]]
[bcbio.variation.phasing :only [is-haploid?]]
[bcbio.variation.structural :only [write-non-svs]]
[bcbio.variation.variantcontext :only [get-vcf-header write-vcf-w-template
Expand Down Expand Up @@ -143,12 +143,14 @@
(let [sample-file (if (and (multiple-samples? in-file) (:sample exp))
(run-sample-select in-file (get call :ref (:ref exp)) "")
in-file)
prep-file (if (and (true? (:prep call))
(not= (:ref exp) (:ref call)))
(prep-vcf sample-file (:ref exp) (:sample exp) :out-dir out-dir
:out-fname out-fname :orig-ref-file (:ref call)
:config call)
sample-file)
prep-file (cond (and (true? (:prep call))
(not= (:ref exp) (:ref call)))
(prep-vcf sample-file (:ref exp) (:sample exp) :out-dir out-dir
:out-fname out-fname :orig-ref-file (:ref call)
:config call)
(true? (:fix-sample-header call)) (fix-vcf-sample sample-file (:sample exp) (:ref exp)
:out-dir out-dir :ext ".vcf")
:else sample-file)
hap-file (if (true? (:make-haploid call))
(diploid-calls-to-haploid prep-file (:ref exp) :out-dir out-dir)
prep-file)
Expand Down
44 changes: 27 additions & 17 deletions src/bcbio/variation/normalize.clj
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@
(assoc coll x x))))
rename-map vcf-chrs)))

(defmethod chr-name-remap :default
[map-key ref-file orig-ref-file]
{})

;; ## Resort and normalize variants

(defn- fix-vc
Expand Down Expand Up @@ -328,16 +332,17 @@

(defn fix-vcf-sample
"Update a VCF file with one item to have the given sample name."
[in-file sample ref]
(let [out-file (fsp/add-file-part in-file "samplefix")]
[in-file sample ref & {:keys [out-dir ext]}]
(let [out-file (fsp/add-file-part in-file "samplefix" out-dir ext)]
(when (itx/needs-run? out-file)
(with-open [vcf-iter (gvc/get-vcf-iterator in-file ref)]
(gvc/write-vcf-w-template in-file {:out out-file}
(map #(:vc (fix-vc sample {} %)) (gvc/parse-vcf vcf-iter))
ref :header-update-fn (update-header sample {}))))
ref :header-update-fn (update-header sample {:fix-sample-header true}))))
out-file))

(defn- write-prepped-vcf

"Write VCF file with correctly ordered and cleaned variants."
[vcf-file out-info ref-file orig-ref-file sample config]
(itx/with-temp-dir [tmp-dir (fs/parent (:out out-info))]
Expand Down Expand Up @@ -395,18 +400,21 @@
(defn- ref-base-getter
"Given a VCF line, retrieve the reference base. Second optional
function allows you adjust which base is retrieved to get previous
bases for indels missing reference padding."
bases for indels missing reference padding.
Skips getting a reference getter for large numbers of contigs (GRCh38)
which are crazy slow."
[ref-file]
(let [chr-map (prep-rename-map :GRCh37 ref-file)
get-ref-chrom (fn [chrom]
(get chr-map chrom chrom))]
(fn [xs adjust-fn extra-bases]
(let [i (adjust-fn (Integer/parseInt (second xs)))]
(string/upper-case
(str
(or (extract-sequence ref-file (get-ref-chrom (first xs)) i (+ i extra-bases))
(extract-sequence ref-file (first xs) i (+ i extra-bases))
"N")))))))
(when (< (count (get-seq-name-map ref-file)) 1000)
(let [chr-map (prep-rename-map :GRCh37 ref-file)
get-ref-chrom (fn [chrom]
(get chr-map chrom chrom))]
(fn [xs adjust-fn extra-bases]
(let [i (adjust-fn (Integer/parseInt (second xs)))]
(string/upper-case
(str
(or (extract-sequence ref-file (get-ref-chrom (first xs)) i (+ i extra-bases))
(extract-sequence ref-file (first xs) i (+ i extra-bases))
"N"))))))))

(defn- maybe-add-indel-pad-base
"Check reference and alt alleles for lack of a padding base on indels.
Expand All @@ -432,9 +440,9 @@
(assoc 4 (string/join ","
(map #(str base (subs % 1)) (:alts a)))))))
(no-pad? [a]
(some #(and (not (is-alt-sv? %))
(not= (first (:ref a)) (first %)))
(:alts a)))
(every? #(and (not (is-alt-sv? %))
(not= (first (:ref a)) (first %)))
(:alts a)))
(fix-nopad [xs a]
(let [base (ref-base-get xs dec 0)]
(-> xs
Expand All @@ -445,6 +453,7 @@
(if (empty? xs) []
(let [alleles (get-ref-alts xs)]
(cond
(nil? ref-base-get) xs
(and (indel? alleles) (is-5pad-n? alleles)) (fix-5pad-n xs alleles)
(and (indel? alleles) (no-pad? alleles)) (fix-nopad xs alleles)
:else xs)))))
Expand All @@ -461,6 +470,7 @@
(every? check-bases (string/upper-case real-ref))
(not= (string/upper-case vc-ref) (string/upper-case real-ref)))))]
(cond
(nil? ref-base-get) xs
(empty? xs) []
(is-bad-ref? xs) []
:else xs)))
Expand Down
4 changes: 3 additions & 1 deletion src/bcbio/variation/utils/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
(cli args
["-s" "--sortpos" "Sort by position" :flag true])]
(normalize/prep-vcf vcf-file ref-file nil
:config {:prep-sort-pos (:sortpos options)})))
:config {:prep-sort-pos (:sortpos options)
:remove-refcalls false
:prep-org nil})))

(def ^{:private true} progs
{:callsummary callsummary/annotate-with-callsummary
Expand Down
2 changes: 1 addition & 1 deletion test/bcbio/variation/test/api.clj
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@

(facts "Index and retrieve metrics using Gemini."
(set-config-from-file! web-yaml)
(let [raw-out {"in_public" #{"dbSNP"}, "zygosity" #{"homozygous"},
(let [raw-out {"zygosity" #{"homozygous"},
"type" #{"transition" "snp"}, :id ["MT" 73 "G"]}
raw-ids [:id]]
(when-let [idx (gemini/index-variant-file vcf1 ref)]
Expand Down

0 comments on commit aba8787

Please sign in to comment.