Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bugs in mpileup. #27

Merged
merged 2 commits into from
Feb 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions src/cljam/cigar.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,45 @@
(for [[_ n op] (re-seq #"([0-9]*)([MIDNSHP=X])" s)]
[(Integer/parseInt n) (first op)]))

(defn simplify
"Merge contiguous same operations of parsed CIGAR."
[cigs]
(loop [[[^long l op :as x] & xs] cigs result (transient [])]
(if (and l op)
(let [[^long nl nop] (first xs)]
(if (= op nop)
(recur (cons [(+ l nl) op] (next xs)) result)
(recur xs (conj! result x))))
(persistent! result))))

(defn- concat! [v coll]
(reduce conj! v coll))

(defn- update-last! [coll f]
(let [c (dec (count coll))]
(if (neg? c)
coll
(let [[op x] (get coll c)]
(if (= :m op)
(assoc! coll c (f x))
coll)))))

(defn to-index*
"Convert CIGAR string to sequence of indices."
[^String s]
(let [cigs (simplify (remove (comp #{\P \H} second) (parse s)))]
(loop [[[^long l op] & xs] cigs r 0 s 0 idx (transient [])]
(if (and l op)
(condp get op
#{\M \= \X} (recur xs (+ l r) (+ l s) (concat! idx (map (fn [x] [:m x]) (range s (+ l s)))))
#{\D} (recur xs (+ r l) s (concat! (update-last! idx (fn [x] [:d x (range r (+ l r))])) (repeat l [:m \*])))
#{\N} (recur xs (+ r l) s (concat! idx (repeat l [:m \>])))
#{\S} (recur xs r (+ s l) idx)
#{\I} (recur xs r (+ s l) (update-last! idx (fn [x] [:i x (range s (+ l s))]))))
(persistent! idx)))))

(def to-index (memoize to-index*))

(defn count-op
"Returns length of CIGAR operations."
[^String s]
Expand Down
2 changes: 1 addition & 1 deletion src/cljam/cli.clj
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@
:seq (cstr/join (% line))
(% line)) [:rname :pos :ref :count :seq :qual]))))))
([rdr rname start end]
(doseq [line (plp/mpileup rdr rname start end)]
(doseq [line (plp/mpileup nil rdr rname start end)]
(if-not (zero? (:count line))
(println (cstr/join \tab (map #(case %
:qual (cstr/join (% line))
Expand Down
109 changes: 57 additions & 52 deletions src/cljam/pileup/mpileup.clj
Original file line number Diff line number Diff line change
Expand Up @@ -7,72 +7,77 @@
[cljam.sequence :as cseq]
[cljam.io :as io]
[cljam.fasta :as fa]
[cljam.cigar :as cig]
[cljam.pileup.common :refer [window-width step center]]
[cljam.pileup.pileup :refer [rpositions]]))

(defn- append-seq
[op target current]
(case op
\M (apply conj current (map str (:seq target)))
\I (if (seq current)
(update-in current [(dec (count current))]
vector
(str "+" (:n target) (apply str (:seq target))))
current)
\D (apply conj current (map str (:seq target)))
\N (apply conj current (map str (:seq target)))
current))
(defn to-mpileup
"Stringify mpileup sequence."
[x]
(if (vector? x)
(let [[y op xs] x] (apply str y op (count xs) xs))
(str x)))

(defn encode-seq [seq*]
(loop [[f & r] (filter #(nil? (#{\P} (:op %))) seq*)
ret []
op nil
tmp {:n 0, :op nil, :seq nil}]
(if (nil? f)
(append-seq op tmp ret)
(if (nil? op)
(recur r ret (:op f) f)
(if (= (:op f) op)
(recur r ret (:op f) (-> tmp
(update-in [:n] + (:n f))
(assoc :op (:op f))
(update-in [:seq] (partial apply conj) (:seq f))))
(let [new-ret (append-seq op tmp ret)]
(recur r new-ret (:op f) f)))))))
(defn substitute-seq
"Substitute sequence with mpileup index."
[[^Character r & refs] ^String s [op x xs]]
(letfn [(get-char [i] (if (number? i) (let [c (.charAt s i)] (if (= c r) \. c)) i))]
(case op
:m (get-char x)
:d [(get-char x) \- (take (count xs) refs)]
:i [(get-char x) \+ (map #(.charAt s %) xs)]
x)))

(defn substitute-qual
"Substitute base quality with mpileup index."
[^String q [op x]]
(if (and (number? x) (not= q "*")) (.charAt q x) \~))

(defn pileup-seq
"Returns a lazy sequence that each element contains reads piled-up at the locus."
[^long start ^long end reads]
(letfn [(cover-locus? [pos aln]
(<= (:pos aln) pos (sam-util/get-end aln)))
(step [[pos buf alns]]
(let [[i o] (split-with (partial cover-locus? (inc pos)) alns)]
[(inc pos) (concat (filter (partial cover-locus? (inc pos)) buf) i) o]))]
(map second (rest (iterate step [(dec start) [] (remove (comp empty? :cigar) reads)])))))
(<= (:pos aln) pos (:end aln)))
(step [pos buf alns]
(lazy-seq
(when (< pos end)
(let [[i o] (split-with (partial cover-locus? (inc pos)) alns)
b (doall (concat (filter #(<= (inc pos) (:end %)) buf) i))]
(cons b (step (inc pos) b o))))))]
(->> reads
(sequence
(comp
(remove #(and (empty? (:cigar %)) (nil? (:cigar-bytes (:meta %)))))
(map #(assoc % :end (sam-util/get-end %)))
(drop-while #(< (:end %) start))))
(step (dec start) []))))

(defrecord MPileupElement [^String rname ^long pos ^Character ref ^long count seq qual reads])

(defn- gen-mpileup
(defn gen-mpileup
"Compute mpileup info from piled-up reads and reference."
[^String rname ^long locus ^Character reference reads]
(let [seq (mapv (fn [{:keys [^long pos ^String seq ^String cigar] :as aln}]
(let [s (nth (encode-seq (cseq/parse seq cigar)) (- locus pos))]
(cond
(vector? s) (second s)
(= reference (first s)) "."
:else s))) reads)
qual (mapv (fn [{:keys [^long pos ^String qual] :as aln}]
(if (= qual "*") \~ (nth qual (- locus pos) \~))) reads)]
(MPileupElement. rname locus reference (count reads) seq qual reads)))
[^String rname ^long locus [^Character ref-base :as refs] reads]
(let [seqs (map (fn gen-mpileup-seq [{:keys [^long pos ^String seq cig-index]}]
(substitute-seq refs seq (nth cig-index (- locus pos)))) reads)
qual (map (fn gen-mpileup-qual [{:keys [^long pos ^String qual cig-index]}]
(substitute-qual qual (nth cig-index (- locus pos)))) reads)]
(MPileupElement. rname locus ref-base (count reads) seqs qual reads)))

(defn pileup*
"Internal pileup function independent from I/O."
[refseq alns rname start end]
(map
(partial gen-mpileup rname)
(range start (inc end))
refseq
(pileup-seq start end alns)))
"Internal mpileup function independent from I/O.
Can take multiple alignments seqs."
[refseq rname start end & aln-seqs]
(->> aln-seqs
(map
(fn [alns]
(->> alns
(sequence (map (fn [a] (assoc a :cig-index (cig/to-index (:cigar a))))))
(pileup-seq start end))))
(apply map
(fn [index refs & plps]
(map (fn [plp] (gen-mpileup rname index refs plp)) plps))
(range start (inc end))
(partition 10 1 (concat refseq (repeat \N))))))

(defn pileup
"Returns a lazy sequence of MPileupElement calculated from FASTA and BAM."
Expand All @@ -89,7 +94,7 @@
(fa/read-sequence fa-reader {:chr rname :start s :end e})
(repeat \N))
alns (io/read-alignments bam-reader {:chr rname :start s :end e :depth :deep})]
(pileup* refseq alns rname s e)))
(map (fn [p] (update (first p) :seq (fn [s] (map to-mpileup s)))) (pileup* refseq rname s e alns))))
(catch bgzf4j.BGZFException _
(throw (RuntimeException. "Invalid file format"))))))

Expand Down
36 changes: 36 additions & 0 deletions test/cljam/t_cigar.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,39 @@

(fact "about parse"
(cgr/parse "1S2I6M1P11I") => '([1 \S] [2 \I] [6 \M] [1 \P] [11 \I]))

(tabular
(fact
"cigar to index"
(map (fn [[op x :as xs]] (if (= op :m) x xs)) (cgr/to-index* ?cigar)) => ?index)
?cigar ?index
"4M" [0 1 2 3]
"1M3I" [[:i 0 [1 2 3]]]
"1M3D" [[:d 0 [1 2 3]] \* \* \*]
"2M3I" [0 [:i 1 [2 3 4]]]
"4D5M" [\* \* \* \* 0 1 2 3 4] ;; ^]* * * * A T G C A$
"4D5I" [\* \* \* [:i \* [0 1 2 3 4]]] ;; TODO: ^]* * * *+5TGCA=$
"4S4M" [4 5 6 7]
"4H4M" [0 1 2 3]
"1M3I1M" [[:i 0 [1 2 3]] 4]
"2M3I1M" [0 [:i 1 [2 3 4]] 5]
"4I2D5M" [\* \* 4 5 6 7 8] ;; ^]* * A T G C A$
"1M4D5I" [[:d 0 [1 2 3 4]] \* \* \* [:i \* [1 2 3 4 5]]] ;; TODO: ^]* * * *+5TGCA=$
"1M3D3M" [[:d 0 [1 2 3]] \* \* \* 1 2 3]
"1M2D1I2M" [[:d 0 [1 2]] \* [:i \* [1]] 2 3] ;; TODO: ^]A-2NN * *+1G G C$
"1M2N1I2M" [0 \> [:i \> [1]] 2 3] ;; TODO: ^]A > >+1G G C$
"1M1I2D2M" [[:i 0 [1]] \* \* 2 3] ;; ^]A+1T * * G C$
"1M1I2N2M" [[:i 0 [1]] \> \> 2 3] ;; ^]A+1T > > G C$
"1M4I2D5M" [[:i 0 [1 2 3 4]] \* \* 5 6 7 8 9] ;; ^]A+4TGCA * * C A T G C$
"1S4I2D5M" [\* \* 5 6 7 8 9] ;; ^]* * T G C A T$
"3M2P2I3M" [0 1 [:i 2 [3 4]] 5 6 7] ;; ^]A T G+2CA T G C$
"1P4I1D5M" [\* 4 5 6 7 8] ;; ^]* A T G C A$
"5M1D3M4S" [0 1 2 3 [:d 4 [5]] \* 5 6 7] ;; ^]A T G C A-1N * T G C$
"5M1N3M4S" [0 1 2 3 4 \> 5 6 7] ;; ^]A T G C A > T G C$
"3H2S3M1D2S4H" [2 3 [:d 4 [3]] \*] ;; ^]G C A-1N \*$
"1M1P1I1P1I2M" [[:i 0 [1 2]] 3 4] ;; ^]A+2TG C A$
"2M1I1D2I2D1M" [0 [:i 1 [2]] [:i \* [3 4]] \* \* 5] ;; TODO: ^]A T+1G *+2AT * * T$
"1S2I6M1P1I1P1I4M2I" [3 4 5 6 7 [:i 8 [9 10]] 11 12 13 [:i 14 [15 16]]] ;; ^]C A T G C A+2TG C A T G+2CA$
"6M14D1I5M" [0 1 2 3 4 [:d 5 [6 7 8 9 10 11 12 13 14 15 16 17 18 19]] \* \* \* \* \* \* \* \* \* \* \* \* \* [:i \* [6]] 7 8 9 10 11] ;; ^]A T G C A T-14NNNNNNNNNNNNNN * * * * * * * * * * * * * *+1C C A T G C$
"6M14N1I5M" [0 1 2 3 4 5 \> \> \> \> \> \> \> \> \> \> \> \> \> [:i \> [6]] 7 8 9 10 11] ;; ^]A T G C A T > > > > > > > > > > > > > >+1C C A T G C$
"1S2M2D3M1D4M3D2M3D3M3I4M3D2M3S" [1 [:d 2 [2 3]] \* \* 3 4 [:d 5 [7]] \* 6 7 8 [:d 9 [12 13 14]] \* \* \* 10 [:d 11 [17 18 19]] \* \* \* 12 13 [:i 14 [15 16 17]] 18 19 20 [:d 21 [27 28 29]] \* \* \* 22 23])
89 changes: 75 additions & 14 deletions test/cljam/t_pileup.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,30 @@
cljam.t-common)
(:require [cljam.bam :as bam]
[cljam.fasta :as fa]
[cljam.pileup :as plp]))
[cljam.pileup :as plp]
[cljam.pileup.mpileup :as mplp]))

(def test-bam-pileup-ref '(0 0 0 0 0 0 1 1 3 3 3 3 3 3 2 3 3 3 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 2 2 2 2 1 1 1 1 1))
(def test-bam-pileup-ref2 '(1 2 2 2 2 3 3 3 3 4 4 5 5 6 6 6 6 6 6 6 5 5 4 4 4 4 4 3 3 3 3 3 3 3 2 1 0 0 0 0))
(def test-bam-mpileup-seq-ref
'(() () () () () () ("T") ("T") ("A" "A" "A") ("G" "G" "G") ("A" "A" "C") ("T" "T" "T")
("A" "A" "A") ("+4AGAG" "+2GG" "A") ("G" "G") ("A" "A" "A") ("T" "T" "T") ("A" "+2AA" "A")
("*" "G") ("C" "C") ("T" "T") ("G" ">") (">") (">") (">") (">") (">") (">") (">" "T") (">" "A")
(">" "G") (">" "G") (">" "C") (">") ("+1C") ("T") ("C" "C") ("A" "A") ("G" "G") ("C" "C")
("G") ("C") ("C") ("A") ("T")))
'(() () () () () () ("T") ("T") ("A" "A" "A") ("G" "G" "G") ("A" "A" "C") ("T" "T" "T") ("A" "A" "A")
("A+4AGAG" "A+2GG" "A") ("G" "G") ("A" "A" "A") ("T" "T" "T") ("A-1N" "A+2AA" "A") ("*" "G") ("C" "C")
("T" "T") ("G" ">") (">") (">") (">") (">") (">") (">") (">" "T") (">" "A") (">" "G") (">" "G") (">" "C")
(">") (">+1C") ;; adjacent indel >+1T
("T") ("C" "C") ("A" "A") ("G" "G") ("C" "C") ("G") ("C") ("C") ("A") ("T")))
(def test-bam-mpileup-seq-ref2
'(() () () () () () (".") (".") ("." "." ".") ("." "." ".") ("." "." "C") ("." "." ".")
("." "." ".") ("+4AGAG" "+2GG" ".") ("." ".") ("." "." ".") ("." "." ".") ("." "+2AA" ".")
("*" ".") ("." ".") ("." ".") ("." ">") (">") (">") (">") (">") (">") (">") (">" ".") (">" ".")
(">" ".") (">" ".") (">" ".") (">") ("+1C") (".") ("." ".") ("." ".") ("." ".") ("." ".")
(".") (".") (".") (".") (".")))
'(("A") ("G" "G") ("G" "G") ("T" "T") ("T" "T") ("T" "T" "T") ("T" "T" "T") ("A" "A" "A") ("T" "T" "T")
("A" "A" "A" "C") ("A" "A" "A" "A") ("A" "A" "A" "A" "A") ("A" "A" "A" "A" "A") ("C" "C" "C+4AAAT" "T" "T" "T")
("A" "A" "A" "A" "A" "A") ("A" "A" "A" "A" "A" "A") ("A" "A" "T" "T" "T" "T") ("T" "T" "T" "T" "T" "T")
("A" "A" "A" "A" "A" "A") ("A" "A" "A" "A" "A" "A") ("T" "G" "G" "G" "G") ("T" "T" "T" "T" "T")
("C" "C" "C" "C") ("T" "T" "T" "T") ("A" "A" "A" "A") ("C" "C" "C" "C") ("A" "A" "A" "A") ("G" "G" "G")
("A" "A" "A") ("G" "G" "G") ("C" "C" "C") ("A" "A" "A") ("A" "A" "A") ("C" "C" "C") ("T" "T") ("A") () () () ()))
(def test-bam-mpileup-seq-ref-with-ref
'(() () () () () () (".") (".") ("." "." ".") ("." "." ".") ("." "." "C") ("." "." ".") ("." "." ".")
(".+4AGAG" ".+2GG" ".") ("." ".") ("." "." ".") ("." "." ".") (".-1G" ".+2AA" ".") ("*" ".") ("." ".")
("." ".") ("." ">") (">") (">") (">") (">") (">") (">") (">" ".") (">" ".") (">" ".") (">" ".") (">" ".")
(">") (">+1C") ;; adjacent indel >+1T
(".") ("." ".") ("." ".") ("." ".") ("." ".") (".") (".") (".") (".") (".")))
(def test-bam-mpileup-qual-ref
'([] [] [] [] [] [] [\~] [\~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~]
[\~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~] [\~ \~] [\~ \~] [\~ \~] [\~] [\~] [\~] [\~]
Expand All @@ -43,6 +51,58 @@
(fact "about first-pos"
(plp/first-pos (bam/reader test-sorted-bam-file) "ref2" 0 64) => 1)

(fact
"about pileup-seq"

;; ----------
;; 1234567890...
(map
(fn [xs] (map #(dissoc % :end) xs))
(mplp/pileup-seq 1 2 [{:pos 1 :cigar "10M"}]))
=> [[{:pos 1 :cigar "10M"}] [{:pos 1 :cigar "10M"}]]

;; ----------
;; ----------
;; ----------
;; ----------
;; 1234567890123...
(map
count
(mplp/pileup-seq 1 20 (map #(hash-map :pos (inc %) :cigar "10M") (range))))
=> [1 2 3 4 5 6 7 8 9 10 10 10 10 10 10 10 10 10 10 10]
(map
count
(mplp/pileup-seq 101 120 (map #(hash-map :pos (inc %) :cigar "10M") (range))))
=> [10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10]
(map
:pos
(last (mplp/pileup-seq 1 1000000 (map #(hash-map :pos (inc %) :cigar "10M") (range)))))
=> [999991 999992 999993 999994 999995 999996 999997 999998 999999 1000000]

;; -----
;; ----
;; ---
;; --
;; -
;; 1234567890...
(map
count
(mplp/pileup-seq 1 10 (map #(hash-map :pos (inc %) :cigar (str (inc %) "M")) (range))))
=> [1 1 2 2 3 3 4 4 5 5]

;; --------
;; ----------
;; --
;; ----
;; ------
;; --------
;; ----------
;; 1234567890...
(map
count
(mplp/pileup-seq 1 10 (map #(hash-map :pos (inc %) :cigar (str (- 10 (* (mod % 5) 2)) "M")) (range))))
=> [1 2 3 4 5 6 6 6 6 6])

(fact "about mpileup"
(with-open [br (bam/reader test-sorted-bam-file)
fr (fa/reader test-fa-file)]
Expand All @@ -54,15 +114,16 @@
(map :pos mplp-ref) => (range 1 46)
(map :seq mplp-ref) => test-bam-mpileup-seq-ref
(map :qual mplp-ref) => test-bam-mpileup-qual-ref
(map :count mplp-ref2) => test-bam-pileup-ref2)
(map :count mplp-ref2) => test-bam-pileup-ref2
(map :seq mplp-ref2) => test-bam-mpileup-seq-ref2)

(let [mplp-ref (doall (plp/mpileup fr br "ref"))]
(map :rname mplp-ref) => (repeat 45 "ref")
;; 123456789012345678901234567890123456789012345
(apply str (map :ref mplp-ref)) => "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT"
(map :count mplp-ref) => test-bam-pileup-ref
(map :pos mplp-ref) => (range 1 46)
(map :seq mplp-ref) => test-bam-mpileup-seq-ref2
(map :seq mplp-ref) => test-bam-mpileup-seq-ref-with-ref
(map :qual mplp-ref) => test-bam-mpileup-qual-ref)))

(fact "mpileup region"
Expand All @@ -75,6 +136,6 @@
(apply str (map :ref mplp-ref1)) => "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGC"
(map :count mplp-ref1) => (take 40 test-bam-pileup-ref)
(map :pos mplp-ref1) => (range 1 41)
(map :seq mplp-ref1) => (take 40 test-bam-mpileup-seq-ref2)
(map :seq mplp-ref1) => (take 40 test-bam-mpileup-seq-ref-with-ref)
(apply str (map :ref mplp-ref2)) => "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG")))