diff --git a/src/cljam/cigar.clj b/src/cljam/cigar.clj index bd22ec53..1ca5d2c3 100644 --- a/src/cljam/cigar.clj +++ b/src/cljam/cigar.clj @@ -8,6 +8,45 @@ (for [[_ n op] (re-seq #"([0-9]*)([MIDNSHP=X])" s)] [(Integer/parseInt n) (first op)])) +(defn simplify + "Merge contiguous same operations of parsed CIGAR." + [cigs] + (loop [[[^long l op :as x] & xs] cigs result (transient [])] + (if (and l op) + (let [[^long nl nop] (first xs)] + (if (= op nop) + (recur (cons [(+ l nl) op] (next xs)) result) + (recur xs (conj! result x)))) + (persistent! result)))) + +(defn- concat! [v coll] + (reduce conj! v coll)) + +(defn- update-last! [coll f] + (let [c (dec (count coll))] + (if (neg? c) + coll + (let [[op x] (get coll c)] + (if (= :m op) + (assoc! coll c (f x)) + coll))))) + +(defn to-index* + "Convert CIGAR string to sequence of indices." + [^String s] + (let [cigs (simplify (remove (comp #{\P \H} second) (parse s)))] + (loop [[[^long l op] & xs] cigs r 0 s 0 idx (transient [])] + (if (and l op) + (condp get op + #{\M \= \X} (recur xs (+ l r) (+ l s) (concat! idx (map (fn [x] [:m x]) (range s (+ l s))))) + #{\D} (recur xs (+ r l) s (concat! (update-last! idx (fn [x] [:d x (range r (+ l r))])) (repeat l [:m \*]))) + #{\N} (recur xs (+ r l) s (concat! idx (repeat l [:m \>]))) + #{\S} (recur xs r (+ s l) idx) + #{\I} (recur xs r (+ s l) (update-last! idx (fn [x] [:i x (range s (+ l s))])))) + (persistent! idx))))) + +(def to-index (memoize to-index*)) + (defn count-op "Returns length of CIGAR operations." [^String s] diff --git a/src/cljam/cli.clj b/src/cljam/cli.clj index 0601bebb..e0bafa7b 100644 --- a/src/cljam/cli.clj +++ b/src/cljam/cli.clj @@ -244,7 +244,7 @@ :seq (cstr/join (% line)) (% line)) [:rname :pos :ref :count :seq :qual])))))) ([rdr rname start end] - (doseq [line (plp/mpileup rdr rname start end)] + (doseq [line (plp/mpileup nil rdr rname start end)] (if-not (zero? (:count line)) (println (cstr/join \tab (map #(case % :qual (cstr/join (% line)) diff --git a/src/cljam/pileup/mpileup.clj b/src/cljam/pileup/mpileup.clj index a405a18d..3e89f0c6 100644 --- a/src/cljam/pileup/mpileup.clj +++ b/src/cljam/pileup/mpileup.clj @@ -7,72 +7,77 @@ [cljam.sequence :as cseq] [cljam.io :as io] [cljam.fasta :as fa] + [cljam.cigar :as cig] [cljam.pileup.common :refer [window-width step center]] [cljam.pileup.pileup :refer [rpositions]])) -(defn- append-seq - [op target current] - (case op - \M (apply conj current (map str (:seq target))) - \I (if (seq current) - (update-in current [(dec (count current))] - vector - (str "+" (:n target) (apply str (:seq target)))) - current) - \D (apply conj current (map str (:seq target))) - \N (apply conj current (map str (:seq target))) - current)) +(defn to-mpileup + "Stringify mpileup sequence." + [x] + (if (vector? x) + (let [[y op xs] x] (apply str y op (count xs) xs)) + (str x))) -(defn encode-seq [seq*] - (loop [[f & r] (filter #(nil? (#{\P} (:op %))) seq*) - ret [] - op nil - tmp {:n 0, :op nil, :seq nil}] - (if (nil? f) - (append-seq op tmp ret) - (if (nil? op) - (recur r ret (:op f) f) - (if (= (:op f) op) - (recur r ret (:op f) (-> tmp - (update-in [:n] + (:n f)) - (assoc :op (:op f)) - (update-in [:seq] (partial apply conj) (:seq f)))) - (let [new-ret (append-seq op tmp ret)] - (recur r new-ret (:op f) f))))))) +(defn substitute-seq + "Substitute sequence with mpileup index." + [[^Character r & refs] ^String s [op x xs]] + (letfn [(get-char [i] (if (number? i) (let [c (.charAt s i)] (if (= c r) \. c)) i))] + (case op + :m (get-char x) + :d [(get-char x) \- (take (count xs) refs)] + :i [(get-char x) \+ (map #(.charAt s %) xs)] + x))) + +(defn substitute-qual + "Substitute base quality with mpileup index." + [^String q [op x]] + (if (and (number? x) (not= q "*")) (.charAt q x) \~)) (defn pileup-seq "Returns a lazy sequence that each element contains reads piled-up at the locus." [^long start ^long end reads] (letfn [(cover-locus? [pos aln] - (<= (:pos aln) pos (sam-util/get-end aln))) - (step [[pos buf alns]] - (let [[i o] (split-with (partial cover-locus? (inc pos)) alns)] - [(inc pos) (concat (filter (partial cover-locus? (inc pos)) buf) i) o]))] - (map second (rest (iterate step [(dec start) [] (remove (comp empty? :cigar) reads)]))))) + (<= (:pos aln) pos (:end aln))) + (step [pos buf alns] + (lazy-seq + (when (< pos end) + (let [[i o] (split-with (partial cover-locus? (inc pos)) alns) + b (doall (concat (filter #(<= (inc pos) (:end %)) buf) i))] + (cons b (step (inc pos) b o))))))] + (->> reads + (sequence + (comp + (remove #(and (empty? (:cigar %)) (nil? (:cigar-bytes (:meta %))))) + (map #(assoc % :end (sam-util/get-end %))) + (drop-while #(< (:end %) start)))) + (step (dec start) [])))) (defrecord MPileupElement [^String rname ^long pos ^Character ref ^long count seq qual reads]) -(defn- gen-mpileup +(defn gen-mpileup "Compute mpileup info from piled-up reads and reference." - [^String rname ^long locus ^Character reference reads] - (let [seq (mapv (fn [{:keys [^long pos ^String seq ^String cigar] :as aln}] - (let [s (nth (encode-seq (cseq/parse seq cigar)) (- locus pos))] - (cond - (vector? s) (second s) - (= reference (first s)) "." - :else s))) reads) - qual (mapv (fn [{:keys [^long pos ^String qual] :as aln}] - (if (= qual "*") \~ (nth qual (- locus pos) \~))) reads)] - (MPileupElement. rname locus reference (count reads) seq qual reads))) + [^String rname ^long locus [^Character ref-base :as refs] reads] + (let [seqs (map (fn gen-mpileup-seq [{:keys [^long pos ^String seq cig-index]}] + (substitute-seq refs seq (nth cig-index (- locus pos)))) reads) + qual (map (fn gen-mpileup-qual [{:keys [^long pos ^String qual cig-index]}] + (substitute-qual qual (nth cig-index (- locus pos)))) reads)] + (MPileupElement. rname locus ref-base (count reads) seqs qual reads))) (defn pileup* - "Internal pileup function independent from I/O." - [refseq alns rname start end] - (map - (partial gen-mpileup rname) - (range start (inc end)) - refseq - (pileup-seq start end alns))) + "Internal mpileup function independent from I/O. + Can take multiple alignments seqs." + [refseq rname start end & aln-seqs] + (->> aln-seqs + (map + (fn [alns] + (->> alns + (sequence (map (fn [a] (assoc a :cig-index (cig/to-index (:cigar a)))))) + (pileup-seq start end)))) + (apply map + (fn [index refs & plps] + (map (fn [plp] (gen-mpileup rname index refs plp)) plps)) + (range start (inc end)) + (partition 10 1 (concat refseq (repeat \N)))))) (defn pileup "Returns a lazy sequence of MPileupElement calculated from FASTA and BAM." @@ -89,7 +94,7 @@ (fa/read-sequence fa-reader {:chr rname :start s :end e}) (repeat \N)) alns (io/read-alignments bam-reader {:chr rname :start s :end e :depth :deep})] - (pileup* refseq alns rname s e))) + (map (fn [p] (update (first p) :seq (fn [s] (map to-mpileup s)))) (pileup* refseq rname s e alns)))) (catch bgzf4j.BGZFException _ (throw (RuntimeException. "Invalid file format")))))) diff --git a/test/cljam/t_cigar.clj b/test/cljam/t_cigar.clj index 8cce60fa..3218f3b4 100644 --- a/test/cljam/t_cigar.clj +++ b/test/cljam/t_cigar.clj @@ -4,3 +4,39 @@ (fact "about parse" (cgr/parse "1S2I6M1P11I") => '([1 \S] [2 \I] [6 \M] [1 \P] [11 \I])) + +(tabular + (fact + "cigar to index" + (map (fn [[op x :as xs]] (if (= op :m) x xs)) (cgr/to-index* ?cigar)) => ?index) + ?cigar ?index + "4M" [0 1 2 3] + "1M3I" [[:i 0 [1 2 3]]] + "1M3D" [[:d 0 [1 2 3]] \* \* \*] + "2M3I" [0 [:i 1 [2 3 4]]] + "4D5M" [\* \* \* \* 0 1 2 3 4] ;; ^]* * * * A T G C A$ + "4D5I" [\* \* \* [:i \* [0 1 2 3 4]]] ;; TODO: ^]* * * *+5TGCA=$ + "4S4M" [4 5 6 7] + "4H4M" [0 1 2 3] + "1M3I1M" [[:i 0 [1 2 3]] 4] + "2M3I1M" [0 [:i 1 [2 3 4]] 5] + "4I2D5M" [\* \* 4 5 6 7 8] ;; ^]* * A T G C A$ + "1M4D5I" [[:d 0 [1 2 3 4]] \* \* \* [:i \* [1 2 3 4 5]]] ;; TODO: ^]* * * *+5TGCA=$ + "1M3D3M" [[:d 0 [1 2 3]] \* \* \* 1 2 3] + "1M2D1I2M" [[:d 0 [1 2]] \* [:i \* [1]] 2 3] ;; TODO: ^]A-2NN * *+1G G C$ + "1M2N1I2M" [0 \> [:i \> [1]] 2 3] ;; TODO: ^]A > >+1G G C$ + "1M1I2D2M" [[:i 0 [1]] \* \* 2 3] ;; ^]A+1T * * G C$ + "1M1I2N2M" [[:i 0 [1]] \> \> 2 3] ;; ^]A+1T > > G C$ + "1M4I2D5M" [[:i 0 [1 2 3 4]] \* \* 5 6 7 8 9] ;; ^]A+4TGCA * * C A T G C$ + "1S4I2D5M" [\* \* 5 6 7 8 9] ;; ^]* * T G C A T$ + "3M2P2I3M" [0 1 [:i 2 [3 4]] 5 6 7] ;; ^]A T G+2CA T G C$ + "1P4I1D5M" [\* 4 5 6 7 8] ;; ^]* A T G C A$ + "5M1D3M4S" [0 1 2 3 [:d 4 [5]] \* 5 6 7] ;; ^]A T G C A-1N * T G C$ + "5M1N3M4S" [0 1 2 3 4 \> 5 6 7] ;; ^]A T G C A > T G C$ + "3H2S3M1D2S4H" [2 3 [:d 4 [3]] \*] ;; ^]G C A-1N \*$ + "1M1P1I1P1I2M" [[:i 0 [1 2]] 3 4] ;; ^]A+2TG C A$ + "2M1I1D2I2D1M" [0 [:i 1 [2]] [:i \* [3 4]] \* \* 5] ;; TODO: ^]A T+1G *+2AT * * T$ + "1S2I6M1P1I1P1I4M2I" [3 4 5 6 7 [:i 8 [9 10]] 11 12 13 [:i 14 [15 16]]] ;; ^]C A T G C A+2TG C A T G+2CA$ + "6M14D1I5M" [0 1 2 3 4 [:d 5 [6 7 8 9 10 11 12 13 14 15 16 17 18 19]] \* \* \* \* \* \* \* \* \* \* \* \* \* [:i \* [6]] 7 8 9 10 11] ;; ^]A T G C A T-14NNNNNNNNNNNNNN * * * * * * * * * * * * * *+1C C A T G C$ + "6M14N1I5M" [0 1 2 3 4 5 \> \> \> \> \> \> \> \> \> \> \> \> \> [:i \> [6]] 7 8 9 10 11] ;; ^]A T G C A T > > > > > > > > > > > > > >+1C C A T G C$ + "1S2M2D3M1D4M3D2M3D3M3I4M3D2M3S" [1 [:d 2 [2 3]] \* \* 3 4 [:d 5 [7]] \* 6 7 8 [:d 9 [12 13 14]] \* \* \* 10 [:d 11 [17 18 19]] \* \* \* 12 13 [:i 14 [15 16 17]] 18 19 20 [:d 21 [27 28 29]] \* \* \* 22 23]) diff --git a/test/cljam/t_pileup.clj b/test/cljam/t_pileup.clj index 371e10d1..7c001466 100644 --- a/test/cljam/t_pileup.clj +++ b/test/cljam/t_pileup.clj @@ -3,22 +3,30 @@ cljam.t-common) (:require [cljam.bam :as bam] [cljam.fasta :as fa] - [cljam.pileup :as plp])) + [cljam.pileup :as plp] + [cljam.pileup.mpileup :as mplp])) (def test-bam-pileup-ref '(0 0 0 0 0 0 1 1 3 3 3 3 3 3 2 3 3 3 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 2 2 2 2 1 1 1 1 1)) (def test-bam-pileup-ref2 '(1 2 2 2 2 3 3 3 3 4 4 5 5 6 6 6 6 6 6 6 5 5 4 4 4 4 4 3 3 3 3 3 3 3 2 1 0 0 0 0)) (def test-bam-mpileup-seq-ref - '(() () () () () () ("T") ("T") ("A" "A" "A") ("G" "G" "G") ("A" "A" "C") ("T" "T" "T") - ("A" "A" "A") ("+4AGAG" "+2GG" "A") ("G" "G") ("A" "A" "A") ("T" "T" "T") ("A" "+2AA" "A") - ("*" "G") ("C" "C") ("T" "T") ("G" ">") (">") (">") (">") (">") (">") (">") (">" "T") (">" "A") - (">" "G") (">" "G") (">" "C") (">") ("+1C") ("T") ("C" "C") ("A" "A") ("G" "G") ("C" "C") - ("G") ("C") ("C") ("A") ("T"))) + '(() () () () () () ("T") ("T") ("A" "A" "A") ("G" "G" "G") ("A" "A" "C") ("T" "T" "T") ("A" "A" "A") + ("A+4AGAG" "A+2GG" "A") ("G" "G") ("A" "A" "A") ("T" "T" "T") ("A-1N" "A+2AA" "A") ("*" "G") ("C" "C") + ("T" "T") ("G" ">") (">") (">") (">") (">") (">") (">") (">" "T") (">" "A") (">" "G") (">" "G") (">" "C") + (">") (">+1C") ;; adjacent indel >+1T + ("T") ("C" "C") ("A" "A") ("G" "G") ("C" "C") ("G") ("C") ("C") ("A") ("T"))) (def test-bam-mpileup-seq-ref2 - '(() () () () () () (".") (".") ("." "." ".") ("." "." ".") ("." "." "C") ("." "." ".") - ("." "." ".") ("+4AGAG" "+2GG" ".") ("." ".") ("." "." ".") ("." "." ".") ("." "+2AA" ".") - ("*" ".") ("." ".") ("." ".") ("." ">") (">") (">") (">") (">") (">") (">") (">" ".") (">" ".") - (">" ".") (">" ".") (">" ".") (">") ("+1C") (".") ("." ".") ("." ".") ("." ".") ("." ".") - (".") (".") (".") (".") ("."))) + '(("A") ("G" "G") ("G" "G") ("T" "T") ("T" "T") ("T" "T" "T") ("T" "T" "T") ("A" "A" "A") ("T" "T" "T") + ("A" "A" "A" "C") ("A" "A" "A" "A") ("A" "A" "A" "A" "A") ("A" "A" "A" "A" "A") ("C" "C" "C+4AAAT" "T" "T" "T") + ("A" "A" "A" "A" "A" "A") ("A" "A" "A" "A" "A" "A") ("A" "A" "T" "T" "T" "T") ("T" "T" "T" "T" "T" "T") + ("A" "A" "A" "A" "A" "A") ("A" "A" "A" "A" "A" "A") ("T" "G" "G" "G" "G") ("T" "T" "T" "T" "T") + ("C" "C" "C" "C") ("T" "T" "T" "T") ("A" "A" "A" "A") ("C" "C" "C" "C") ("A" "A" "A" "A") ("G" "G" "G") + ("A" "A" "A") ("G" "G" "G") ("C" "C" "C") ("A" "A" "A") ("A" "A" "A") ("C" "C" "C") ("T" "T") ("A") () () () ())) +(def test-bam-mpileup-seq-ref-with-ref + '(() () () () () () (".") (".") ("." "." ".") ("." "." ".") ("." "." "C") ("." "." ".") ("." "." ".") + (".+4AGAG" ".+2GG" ".") ("." ".") ("." "." ".") ("." "." ".") (".-1G" ".+2AA" ".") ("*" ".") ("." ".") + ("." ".") ("." ">") (">") (">") (">") (">") (">") (">") (">" ".") (">" ".") (">" ".") (">" ".") (">" ".") + (">") (">+1C") ;; adjacent indel >+1T + (".") ("." ".") ("." ".") ("." ".") ("." ".") (".") (".") (".") (".") ("."))) (def test-bam-mpileup-qual-ref '([] [] [] [] [] [] [\~] [\~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~ \~] [\~ \~] [\~ \~] [\~ \~] [\~ \~] [\~] [\~] [\~] [\~] @@ -43,6 +51,58 @@ (fact "about first-pos" (plp/first-pos (bam/reader test-sorted-bam-file) "ref2" 0 64) => 1) +(fact + "about pileup-seq" + + ;; ---------- + ;; 1234567890... + (map + (fn [xs] (map #(dissoc % :end) xs)) + (mplp/pileup-seq 1 2 [{:pos 1 :cigar "10M"}])) + => [[{:pos 1 :cigar "10M"}] [{:pos 1 :cigar "10M"}]] + + ;; ---------- + ;; ---------- + ;; ---------- + ;; ---------- + ;; 1234567890123... + (map + count + (mplp/pileup-seq 1 20 (map #(hash-map :pos (inc %) :cigar "10M") (range)))) + => [1 2 3 4 5 6 7 8 9 10 10 10 10 10 10 10 10 10 10 10] + (map + count + (mplp/pileup-seq 101 120 (map #(hash-map :pos (inc %) :cigar "10M") (range)))) + => [10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10] + (map + :pos + (last (mplp/pileup-seq 1 1000000 (map #(hash-map :pos (inc %) :cigar "10M") (range))))) + => [999991 999992 999993 999994 999995 999996 999997 999998 999999 1000000] + + ;; ----- + ;; ---- + ;; --- + ;; -- + ;; - + ;; 1234567890... + (map + count + (mplp/pileup-seq 1 10 (map #(hash-map :pos (inc %) :cigar (str (inc %) "M")) (range)))) + => [1 1 2 2 3 3 4 4 5 5] + + ;; -------- + ;; ---------- + ;; -- + ;; ---- + ;; ------ + ;; -------- + ;; ---------- + ;; 1234567890... + (map + count + (mplp/pileup-seq 1 10 (map #(hash-map :pos (inc %) :cigar (str (- 10 (* (mod % 5) 2)) "M")) (range)))) + => [1 2 3 4 5 6 6 6 6 6]) + (fact "about mpileup" (with-open [br (bam/reader test-sorted-bam-file) fr (fa/reader test-fa-file)] @@ -54,7 +114,8 @@ (map :pos mplp-ref) => (range 1 46) (map :seq mplp-ref) => test-bam-mpileup-seq-ref (map :qual mplp-ref) => test-bam-mpileup-qual-ref - (map :count mplp-ref2) => test-bam-pileup-ref2) + (map :count mplp-ref2) => test-bam-pileup-ref2 + (map :seq mplp-ref2) => test-bam-mpileup-seq-ref2) (let [mplp-ref (doall (plp/mpileup fr br "ref"))] (map :rname mplp-ref) => (repeat 45 "ref") @@ -62,7 +123,7 @@ (apply str (map :ref mplp-ref)) => "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT" (map :count mplp-ref) => test-bam-pileup-ref (map :pos mplp-ref) => (range 1 46) - (map :seq mplp-ref) => test-bam-mpileup-seq-ref2 + (map :seq mplp-ref) => test-bam-mpileup-seq-ref-with-ref (map :qual mplp-ref) => test-bam-mpileup-qual-ref))) (fact "mpileup region" @@ -75,6 +136,6 @@ (apply str (map :ref mplp-ref1)) => "AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGC" (map :count mplp-ref1) => (take 40 test-bam-pileup-ref) (map :pos mplp-ref1) => (range 1 41) - (map :seq mplp-ref1) => (take 40 test-bam-mpileup-seq-ref2) + (map :seq mplp-ref1) => (take 40 test-bam-mpileup-seq-ref-with-ref) (apply str (map :ref mplp-ref2)) => "AGGTTTTATAAAACAATTAAGTCTACAGAGCAACTACGCG")))