Skip to content

Commit

Permalink
Add tests for CRAM index generation
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Jul 19, 2024
1 parent 1d907ab commit 4564efb
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 57 deletions.
33 changes: 22 additions & 11 deletions src/cljam/io/crai.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,33 @@
[clojure.string :as str])
(:import [java.io Closeable OutputStreamWriter PrintWriter]))

(defn- read-index-entries [rdr]
(->> (line-seq rdr)
(map (fn [line]
(let [[^long seq-id start span container-offset slice-offset size]
(map #(Long/parseLong %) (str/split line #"\t"))
unmapped? (neg? seq-id)]
{:ref-seq-id seq-id
:start (if unmapped? 0 start)
:span (if unmapped? 0 span)
:container-offset container-offset
:slice-offset slice-offset
:size size})))))

(defn read-index
"Reads a CRAI file `f` and creates an index."
[f refs]
(let [refs (vec refs)]
(with-open [rdr (io/reader (util/compressor-input-stream f))]
(->> (line-seq rdr)
(map (fn [line]
(let [[^long seq-id ^long start ^long span container-offset slice-offset size]
(map #(Long/parseLong %) (str/split line #"\t"))
unmapped? (neg? seq-id)]
{:chr (if unmapped? "*" (:name (nth refs seq-id)))
:start (if unmapped? 0 start)
:end (if unmapped? 0 (+ start span))
:container-offset container-offset
:slice-offset slice-offset
:size size})))
(->> (read-index-entries rdr)
(map (fn [{:keys [^long ref-seq-id ^long start ^long span] :as entry}]
(-> entry
(assoc :chr
(if (neg? ref-seq-id)
"*"
(:name (nth refs ref-seq-id)))
:end (+ start span))
(dissoc :ref-seq-id :span))))
intervals/index-intervals))))

(defn find-overlapping-entries
Expand Down
35 changes: 35 additions & 0 deletions test/cljam/algo/cram_indexer_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
(ns cljam.algo.cram-indexer-test
(:require [cljam.algo.cram-indexer :as indexer]
[cljam.io.crai :as crai]
[cljam.io.cram :as cram]
[cljam.test-common :as common]
[cljam.util :as util]
[clojure.java.io :as io]
[clojure.test :refer [deftest is]]))

(defn- read-index-entries [f]
(with-open [r (io/reader (util/compressor-input-stream f))]
(doall (#'crai/read-index-entries r))))

(deftest create-index-test
(let [f (io/file common/temp-dir "medium.cram.crai")]
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(is (thrown-with-msg? Exception #"Cannot create CRAM index file .*"
(indexer/create-index common/medium-cram-file f)))
(indexer/create-index common/medium-cram-file f
:skip-sort-order-check? true)
(is (= (read-index-entries common/medium-crai-file)
(read-index-entries f))))))

(deftest cram-without-alignments-test
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(let [header {:HD {:SO "coordinate"}
:SQ [{:SN "chr1", :LN 100}]}
target (io/file common/temp-dir "no_aln.cram")
target-crai (io/file common/temp-dir "no_aln.cram.crai")]
(with-open [w (cram/writer target)]
(cram/write-header w header)
(cram/write-refs w header))
(is (common/not-throw? (indexer/create-index target target-crai))))))
25 changes: 24 additions & 1 deletion test/cljam/io/crai_test.clj
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
(ns cljam.io.crai-test
(:require [cljam.io.crai :as crai]
[cljam.test-common :as common]
[clojure.test :refer [deftest are]]))
[cljam.util :as util]
[clojure.java.io :as io]
[clojure.test :refer [are deftest is]]))

(def ^:private test-refs
(->> (concat (range 1 23) ["X" "Y"])
Expand Down Expand Up @@ -116,3 +118,24 @@
:container-offset 378365
:slice-offset 190037
:size 12326}])))

(deftest write-index-entries-test
(let [entries [{:ref-seq-id 0, :start 546609, :span 205262429,
:container-offset 324, :slice-offset 563, :size 22007}
{:ref-seq-id 0, :start 206547069, :span 42644506,
:container-offset 324, :slice-offset 22570, :size 7349}
{:ref-seq-id 1, :start 67302, :span 231638920,
:container-offset 30272, :slice-offset 563, :size 21618}
{:ref-seq-id -1, :start 0, :span 0,
:container-offset 354657, :slice-offset 563, :size 23119}
{:ref-seq-id -1, :start 0, :span 0,
:container-offset 378365, :slice-offset 171, :size 23494}
{:ref-seq-id -1, :start 0, :span 0,
:container-offset 378365, :slice-offset 23665, :size 23213}]
f (io/file common/temp-dir "test.cram.crai")]
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(with-open [w (crai/writer f)]
(crai/write-index-entries w entries))
(with-open [r (io/reader (util/compressor-input-stream f))]
(is (= entries (#'crai/read-index-entries r)))))))
100 changes: 55 additions & 45 deletions test/cljam/io/cram/encode/record_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@
:RG
[{:ID "rg001"}
{:ID "rg002"}]}
rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
tag-dict [[]
[{:tag :MD, :type \Z}
{:tag :NM, :type \c}]]
Expand All @@ -87,34 +90,36 @@
:NM {\c {:codec :byte-array-len
:len-encoding {:codec :huffman, :alphabet [1], :bit-len [0]}
:val-encoding {:codec :external, :content-id 5131619}}}})
records [{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0,
:cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]
::record/tags-index 1}
{:qname "q002", :flag 99, :rname "ref", :pos 5, :end 7, :mapq 15,
:cigar "2S3M", :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:qname "q003", :flag 177, :rname "ref", :pos 10, :end 14, :mapq 60,
:cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15,
:cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]
::record/tags-index 1}
{:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options [], ::record/tags-index 0}]
stats (record/encode-slice-records test-seq-resolver cram-header tag-dict subst-mat
records (object-array
[{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0,
:cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]
::record/tags-index 1}
{:qname "q002", :flag 99, :rname "ref", :pos 5, :end 7, :mapq 15,
:cigar "2S3M", :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:qname "q003", :flag 177, :rname "ref", :pos 10, :end 14, :mapq 60,
:cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15,
:cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]
::record/tags-index 1}
{:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options [], ::record/tags-index 0}])
_ (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records)
stats (record/encode-slice-records cram-header rname->idx tag-dict
ds-encoders tag-encoders records)
ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders)
tag-res (walk/prewalk #(if (fn? %) (%) %) tag-encoders)]
Expand Down Expand Up @@ -269,25 +274,30 @@
(let [cram-header {:SQ
[{:SN "ref"}
{:SN "ref2"}]}
rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
tag-dict [[]]
ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings)
records [{:qname "q001", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF"
:options [], ::record/tags-index 0}
{:qname "q001", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD"
:options [], ::record/tags-index 0}
{:qname "q002", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF"
:options [], ::record/tags-index 0}
{:qname "q002", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD"
:options [], ::record/tags-index 0}
{:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD"
:options [], ::record/tags-index 0}]
stats (record/encode-slice-records test-seq-resolver cram-header tag-dict
subst-mat ds-encoders {} records)
records (object-array
[{:qname "q001", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF"
:options [], ::record/tags-index 0}
{:qname "q001", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD"
:options [], ::record/tags-index 0}
{:qname "q002", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF"
:options [], ::record/tags-index 0}
{:qname "q002", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD"
:options [], ::record/tags-index 0}
{:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD"
:options [], ::record/tags-index 0}])
_ (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records)
stats (record/encode-slice-records cram-header rname->idx tag-dict
ds-encoders {} records)
ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders)]
(is (= {:ri -1, :start 0, :end 0, :nbases 25, :nrecords 5}
(into {} stats)))
Expand Down

0 comments on commit 4564efb

Please sign in to comment.