From 90b188a0b3afb0743f3860cdbce190b7ca3d6cfe Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 30 Sep 2024 21:13:25 +0800 Subject: [PATCH] update --- docs/api.rst | 18 ++++++++ precellar/src/align.rs | 3 +- precellar/src/io.rs | 7 +++ python/src/pyseqspec.rs | 71 +++++++++++++++++++++++++---- python/src/utils.rs | 33 ++++++++++++++ seqspec_templates/10x_rna_atac.yaml | 2 +- 6 files changed, 124 insertions(+), 10 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index e0ae857..7f562df 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -7,12 +7,30 @@ methods. .. currentmodule:: precellar +SeqSpec +~~~~~~~ + .. autosummary:: :toctree: _autosummary SeqSpec + SeqSpec.add_read + SeqSpec.to_yaml + +Core functions +~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: _autosummary make_genome_index align make_fragment +Utilities +~~~~~~~~~ + +.. autosummary:: + :toctree: _autosummary + + utils.strip_barcode_from_fastq \ No newline at end of file diff --git a/precellar/src/align.rs b/precellar/src/align.rs index 184270e..e5f5ca1 100644 --- a/precellar/src/align.rs +++ b/precellar/src/align.rs @@ -216,7 +216,8 @@ impl FastqProcessor { let mut whitelist = self.get_whitelist()?; let (read, index) = self.assay.get_index_of(modality).into_iter() - .find(|(_, index)| index.into_iter().any(|x| x.0.region_type.is_barcode())).unwrap(); + .find(|(_, index)| index.into_iter().any(|x| x.0.region_type.is_barcode())) + .expect("No barcode region found"); let range = index.into_iter().find(|x| x.0.region_type.is_barcode()).unwrap().1; crate::io::read_fastq(read, &self.base_dir).records().for_each(|record| { diff --git a/precellar/src/io.rs b/precellar/src/io.rs index 260009f..1a61815 100644 --- a/precellar/src/io.rs +++ b/precellar/src/io.rs @@ -92,6 +92,13 @@ pub fn read_fastq>(read: &seqspec::Read, base_dir: P) -> fastq::R fastq::Reader::new(BufReader::new(reader)) } +pub fn get_read_length>(read: &seqspec::Read, base_dir: P) -> Result { + let mut reader = read_fastq(read, base_dir); + let mut record = fastq::Record::default(); + reader.read_record(&mut record)?; + Ok(record.sequence().len()) +} + pub fn read_onlist(onlist: &seqspec::Onlist) -> Result> { let cache = Cache::new()?; let file = cache.cached_path(&onlist.url)?; diff --git a/python/src/pyseqspec.rs b/python/src/pyseqspec.rs index ae886e7..0dd8592 100644 --- a/python/src/pyseqspec.rs +++ b/python/src/pyseqspec.rs @@ -1,4 +1,4 @@ -use std::{path::PathBuf, str::FromStr}; +use std::{collections::HashMap, path::PathBuf, str::FromStr}; use pyo3::prelude::*; use seqspec::{Assay, File, Modality, Read, Region, Strand, UrlType}; @@ -53,9 +53,13 @@ impl SeqSpec { /// Whether the read is reverse. /// fastq: Path | list[Path] /// The path to the fastq file containing the reads. + /// min_len: int + /// The minimum length of the read. If not provided, the minimum length is inferred from the fastq file. + /// max_len: int + /// The maximum length of the read. If not provided, the maximum length is inferred from the fastq file. #[pyo3( - signature = (read_id, *, modality, primer_id, is_reverse, fastq), - text_signature = "($self, read_id, *, modality, primer_id, is_reverse, fastq)", + signature = (read_id, *, modality, primer_id, is_reverse, fastq, min_len=None, max_len=None), + text_signature = "($self, read_id, *, modality, primer_id, is_reverse, fastq, min_len=None, max_len=None)", )] pub fn add_read( &mut self, @@ -64,6 +68,8 @@ impl SeqSpec { primer_id: &str, is_reverse: bool, fastq: Bound<'_, PyAny>, + min_len: Option, + max_len: Option, ) -> Result<()> { let fastq = if fastq.is_instance_of::() { fastq.extract::>()? @@ -82,10 +88,20 @@ impl SeqSpec { } else { read.strand = Strand::Pos; } + read.read_id = read_id.to_string(); read.modality = Modality::from_str(modality)?; read.primer_id = primer_id.to_string(); read.files = Some(fastq.into_iter().map(|path| make_file_path(path)).collect::>>()?); + if min_len.is_none() || max_len.is_none() { + let len = precellar::io::get_read_length(&read, "./")?; + read.min_len = min_len.unwrap_or(len) as u32; + read.max_len = max_len.unwrap_or(len) as u32; + } else { + read.min_len = min_len.unwrap() as u32; + read.max_len = max_len.unwrap() as u32; + } + reads.push(read); assay.sequence_spec = Some(reads); Ok(()) @@ -112,16 +128,55 @@ impl SeqSpec { fn __repr__(&self) -> String { let assay = &self.0; - let tree = Tree::new("".to_string()) - .with_leaves(assay.library_spec.as_ref().unwrap_or(&Vec::new()).iter().map(|region| build_tree(region))); + let mut read_list = HashMap::new(); + if let Some(reads) = assay.sequence_spec.as_ref() { + for read in reads { + read_list.entry(read.primer_id.clone()).or_insert(Vec::new()).push(read); + } + } + + let tree = Tree::new("".to_string()).with_leaves( + assay.library_spec.as_ref() + .unwrap_or(&Vec::new()).iter() + .map(|region| build_tree(region, &read_list)) + ); format!("{}", tree) } + + fn __str__(&self) -> String { + self.__repr__() + } } -fn build_tree(region: &Region) -> Tree { - Tree::new(region.region_id.clone()) +fn build_tree(region: &Region, read_list: &HashMap>) -> Tree { + let id = ®ion.region_id; + let len = if region.min_len == region.max_len { + region.min_len.to_string() + } else { + format!("{}-{}", region.min_len, region.max_len) + }; + let label = if let Some(reads) = read_list.get(id) { + let s = reads.iter().map(|r| format_read(r)).collect::>().join(", "); + format!("{}({}) [{}]", id, len, s) + } else { + format!("{}({})", id, len) + }; + Tree::new(label) .with_leaves(region.regions.as_ref().unwrap_or(&Vec::new()) - .iter().map(|child| build_tree(child))) + .iter().map(|child| build_tree(child, read_list))) +} + +fn format_read(read: &Read) -> String { + let len = if read.min_len == read.max_len { + read.min_len.to_string() + } else { + format!("{}-{}", read.min_len, read.max_len) + }; + if read.is_reverse() { + format!("↑{}({})", read.read_id, len) + } else { + format!("↓{}({})", read.read_id, len) + } } fn make_file_path(path: PathBuf) -> Result { diff --git a/python/src/utils.rs b/python/src/utils.rs index 6c2e9f6..b52c000 100644 --- a/python/src/utils.rs +++ b/python/src/utils.rs @@ -5,6 +5,39 @@ use anyhow::Result; use pyo3::prelude::*; use regex::Regex; +/// Remove barcode from the read names of fastq records. +/// +/// The old practice of storing barcodes in read names is not recommended. This +/// function extracts barcodes from read names using regular expressions and +/// writes them to a separate fastq file. +/// +/// Parameters +/// ---------- +/// in_fq: Path +/// File path to the input fastq file. +/// out_fq: Path +/// File path to the output fastq file. +/// regex: str +/// Extract barcodes from read names of BAM records using regular expressions. +/// Reguler expressions should contain exactly one capturing group +/// (Parentheses group the regex between them) that matches +/// the barcodes. For example, `barcode_regex="(..:..:..:..):\\\\w+$"` +/// extracts `bd:69:Y6:10` from +/// `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`. +/// You can test your regex on this website: https://regex101.com/. +/// out_barcode: Path | None +/// File path to the output fastq file containing the extracted barcodes. +/// If None, the barcodes will not be written to a file. +/// left_add: int +/// Additional bases to strip from the left side of the barcode. +/// right_add: int +/// Additional bases to strip from the right side of the barcode. +/// compression: str | None +/// Compression algorithm to use. If None, the compression algorithm will be inferred from the file extension. +/// compression_level: int | None +/// Compression level to use. +/// num_threads: int +/// The number of threads to use. #[pyfunction] #[pyo3( signature = (in_fq, out_fq, diff --git a/seqspec_templates/10x_rna_atac.yaml b/seqspec_templates/10x_rna_atac.yaml index 7ba507e..601149c 100644 --- a/seqspec_templates/10x_rna_atac.yaml +++ b/seqspec_templates/10x_rna_atac.yaml @@ -187,7 +187,7 @@ library_spec: regions: null - !Region parent_id: atac - region_id: gDNA + region_id: atac-gDNA region_type: gdna name: gDNA sequence_type: random