Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Sep 30, 2024
1 parent 7487d84 commit 90b188a
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 10 deletions.
18 changes: 18 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,30 @@ methods.

.. currentmodule:: precellar

SeqSpec
~~~~~~~

.. autosummary::
:toctree: _autosummary

SeqSpec
SeqSpec.add_read
SeqSpec.to_yaml

Core functions
~~~~~~~~~~~~~~

.. autosummary::
:toctree: _autosummary

make_genome_index
align
make_fragment

Utilities
~~~~~~~~~

.. autosummary::
:toctree: _autosummary

utils.strip_barcode_from_fastq
3 changes: 2 additions & 1 deletion precellar/src/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,8 @@ impl<A: Alinger> FastqProcessor<A> {
let mut whitelist = self.get_whitelist()?;

let (read, index) = self.assay.get_index_of(modality).into_iter()
.find(|(_, index)| index.into_iter().any(|x| x.0.region_type.is_barcode())).unwrap();
.find(|(_, index)| index.into_iter().any(|x| x.0.region_type.is_barcode()))
.expect("No barcode region found");
let range = index.into_iter().find(|x| x.0.region_type.is_barcode()).unwrap().1;

crate::io::read_fastq(read, &self.base_dir).records().for_each(|record| {
Expand Down
7 changes: 7 additions & 0 deletions precellar/src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ pub fn read_fastq<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> fastq::R
fastq::Reader::new(BufReader::new(reader))
}

pub fn get_read_length<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> Result<usize> {
let mut reader = read_fastq(read, base_dir);
let mut record = fastq::Record::default();
reader.read_record(&mut record)?;
Ok(record.sequence().len())
}

pub fn read_onlist(onlist: &seqspec::Onlist) -> Result<Vec<String>> {
let cache = Cache::new()?;
let file = cache.cached_path(&onlist.url)?;
Expand Down
71 changes: 63 additions & 8 deletions python/src/pyseqspec.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{path::PathBuf, str::FromStr};
use std::{collections::HashMap, path::PathBuf, str::FromStr};

use pyo3::prelude::*;
use seqspec::{Assay, File, Modality, Read, Region, Strand, UrlType};
Expand Down Expand Up @@ -53,9 +53,13 @@ impl SeqSpec {
/// Whether the read is reverse.
/// fastq: Path | list[Path]
/// The path to the fastq file containing the reads.
/// min_len: int
/// The minimum length of the read. If not provided, the minimum length is inferred from the fastq file.
/// max_len: int
/// The maximum length of the read. If not provided, the maximum length is inferred from the fastq file.
#[pyo3(
signature = (read_id, *, modality, primer_id, is_reverse, fastq),
text_signature = "($self, read_id, *, modality, primer_id, is_reverse, fastq)",
signature = (read_id, *, modality, primer_id, is_reverse, fastq, min_len=None, max_len=None),
text_signature = "($self, read_id, *, modality, primer_id, is_reverse, fastq, min_len=None, max_len=None)",
)]
pub fn add_read(
&mut self,
Expand All @@ -64,6 +68,8 @@ impl SeqSpec {
primer_id: &str,
is_reverse: bool,
fastq: Bound<'_, PyAny>,
min_len: Option<usize>,
max_len: Option<usize>,
) -> Result<()> {
let fastq = if fastq.is_instance_of::<pyo3::types::PyList>() {
fastq.extract::<Vec<PathBuf>>()?
Expand All @@ -82,10 +88,20 @@ impl SeqSpec {
} else {
read.strand = Strand::Pos;
}
read.read_id = read_id.to_string();
read.modality = Modality::from_str(modality)?;
read.primer_id = primer_id.to_string();
read.files = Some(fastq.into_iter().map(|path| make_file_path(path)).collect::<Result<Vec<File>>>()?);

if min_len.is_none() || max_len.is_none() {
let len = precellar::io::get_read_length(&read, "./")?;
read.min_len = min_len.unwrap_or(len) as u32;
read.max_len = max_len.unwrap_or(len) as u32;
} else {
read.min_len = min_len.unwrap() as u32;
read.max_len = max_len.unwrap() as u32;
}

reads.push(read);
assay.sequence_spec = Some(reads);
Ok(())
Expand All @@ -112,16 +128,55 @@ impl SeqSpec {

fn __repr__(&self) -> String {
let assay = &self.0;
let tree = Tree::new("".to_string())
.with_leaves(assay.library_spec.as_ref().unwrap_or(&Vec::new()).iter().map(|region| build_tree(region)));
let mut read_list = HashMap::new();
if let Some(reads) = assay.sequence_spec.as_ref() {
for read in reads {
read_list.entry(read.primer_id.clone()).or_insert(Vec::new()).push(read);
}
}

let tree = Tree::new("".to_string()).with_leaves(
assay.library_spec.as_ref()
.unwrap_or(&Vec::new()).iter()
.map(|region| build_tree(region, &read_list))
);
format!("{}", tree)
}

fn __str__(&self) -> String {
self.__repr__()
}
}

fn build_tree(region: &Region) -> Tree<String> {
Tree::new(region.region_id.clone())
fn build_tree(region: &Region, read_list: &HashMap<String, Vec<&Read>>) -> Tree<String> {
let id = &region.region_id;
let len = if region.min_len == region.max_len {
region.min_len.to_string()
} else {
format!("{}-{}", region.min_len, region.max_len)
};
let label = if let Some(reads) = read_list.get(id) {
let s = reads.iter().map(|r| format_read(r)).collect::<Vec<String>>().join(", ");
format!("{}({}) [{}]", id, len, s)
} else {
format!("{}({})", id, len)
};
Tree::new(label)
.with_leaves(region.regions.as_ref().unwrap_or(&Vec::new())
.iter().map(|child| build_tree(child)))
.iter().map(|child| build_tree(child, read_list)))
}

fn format_read(read: &Read) -> String {
let len = if read.min_len == read.max_len {
read.min_len.to_string()
} else {
format!("{}-{}", read.min_len, read.max_len)
};
if read.is_reverse() {
format!("↑{}({})", read.read_id, len)
} else {
format!("↓{}({})", read.read_id, len)
}
}

fn make_file_path(path: PathBuf) -> Result<File> {
Expand Down
33 changes: 33 additions & 0 deletions python/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,39 @@ use anyhow::Result;
use pyo3::prelude::*;
use regex::Regex;

/// Remove barcode from the read names of fastq records.
///
/// The old practice of storing barcodes in read names is not recommended. This
/// function extracts barcodes from read names using regular expressions and
/// writes them to a separate fastq file.
///
/// Parameters
/// ----------
/// in_fq: Path
/// File path to the input fastq file.
/// out_fq: Path
/// File path to the output fastq file.
/// regex: str
/// Extract barcodes from read names of BAM records using regular expressions.
/// Reguler expressions should contain exactly one capturing group
/// (Parentheses group the regex between them) that matches
/// the barcodes. For example, `barcode_regex="(..:..:..:..):\\\\w+$"`
/// extracts `bd:69:Y6:10` from
/// `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`.
/// You can test your regex on this website: https://regex101.com/.
/// out_barcode: Path | None
/// File path to the output fastq file containing the extracted barcodes.
/// If None, the barcodes will not be written to a file.
/// left_add: int
/// Additional bases to strip from the left side of the barcode.
/// right_add: int
/// Additional bases to strip from the right side of the barcode.
/// compression: str | None
/// Compression algorithm to use. If None, the compression algorithm will be inferred from the file extension.
/// compression_level: int | None
/// Compression level to use.
/// num_threads: int
/// The number of threads to use.
#[pyfunction]
#[pyo3(
signature = (in_fq, out_fq,
Expand Down
2 changes: 1 addition & 1 deletion seqspec_templates/10x_rna_atac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ library_spec:
regions: null
- !Region
parent_id: atac
region_id: gDNA
region_id: atac-gDNA
region_type: gdna
name: gDNA
sequence_type: random
Expand Down

0 comments on commit 90b188a

Please sign in to comment.