Skip to content

Commit

Permalink
add read validator
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 5, 2024
1 parent add45dd commit 9f4c8d3
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 32 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test_python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ jobs:

build-wheel:
needs: build-and-test
if: ${{ startsWith(github.ref, 'refs/tags/') || contains(github.event.head_commit.message, '[wheel]') }}
uses: regulatory-genomics/precellar/.github/workflows/wheels.yml@main

publish:
Expand Down
4 changes: 2 additions & 2 deletions docs/tutorials/txg_atac.ipynb
Git LFS file not shown
7 changes: 4 additions & 3 deletions precellar/src/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,8 @@ impl<A: Alinger> FastqProcessor<A> {
pub fn gen_raw_fastq_records(&self) -> FastqRecords<impl BufRead> {
let modality = self.modality();
let data = self.assay.get_index_by_modality(modality).filter_map(|(read, index)| {
let regions: Vec<_> = index.index.into_iter().filter(|x| x.1.is_target()).collect();
let regions: Vec<_> = index.index.into_iter()
.filter(|x| x.1.is_barcode() || x.1.is_target()).collect();
if regions.is_empty() {
None
} else {
Expand Down Expand Up @@ -296,7 +297,7 @@ impl<R: BufRead> FastqRecords<R> {
let mut read1 = false;
let mut read2 = false;
self.0.iter().for_each(|x| {
x.subregion.iter().for_each(|(region_type, _)| if region_type.is_dna() {
x.subregion.iter().for_each(|(region_type, _)| if region_type.is_target() {
if x.is_reverse {
read1 = true;
} else {
Expand Down Expand Up @@ -346,7 +347,7 @@ impl<R: BufRead> Iterator for FastqRecords<R> {
} else {
barcode = Some(fq);
}
} else if region_type.is_dna() {
} else if region_type.is_target() {
if is_reverse {
read1 = Some(fq);
} else {
Expand Down
2 changes: 1 addition & 1 deletion python/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "precellar-py"
version = "0.1.1-dev1"
version = "0.1.1-dev2"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
25 changes: 25 additions & 0 deletions seqspec/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pub mod utils;
use log::warn;
use read::{ReadSpan, RegionIndex};
pub use read::{Read, File, UrlType, Strand};
use read::ReadValidator;
use region::LibSpec;
pub use region::{Region, RegionType, SequenceType, Onlist};

Expand Down Expand Up @@ -132,6 +133,30 @@ impl Assay {
If this is not the desired behavior, please adjust the region lengths.", read.read_id, id);
}
}
let mut validators: Vec<_> = index.index.iter().map(|(region_id, _, range)| {
let region = self.library_spec.get(region_id).unwrap();
ReadValidator::new(region)
.with_range(range.start as usize ..range.end as usize)
.with_strand(read.strand)
}).collect();

read.open("./").unwrap().records().take(500).try_for_each(|record| {
let record = record?;
for validator in &mut validators {
validator.validate(record.sequence())?;
}
anyhow::Ok(())
})?;
for validator in validators {
let percent_matched = validator.frac_matched() * 100.0;
if percent_matched < 5.0 {
bail!("Read '{}' failed validation for region '{}'. \
Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched);
} else if percent_matched < 50.0 {
warn!("Read '{}' has low percentage of matched bases for region '{}'. \
Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched);
}
}
}
Ok(())
}
Expand Down
83 changes: 82 additions & 1 deletion seqspec/src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use cached_path::Cache;
use noodles::fastq;
use indexmap::IndexMap;
use serde::{Deserialize, Serialize, Serializer};
use std::collections::HashSet;
use std::ops::{Deref, DerefMut};
use std::sync::Arc;
use std::{io::{BufRead, BufReader}, ops::Range, path::PathBuf};
Expand Down Expand Up @@ -202,7 +203,7 @@ pub enum ReadSpan {
MayReadThrough(String), // Read may be longer than the target region
}

#[derive(Deserialize, Serialize, Debug, Clone, PartialEq)]
#[derive(Deserialize, Serialize, Debug, Copy, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum Strand {
Pos,
Expand Down Expand Up @@ -266,4 +267,84 @@ pub enum UrlType {
Ftp,
Http,
Https,
}

pub(crate) struct ReadValidator<'a> {
region: &'a Region,
range: Option<Range<usize>>,
n_total: usize,
n_matched: usize,
onlist: Option<HashSet<String>>,
strand: Strand,
}

impl<'a> ReadValidator<'a> {
pub fn id(&self) -> &str {
&self.region.region_id
}

pub fn new(region: &'a Region) -> Self {
let onlist = if let Some(onlist) = &region.onlist {
Some(onlist.read().unwrap())
} else if region.sequence_type == SequenceType::Fixed {
Some(HashSet::from([region.sequence.clone()]))
} else {
None
};
Self {
region,
range: None,
n_total: 0,
n_matched: 0,
onlist,
strand: Strand::Pos,
}
}

pub fn with_range(mut self, range: Range<usize>) -> Self {
self.range = Some(range);
self
}

pub fn with_strand(mut self, strand: Strand) -> Self {
self.strand = strand;
self
}

pub fn frac_matched(&self) -> f64 {
self.n_matched as f64 / self.n_total as f64
}

pub fn validate(&mut self, seq: &[u8]) -> Result<()> {
self.n_total += 1;
let seq = if let Some(range) = &self.range {
&seq[range.clone()]
} else {
seq
};
if seq.len() < self.region.min_len as usize {
return Err(anyhow::anyhow!("Sequence too short: {}", seq.len()));
}
if seq.len() > self.region.max_len as usize {
return Err(anyhow::anyhow!("Sequence too long: {}", seq.len()));
}
if let Some(onlist) = &self.onlist {
match self.strand {
Strand::Neg => {
let seq = crate::utils::rev_compl(seq);
if onlist.contains(std::str::from_utf8(&seq)?) {
self.n_matched += 1;
}
},
Strand::Pos => {
if onlist.contains(std::str::from_utf8(seq)?) {
self.n_matched += 1;
}
},
}
} else {
self.n_matched += 1;
}
Ok(())
}
}
59 changes: 34 additions & 25 deletions seqspec/src/region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,49 +4,56 @@ use crate::read::UrlType;
use cached_path::Cache;
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use std::{io::BufRead, sync::Arc};
use std::{collections::{HashMap, HashSet}, io::BufRead, sync::Arc};
use anyhow::Result;

#[derive(Debug, Clone, PartialEq)]
pub struct LibSpec {
modalities: IndexMap<Modality, Arc<Region>>,
region_map: IndexMap<String, Arc<Region>>,
parent_map: IndexMap<String, Arc<Region>>,
parent_map: HashMap<String, Arc<Region>>,
region_map: HashMap<String, Arc<Region>>,
}

impl Serialize for LibSpec {
fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.region_map.values().collect::<Vec<_>>().serialize(serializer)
self.modalities.values().collect::<Vec<_>>().serialize(serializer)
}
}

impl<'de> Deserialize<'de> for LibSpec {
fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
let regions = Vec::<Region>::deserialize(deserializer)?;
Ok(LibSpec::new(regions))
Ok(LibSpec::new(regions).unwrap())
}
}

impl LibSpec {
pub fn new<I: IntoIterator<Item = Region>>(regions: I) -> Self {
/// Create a new library specification from top-level regions. The only allowed
/// region types are modality types.
pub fn new<I: IntoIterator<Item = Region>>(regions: I) -> Result<Self> {
let mut modalities = IndexMap::new();
let mut region_map = IndexMap::new();
let mut parent_map = IndexMap::new();
let mut region_map = HashMap::new();
let mut parent_map = HashMap::new();
for region in regions {
let region = Arc::new(region);
if region_map.insert(region.region_id.clone(), region.clone()).is_some() {
panic!("Duplicate region id: {}", region.region_id);
}
if let RegionType::Modality(modality) = region.region_type {
let region = Arc::new(region);
if modalities.insert(modality, region.clone()).is_some() {
panic!("Duplicate modality: {:?}", modality);
return Err(anyhow::anyhow!("Duplicate modality: {:?}", modality));
}
}
for subregion in region.subregions.iter() {
parent_map.insert(subregion.region_id.clone(), region.clone());
}
if region_map.insert(region.region_id.clone(), region.clone()).is_some() {
return Err(anyhow::anyhow!("Duplicate region id: {}", region.region_id));
}
for subregion in region.subregions.iter() {
if region_map.insert(subregion.region_id.clone(), subregion.clone()).is_some() {
return Err(anyhow::anyhow!("Duplicate region id: {}", subregion.region_id));
}
parent_map.insert(subregion.region_id.clone(), region.clone());
}
} else {
return Err(anyhow::anyhow!("Top-level regions must be modalities"));
};
}
Self { modalities, region_map, parent_map }
Ok(Self { modalities, region_map, parent_map })
}

/// Iterate over all regions with modality type in the library.
Expand Down Expand Up @@ -165,9 +172,11 @@ pub enum RegionType {
}

impl RegionType {
/// Either a barcode, UMI, or DNA/cDNA region.
pub fn is_target(&self) -> bool {
self.is_barcode() || self.is_umi() || self.is_dna()
pub fn is_modality(&self) -> bool {
match self {
RegionType::Modality(_) => true,
_ => false,
}
}

pub fn is_barcode(&self) -> bool {
Expand All @@ -184,8 +193,8 @@ impl RegionType {
}
}

/// Check if the region contains genomic sequences.
pub fn is_dna(&self) -> bool {
/// Check if the region contains region of interest (insertion).
pub fn is_target(&self) -> bool {
match self {
RegionType::Gdna | RegionType::Cdna => true,
_ => false,
Expand Down Expand Up @@ -225,7 +234,7 @@ pub struct Onlist {
}

impl Onlist {
pub fn read(&self) -> Result<Vec<String>> {
pub fn read(&self) -> Result<HashSet<String>> {
let cache = Cache::new()?;
let file = cache.cached_path(&self.url)?;
let reader = std::io::BufReader::new(crate::utils::open_file_for_read(file));
Expand All @@ -238,4 +247,4 @@ impl Onlist {
pub enum Location {
Local,
Remote,
}
}
10 changes: 10 additions & 0 deletions seqspec/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,14 @@ pub fn open_file_for_write<P: AsRef<Path>>(
},
};
Ok(writer)
}

pub fn rev_compl(seq: &[u8]) -> Vec<u8> {
seq.iter().rev().map(|&x| match x {
b'A' => b'T',
b'T' => b'A',
b'C' => b'G',
b'G' => b'C',
_ => x,
}).collect()
}

0 comments on commit 9f4c8d3

Please sign in to comment.