From d4d4883c2e8daaa8f8ee82488974b050ce642424 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 1 Oct 2024 22:50:35 +0800 Subject: [PATCH] rename add_read to update_read --- precellar/src/align.rs | 2 +- precellar/tests/data/human_brain_3k/spec.yaml | 385 ------------------ python/src/pyseqspec.rs | 99 +++-- seqspec/src/lib.rs | 43 +- seqspec_templates/10x_rna_atac.yaml | 58 ++- seqspec_templates/generic_atac.yaml | 26 +- 6 files changed, 176 insertions(+), 437 deletions(-) delete mode 100644 precellar/tests/data/human_brain_3k/spec.yaml diff --git a/precellar/src/align.rs b/precellar/src/align.rs index e5f5ca1..aa5fb4d 100644 --- a/precellar/src/align.rs +++ b/precellar/src/align.rs @@ -485,7 +485,7 @@ mod tests { PairedEndStats::default() ); let mut processor = FastqProcessor::new(spec, aligner) - .with_modality(Modality::Atac); + .with_modality(Modality::ATAC); processor.gen_barcoded_alignments().take(6).for_each(|x| { println!("{:?}", x); diff --git a/precellar/tests/data/human_brain_3k/spec.yaml b/precellar/tests/data/human_brain_3k/spec.yaml deleted file mode 100644 index d6a9399..0000000 --- a/precellar/tests/data/human_brain_3k/spec.yaml +++ /dev/null @@ -1,385 +0,0 @@ -!Assay -seqspec_version: 0.0.0 -assay_id: 10xMultiome -name: 10x-ATAC-RNA -doi: https://www.globenewswire.com/en/news-release/2020/09/15/2093690/0/en/10x-Genomics-First-to-Market-With-Product-to-Simultaneously-Capture-Epigenome-and-Transcriptome.html -date: 15 September 2020 -description: Single Cell Multiome ATAC + Gene Expression -modalities: -- rna -- atac -lib_struct: https://teichlab.github.io/scg_lib_structs/methods_html/10xChromium_multiome.html -library_protocol: null -library_kit: null -sequence_protocol: Illumina -sequence_kit: null -sequence_spec: -- !Read - read_id: atac/R1.fastq.gz - name: Read 1 - modality: atac - primer_id: Read_1 - min_len: 150 - max_len: 150 - strand: pos -- !Read - read_id: atac/R2.fastq.gz - name: Index - modality: atac - primer_id: Index_2 - min_len: 150 - max_len: 150 - strand: pos -- !Read - read_id: atac/R3.fastq.gz - name: Read 2 - modality: atac - primer_id: Read_2 - min_len: 150 - max_len: 150 - strand: neg - -library_spec: -- !Region - region_id: rna - region_type: rna - name: RNA - sequence_type: joined - sequence: AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNXAGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG - min_len: 134 - max_len: 231 - onlist: null - regions: - - !Region - region_id: rna_illumina_p5 - region_type: illumina_p5 - name: Illumina P5 - sequence_type: fixed - sequence: AATGATACGGCGACCACCGAGATCTACAC - min_len: 29 - max_len: 29 - onlist: null - regions: null - parent_id: rna - - !Region - region_id: truseq_read1 - region_type: truseq_read1 - name: Truseq Read 1 - sequence_type: fixed - sequence: TCTTTCCCTACACGACGCTCTTCCGATCT - min_len: 10 - max_len: 10 - onlist: null - regions: null - parent_id: rna - - !Region - region_id: R1.fastq.gz - region_type: fastq - name: Read 1 FASTQ - sequence_type: joined - sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNN - min_len: 28 - max_len: 28 - onlist: null - regions: - - !Region - region_id: barcode - region_type: barcode - name: Cell Barcode - sequence_type: onlist - sequence: NNNNNNNNNNNNNNNN - min_len: 16 - max_len: 16 - onlist: !Onlist - filename: https://teichlab.github.io/scg_lib_structs/data/10X-Genomics/gex_737K-arc-v1.txt.gz - md5: null - location: remote - regions: null - parent_id: R1.fastq.gz - - !Region - region_id: umi - region_type: umi - name: UMI - sequence_type: random - sequence: NNNNNNNNNNNN - min_len: 12 - max_len: 12 - onlist: null - regions: null - parent_id: R1.fastq.gz - parent_id: rna - - !Region - region_id: R2.fastq.gz - region_type: fastq - name: Read 2 FASTQ - sequence_type: joined - sequence: X - min_len: 1 - max_len: 98 - onlist: null - regions: - - !Region - region_id: cdna - region_type: cdna - name: cdna - sequence_type: random - sequence: X - min_len: 1 - max_len: 98 - onlist: null - regions: null - parent_id: R2.fastq.gz - parent_id: rna - - !Region - prder: 4 - region_id: truseq_read2 - region_type: truseq_read2 - name: Truseq Read 2 - sequence_type: fixed - sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC - min_len: 34 - max_len: 34 - onlist: null - regions: null - parent_id: rna - - !Region - region_id: I1.fastq.gz - region_type: fastq - name: Index Read 2 - sequence_type: joined - sequence: NNNNNNNN - min_len: 8 - max_len: 8 - onlist: null - regions: - - !Region - region_id: rna_index7 - region_type: index7 - name: Truseq Read 2 - sequence_type: onlist - sequence: NNNNNNNN - min_len: 8 - max_len: 8 - onlist: !Onlist - filename: index7_onlist.txt - md5: null - location: local - regions: null - parent_id: I1.fastq.gz - parent_id: rna - - !Region - region_id: rna_illumina_p7 - region_type: illumina_p7 - name: Illumina P7 - sequence_type: fixed - sequence: ATCTCGTATGCCGTCTTCTGCTTG - min_len: 24 - max_len: 24 - onlist: null - regions: null - parent_id: rna - parent_id: null -- !Region - region_id: atac - region_type: atac - name: ATAC - sequence_type: joined - sequence: AATGATACGGCGACCACCGAGATCTACACNNNNNNNNNNNNNNNNCGCGTCTGTCGTCGGCAGCGTCAGATGTGTATAAGAGACAGXXCTGTCTCTTATACACATCTCCGAGCCCACGAGACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG - min_len: 154 - max_len: 348 - onlist: null - regions: - - !Region - region_id: atac_illumina_p5 - region_type: illumina_p5 - name: Illumina P5 - sequence_type: random - sequence: AATGATACGGCGACCACCGAGATCTACAC - min_len: 29 - max_len: 29 - onlist: null - regions: null - parent_id: atac - - !Region - region_id: Index_2 - region_type: fastq - name: Index 2 - sequence_type: joined - sequence: NNNNNNNNNNNNNNNN - min_len: 16 - max_len: 16 - onlist: null - parent_id: atac - regions: - - !Region - region_id: cell_bc - region_type: barcode - name: Cell Barcode - sequence_type: onlist - sequence: NNNNNNNNNNNNNNNN - min_len: 16 - max_len: 16 - onlist: !Onlist - filename: https://teichlab.github.io/scg_lib_structs/data/10X-Genomics/atac_737K-arc-v1.txt.gz - md5: null - location: remote - regions: null - parent_id: Index_2 - - !Region - region_id: linker - region_type: linker - name: linker - sequence_type: fixed - sequence: CGCGTCTG - min_len: 8 - max_len: 8 - onlist: null - regions: null - parent_id: atac - - !Region - region_id: nextera_read1 - region_type: nextera_read1 - name: nextera_read1 - sequence_type: joined - sequence: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG - min_len: 33 - max_len: 33 - onlist: null - regions: - - !Region - region_id: s5 - region_type: s5 - name: s5 - sequence_type: fixed - sequence: TCGTCGGCAGCGTC - min_len: 14 - max_len: 14 - onlist: null - regions: null - parent_id: nextera_read1 - - !Region - region_id: ME1 - region_type: ME1 - name: ME1 - sequence_type: fixed - sequence: AGATGTGTATAAGAGACAG - min_len: 19 - max_len: 19 - onlist: null - regions: null - parent_id: nextera_read1 - parent_id: atac - - !Region - region_id: Read_1 - region_type: fastq - name: Read 1 FASTQ - sequence_type: joined - sequence: X - min_len: 1 - max_len: 98 - onlist: null - regions: - - !Region - region_id: gdna-1 - region_type: gdna - name: gdna 1 - sequence_type: random - sequence: X - min_len: 1 - max_len: 98 - onlist: null - regions: null - parent_id: Read_1 - parent_id: atac - - !Region - region_id: Read_2 - region_type: fastq - name: Read 2 FASTQ - sequence_type: joined - sequence: X - min_len: 1 - max_len: 98 - onlist: null - regions: - - !Region - region_id: gdna-2 - region_type: gdna - name: gdna 2 - sequence_type: random - sequence: X - min_len: 1 - max_len: 98 - onlist: null - regions: null - parent_id: Read_2 - parent_id: atac - - !Region - region_id: nextera_read2 - region_type: nextera_read2 - name: nextera_read2 - sequence_type: joined - sequence: CTGTCTCTTATACACATCTCCGAGCCCACGAGAC - min_len: 34 - max_len: 34 - onlist: null - regions: - - !Region - region_id: ME2 - region_type: ME2 - name: ME2 - sequence_type: fixed - sequence: CTGTCTCTTATACACATCT - min_len: 19 - max_len: 19 - onlist: null - regions: null - parent_id: nextera_read2 - - !Region - region_id: s7 - region_type: s7 - name: s7 - sequence_type: fixed - sequence: CCGAGCCCACGAGAC - min_len: 15 - max_len: 15 - onlist: null - regions: null - parent_id: nextera_read2 - parent_id: atac - - !Region - region_id: Index_1 - region_type: fastq - name: Index 1 FASTQ - sequence_type: joined - sequence: NNNNNNNN - min_len: 8 - max_len: 8 - onlist: null - regions: - - !Region - region_id: atac_index7 - region_type: index7 - name: index7 - sequence_type: onlist - sequence: NNNNNNNN - min_len: 8 - max_len: 8 - onlist: !Onlist - filename: index7_onlist.txt - md5: null - location: local - regions: null - parent_id: Index_1 - parent_id: atac - - !Region - region_id: atac_illumina_p7 - region_type: illumina_p7 - name: Illumina P7 - sequence_type: fixed - sequence: ATCTCGTATGCCGTCTTCTGCTTG - min_len: 24 - max_len: 24 - onlist: null - regions: null - parent_id: atac - parent_id: null diff --git a/python/src/pyseqspec.rs b/python/src/pyseqspec.rs index 0147879..9b42467 100644 --- a/python/src/pyseqspec.rs +++ b/python/src/pyseqspec.rs @@ -44,61 +44,77 @@ impl SeqSpec { Ok(SeqSpec(assay)) } - /// Add a fastq file containing reads to the AnnData object. + /// Update read information in the SeqSpec object. + /// + /// This method updates the read information in the SeqSpec object. + /// If the read does not exist, it will be created. /// /// Parameters /// ---------- /// read_id: str /// The id of the read. - /// modality: str + /// modality: str | None /// The modality of the read. - /// primer_id: str + /// primer_id: str | None /// The id of the primer. - /// is_reverse: bool + /// is_reverse: bool | None /// Whether the read is reverse. - /// fastq: Path | list[Path] + /// fastq: Path | list[Path] | None /// The path to the fastq file containing the reads. - /// min_len: int + /// min_len: int | None /// The minimum length of the read. If not provided, the minimum length is inferred from the fastq file. - /// max_len: int + /// max_len: int | None /// The maximum length of the read. If not provided, the maximum length is inferred from the fastq file. #[pyo3( - signature = (read_id, *, modality, primer_id, is_reverse, fastq, min_len=None, max_len=None), - text_signature = "($self, read_id, *, modality, primer_id, is_reverse, fastq, min_len=None, max_len=None)", + signature = (read_id, *, modality=None, primer_id=None, is_reverse=None, fastq=None, min_len=None, max_len=None), + text_signature = "($self, read_id, *, modality=None, primer_id=None, is_reverse=None, fastq=None, min_len=None, max_len=None)", )] - pub fn add_read( + pub fn update_read( &mut self, read_id: &str, - modality: &str, - primer_id: &str, - is_reverse: bool, - fastq: Bound<'_, PyAny>, + modality: Option<&str>, + primer_id: Option<&str>, + is_reverse: Option, + fastq: Option>, min_len: Option, max_len: Option, ) -> Result<()> { - let fastq = if fastq.is_instance_of::() { - fastq.extract::>()? + let mut all_reads = self.0.sequence_spec.take().unwrap_or(Vec::new()); + let mut read_exist = false; + let mut read_buffer = Read::default(); + read_buffer.read_id = read_id.to_string(); + let read = if let Some(r) = all_reads.iter_mut().find(|r| r.read_id == read_id) { + read_exist = true; + r } else { - vec![fastq.extract::()?] + &mut read_buffer }; + if !read_exist { + assert!(modality.is_some(), "modality must be provided for a new read"); + assert!(primer_id.is_some(), "primer_id must be provided for a new read"); + assert!(is_reverse.is_some(), "is_reverse must be provided for a new read"); + } - let assay = &mut self.0; - - let mut reads = assay.sequence_spec.take().unwrap_or(Vec::new()); - reads = reads.into_iter().filter(|r| r.read_id != read_id).collect(); - - let mut read = Read::default(); - if is_reverse { - read.strand = Strand::Neg; - } else { - read.strand = Strand::Pos; + if let Some(rev) = is_reverse { + read.strand = if rev { Strand::Neg } else { Strand::Pos }; } - read.read_id = read_id.to_string(); - read.modality = Modality::from_str(modality)?; - read.primer_id = primer_id.to_string(); - read.files = Some(fastq.into_iter().map(|path| make_file_path(&path)).collect::>>()?); + if let Some(modality) = modality { + read.modality = Modality::from_str(modality)?; + } + if let Some(primer_id) = primer_id { + read.primer_id = primer_id.to_string(); + } + if let Some(fastq) = fastq { + let fastq = if fastq.is_instance_of::() { + fastq.extract::>()? + } else { + vec![fastq.extract::()?] + }; + read.files = Some(fastq.into_iter().map(|path| make_file_path(&path)).collect::>>()?); + } + - if min_len.is_none() || max_len.is_none() { + if (min_len.is_none() || max_len.is_none()) && read.files.is_some() { let len = precellar::io::get_read_length(&read, "./")?; read.min_len = min_len.unwrap_or(len) as u32; read.max_len = max_len.unwrap_or(len) as u32; @@ -107,8 +123,23 @@ impl SeqSpec { read.max_len = max_len.unwrap() as u32; } - reads.push(read); - assay.sequence_spec = Some(reads); + if !read_exist { + all_reads.push(read_buffer); + } + self.0.sequence_spec = Some(all_reads); + + Ok(()) + } + + /// Delete a read from the SeqSpec object. + #[pyo3(signature = (read_id), text_signature = "($self, read_id)")] + pub fn delete_read(&mut self, read_id: &str) -> Result<()> { + let reads = self.0.sequence_spec.take(); + if let Some(r) = reads { + self.0.sequence_spec = Some( + r.into_iter().filter(|r| r.read_id != read_id).collect::>() + ); + } Ok(()) } diff --git a/seqspec/src/lib.rs b/seqspec/src/lib.rs index a29ef22..7921de5 100644 --- a/seqspec/src/lib.rs +++ b/seqspec/src/lib.rs @@ -77,27 +77,40 @@ impl Assay { } } -#[derive(Deserialize, Serialize, Debug, Copy, Clone, PartialEq, Eq, Hash)] +#[derive(Serialize, Debug, Copy, Clone, PartialEq, Eq, Hash)] #[serde(rename_all = "lowercase")] pub enum Modality { - Dna, - Rna, + DNA, + RNA, Tag, Protein, - Atac, + ATAC, Crispr, } +impl<'de> Deserialize<'de> for Modality { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let value = Value::deserialize(deserializer)?; + match value { + Value::String(s) => Modality::from_str(&s).map_err(serde::de::Error::custom), + _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), + } + } +} + impl FromStr for Modality { type Err = anyhow::Error; fn from_str(s: &str) -> Result { - match s { - "dna" => Ok(Modality::Dna), - "rna" => Ok(Modality::Rna), + match s.to_lowercase().as_str() { + "dna" => Ok(Modality::DNA), + "rna" => Ok(Modality::RNA), "tag" => Ok(Modality::Tag), "protein" => Ok(Modality::Protein), - "atac" => Ok(Modality::Atac), + "atac" => Ok(Modality::ATAC), "crispr" => Ok(Modality::Crispr), _ => bail!("Invalid modality: {}", s), } @@ -130,7 +143,7 @@ impl<'de> Deserialize<'de> for LibraryProtocol { .collect::>(); Ok(LibraryProtocol::Custom(items)) } - _ => Err(serde::de::Error::custom("invalid value")), + _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), } } } @@ -173,7 +186,7 @@ impl<'de> Deserialize<'de> for LibraryKit { .collect::>(); Ok(LibraryKit::Custom(items)) } - _ => Err(serde::de::Error::custom("invalid value")), + _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), } } } @@ -209,7 +222,7 @@ impl <'de> Deserialize<'de> for SequenceProtocol { .collect::>(); Ok(SequenceProtocol::Custom(items)) } - _ => Err(serde::de::Error::custom("invalid value")), + _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), } } } @@ -245,7 +258,7 @@ impl<'de> Deserialize<'de> for SequenceKit { .collect::>(); Ok(SequenceKit::Custom(items)) } - _ => Err(serde::de::Error::custom("invalid value")), + _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), } } } @@ -279,7 +292,7 @@ impl Default for Read { Self { read_id: "".to_string(), name: None, - modality: Modality::Dna, + modality: Modality::DNA, primer_id: "".to_string(), min_len: 0, max_len: 0, @@ -566,10 +579,10 @@ mod tests { let yaml_str = fs::read_to_string("tests/data/spec.yaml").expect("Failed to read file"); let assay: Assay = serde_yaml::from_str(&yaml_str).expect("Failed to parse YAML"); - for (read, regions) in assay.get_index_of(Modality::Rna) { + for (read, regions) in assay.get_index_of(Modality::RNA) { println!("{}: {:?}", read.read_id, regions.into_iter().map(|x| (x.0.region_type, x.1)).collect::>()); } - for (read, regions) in assay.get_index_of(Modality::Atac) { + for (read, regions) in assay.get_index_of(Modality::ATAC) { println!("{}: {:?}", read.read_id, regions.into_iter().map(|x| (x.0.region_type, x.1)).collect::>()); } for (read, regions) in assay.get_index_of(Modality::Protein) { diff --git a/seqspec_templates/10x_rna_atac.yaml b/seqspec_templates/10x_rna_atac.yaml index 0e98753..6946757 100644 --- a/seqspec_templates/10x_rna_atac.yaml +++ b/seqspec_templates/10x_rna_atac.yaml @@ -13,7 +13,63 @@ library_protocol: Custom library_kit: Illumina Truseq Dual Index sequence_protocol: Illumina NovaSeq 6000 sequence_kit: NovaSeq 6000 v1.5 -sequence_spec: [] +sequence_spec: +- !Read + read_id: rna-R1 + name: RNA Read 1 + modality: rna + primer_id: rna-truseq_read1 + min_len: 28 + max_len: 28 + strand: pos +- !Read + read_id: rna-I1 + name: RNA Index 1 (i7 index) + modality: rna + primer_id: rna-truseq_read2 + min_len: 8 + max_len: 8 + strand: pos +- !Read + read_id: rna-R2 + name: RNA Read 2 + modality: rna + primer_id: rna-truseq_read2 + min_len: 1 + max_len: 98 + strand: neg +- !Read + read_id: atac-R1 + name: ATAC Read 1 + modality: atac + primer_id: atac-nextera_read1 + min_len: 1 + max_len: 98 + strand: pos +- !Read + read_id: atac-I1 + name: ATAC Index 1 (i7 index) + modality: atac + primer_id: atac-nextera_read2 + min_len: 8 + max_len: 8 + strand: pos +- !Read + read_id: atac-I2 + name: ATAC Index 2 (i5 index) + modality: atac + primer_id: atac-nextera_read1 + min_len: 24 + max_len: 24 + strand: neg +- !Read + read_id: atac-R2 + name: ATAC Read 2 + modality: atac + primer_id: atac-nextera_read2 + min_len: 1 + max_len: 98 + strand: neg library_spec: - !Region parent_id: null diff --git a/seqspec_templates/generic_atac.yaml b/seqspec_templates/generic_atac.yaml index d2716a7..951ca1f 100644 --- a/seqspec_templates/generic_atac.yaml +++ b/seqspec_templates/generic_atac.yaml @@ -12,7 +12,31 @@ library_protocol: single-cell ATAC-seq (OBI:0002764) library_kit: Not-Specified sequence_protocol: Custom sequence_kit: Custom -sequence_spec: [] +sequence_spec: +- !Read + read_id: R1 + name: Read 1 + modality: atac + primer_id: atac-read1 + min_len: 1 + max_len: 98 + strand: pos +- !Read + read_id: R2 + name: Read 2 + modality: atac + primer_id: atac-read2 + min_len: 1 + max_len: 98 + strand: neg +- !Read + read_id: I1 + name: Index + modality: atac + primer_id: atac-read2 + min_len: 22 + max_len: 22 + strand: pos library_spec: - !Region parent_id: null