Skip to content

Commit

Permalink
Feature/variance (#16)
Browse files Browse the repository at this point in the history
* Add variance

* return pydict

* Naturally sort contigs

* Bump version 0.4.2

* Update Readme for 0.4.2
  • Loading branch information
Adoni5 authored Dec 8, 2023
1 parent b49759b commit 4419b43
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 16 deletions.
5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
[package]
name = "cnv_from_bam"
version = "0.4.1"
version = "0.4.2"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "cnv_from_bam"
crate-type = ["cdylib", "lib"]
crate-type = ["cdylib"]

[dependencies]
pyo3 = "0.20.0"
Expand All @@ -19,6 +19,7 @@ noodles-bgzf = { version = "0.25.0", features = ["libdeflate"] }
log = "0.4"
ctrlc = { version = "3.4.1", features = ["termination"] }
once_cell = { version = "1.18.0", features = ["parking_lot"] }
natord = "1.0.9"


[features]
Expand Down
32 changes: 26 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,32 @@ let result = iterate_bam_file(bam_path, Some(4), Some(60), None, None);

The results in this case are returned as a CnvResult, which has the following structure:

```rust
/// Results struct for python
```
#[pyclass]
#[derive(Debug)]
pub struct CnvResult {
pub cnv: FnvHashMap<String, Vec<f64>>,
/// The CNV per contig
#[pyo3(get)]
pub cnv: PyObject,
/// Bin width
#[pyo3(get)]
pub bin_width: usize,
/// Genome length
#[pyo3(get)]
pub genome_length: usize,
/// Variance of the whole genome
#[pyo3(get)]
pub variance: f64,
}
```

Where `result.cnv` is a hash map containing the Copy Number for each bin of `bin_width` bases for each contig in the reference genome, `result.bin_width` is the width of the bins in bases, and `result.genome_length` is the total length of the genome.
Where `result.cnv` is a Python dict `PyObject` containing the Copy Number for each bin of `bin_width` bases for each contig in the reference genome, `result.bin_width` is the width of the bins in bases, `result.genome_length` is the total length of the genome and `result.variance` is a measure of the variance across the whole genome.

Variance is calculated as the average of the squared differences from the Mean.

> [!NOTE]
> **Note**: Only the main primary mapping alignment start is binned, Supplementary and Secondary alignments are ignored.
> **Note**: Only the main primary mapping alignment start is binned, Supplementary and Secondary alignments are ignored. Supplementary alignments can be included by setting `exclude_supplementary`
**Directory analysis**
To analyse a directory of BAM files, use the `iterate_bam_dir` function:
Expand Down Expand Up @@ -81,6 +95,7 @@ Example simple plot in python
from matplotlib import pyplot as plt
import matplotlib as mpl
from pathlib import Path
from cnv_from_bam import iterate_bam_file
import numpy as np
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 3))
total = 0
Expand Down Expand Up @@ -133,7 +148,7 @@ iterate_bam_file(bam_path, _threads=4, mapq_filter=60, log_level=int(logging.WAR
| NOTSET | 0 |

> [!NOTE]
> In v0.3 a regression was introduced, whereby keeping the GIL for logging meant that BAM reading was suddenly single threaded again. Whilst it was possible to fix this and keep `PyO3-log`, I decided to go for truly maximum speed instead. The only drawback to the removal of `PyO3-log` is that log messages will not be handled by python loggers, so they won't be written out by a file handler, for example.
> In v0.3 a regression was introduced, whereby keeping the GIL for logging meant that BAM reading was suddenly single threaded again. Whilst it was possible to fix this and keep `PyO3-log`, I decided to go for truly maximum speed instead. The only drawback to the removal of `PyO3-log` in (v0.4+) is that log messages will not be handled by python loggers, so they won't be written out by a file handler, for example.

## Documentation
Expand All @@ -158,8 +173,13 @@ pre-commit install -t pre-commit -t post-checkout -t post-merge
pre-commit run --all-files
```
## Changelog
### v0.4.2
* Returns the contig names naturally sorted, rather than in random order!! For example `chr1, chr2, chr3...chr22,chrM,chrX,chrY`!
Huge, will prevent some people getting repeatedly confused about expected CNV vs. Visualised and wasting an hour debugging a non existing issue.
* Returns variance across the whole genome in the CNV result struct.

### v0.4.1
* Add `exclude_supplementary` parameter to `iterate_bam_file`, to exclude supplementary alignments
* Add `exclude_supplementary` parameter to `iterate_bam_file`, to exclude supplementary alignments (default True)

### v0.4.0
* Remove `PyO3-log` for maximum speed. This means that log messages will not be handled by python loggers. Can set log level on call to `iterate_bam_file`
Expand Down
79 changes: 71 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
use log::LevelFilter;
use log::{error, info, warn};
use log::{Level, Metadata, Record};
use natord::compare;
use noodles::bam::bai;
use noodles::{
bam, csi,
Expand All @@ -64,6 +65,7 @@ use noodles_bgzf as bgzf;
use once_cell::sync::Lazy;
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::*;
use pyo3::types::PyDict;
use rayon::prelude::*;
use std::collections::HashSet;
use std::sync::atomic::{AtomicBool, Ordering};
Expand Down Expand Up @@ -132,6 +134,51 @@ impl BamFile {
}
}

/// Calculates the variance of a dataset.
///
/// The variance is computed as the average of the squared differences from the Mean.
/// Returns `None` if the dataset is empty.
///
/// # Arguments
///
/// * `data` - A slice of usize values representing the dataset.
///
/// # Returns
///
/// An `Option<f64>` representing the variance of the dataset.
///
/// # Examples
///
/// Basic usage:
///
/// ```rust
/// # use cnv_from_bam::calculate_variance;
/// let data = vec![1, 2, 3, 4, 5];
/// let variance = calculate_variance(&data).unwrap();
/// assert_eq!(variance, 2.0);
/// ```
///
/// When the dataset is empty:
///
/// ```
/// # use cnv_from_bam::calculate_variance;
/// let empty: Vec<usize> = vec![];
/// assert!(calculate_variance(&empty).is_none());
/// ```
pub fn calculate_variance<'a>(data: impl Iterator<Item = &'a f64>) -> Option<f64> {
let (count, sum, sum_sq) = data.fold((0, 0.0, 0.0), |(count, sum, sum_sq), &value| {
(count + 1, sum + value, sum_sq + value * value)
});

if count == 0 {
None
} else {
let mean = sum / count as f64;
let variance = (sum_sq / count as f64) - (mean * mean);
Some(variance)
}
}

/// Calculates the median of a slice of `u16` numbers.
///
/// The function first sorts the given slice in place and then computes the median.
Expand Down Expand Up @@ -258,13 +305,16 @@ pub fn calculate_cnv(
pub struct CnvResult {
/// The CNV per contig
#[pyo3(get)]
pub cnv: FnvHashMap<String, Vec<f64>>,
pub cnv: PyObject,
/// Bin width
#[pyo3(get)]
pub bin_width: usize,
/// Genome length
#[pyo3(get)]
pub genome_length: usize,
/// Variance of the whole genome
#[pyo3(get)]
pub variance: f64,
}

/// Iterates over a BAM file, filters reads based on the mapping quality (`mapq_filter`),
Expand Down Expand Up @@ -577,14 +627,27 @@ fn iterate_bam_file(
// The path is neither a directory nor a .bam file
error!("The path is neither a directory nor a .bam file.");
}

let (cnv_profile, bin_width) = calculate_cnv(*genome_length, *valid_number_reads, frequencies);
let result = CnvResult {
cnv: cnv_profile,
bin_width,
genome_length: *genome_length,
};
Ok(result)
let variance = cnv_profile.values().flatten();
let variance = calculate_variance(variance).unwrap_or(0.0);
let result = Python::with_gil(|py| {
let mut sorted_keys: Vec<_> = cnv_profile.keys().collect();
sorted_keys.sort_by(|a, b| compare(a, b));
let py_dict = PyDict::new(py);
for key in sorted_keys {
let value = cnv_profile.get(key).unwrap();
py_dict.set_item(key, value)?;
}

let result = CnvResult {
cnv: py_dict.into(),
bin_width,
genome_length: *genome_length,
variance,
};
Ok(result)
});
result
}

/// Filters a BAM record based on mapping quality and flags.
Expand Down

0 comments on commit 4419b43

Please sign in to comment.