Skip to content

Commit

Permalink
Remove PyO3 log, impment rust side log level filtering (#10)
Browse files Browse the repository at this point in the history
  • Loading branch information
Adoni5 authored Dec 1, 2023
1 parent 4fb0e0d commit 558346d
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 38 deletions.
7 changes: 4 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "cnv_from_bam"
version = "0.3.0"
version = "0.4.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand All @@ -11,14 +11,15 @@ crate-type = ["cdylib", "lib"]
[dependencies]
pyo3 = "0.20.0"
noodles ={ version = "0.59.0", features = ["bam", "sam", "csi"] }
num_cpus = "1.13"

fnv = "1.0.7"
indicatif = {version = "*", features = ["rayon"]}
rayon = {version = "1.8.0" }
noodles-bgzf = { version = "0.25.0", features = ["libdeflate"] }
log = "0.4"
ctrlc = { version = "3.4.1", features = ["termination"] }
once_cell = { version = "1.18.0", features = ["parking_lot"] }

pyo3-log = "0.9.0"

[features]
extension-module = ["pyo3/extension-module"]
Expand Down
57 changes: 36 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,35 +109,33 @@ os.environ["CI"] = "1"
```

### Logging
We use PyO3-log for rust/python interop logging. By default, the log level is set to `INFO`.
We use the `log` crate for logging. By default, the log level is set to `INFO`, which means that the program will output the progress of the iteration of each BAM file. To disable all but warning and error logging, set the log level to `WARN` on the `iterate_bam_file` function:

> [!WARNING]
> It is required to set up a logger before a call to `iterate_bam_file` is made. If no logger is set up, the program will not output anything. To set up a logger, run the following code before calling `iterate_bam_file`:

```python
import logging
import sys
FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger("cnv_from_bam")
logger.handlers.clear()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(stream=sys.stdout))
```

It is possible to hide the log messages by setting the log level to `WARN`:

```python
import logging
import sys
FORMAT = '%(levelname)s %(name)s %(asctime)-15s %(filename)s:%(lineno)d %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger("cnv_from_bam")
logger.handlers.clear()
logger.setLevel(logging.WARN)
logger.addHandler(logging.StreamHandler(stream=sys.stdout))
from cnv_from_bam import iterate_bam_file
iterate_bam_file(bam_path, _threads=4, mapq_filter=60, log_level=logging.getLevelName(logging.WARN))

```

`getLevelName` is a function from the `logging` module that converts the log level to the integer value of the level. These values are

| Level | Value |
|-------|-------|
| CRITICAL | 50 |
| ERROR | 40 |
| WARNING | 30 |
| INFO | 20 |
| DEBUG | 10 |
| NOTSET | 0 |

> [!NOTE]
> In v0.3 a regression was introduced, whereby keeping the GIL for logging meant that BAM reading was suddenly single threaded again. Whilst it was possible to fix this and keep `PyO3-log`, I decided to go for truly maximum speed instead. The only drawback to the removal of `PyO3-log` is that log messages will not be handled by python loggers, so they won't be written out by a file handler, for example.

## Documentation

To generate the documentation, run:
Expand All @@ -159,7 +157,24 @@ pip install -e .[dev]
pre-commit install -t pre-commit -t post-checkout -t post-merge
pre-commit run --all-files
```
## Changelog

### v0.4.0
* Remove `PyO3-log` for maximum speed. This means that log messages will not be handled by python loggers. Can set log level on call to `iterate_bam_file`

### [v0.3.0](https://github.com/Adoni5/cnv_from_bam/releases/tag/v0.0.3)
* Introduce `PyO3-log` for logging. This means that log messages can be handled by python loggers, so they can be written out by a file handler, for example.
* **HAS A LARGE PERFORMANCE ISSUE**
* Can disable progress bar display by setting `CI` environment variable to `1` in python script.

### [v0.2.0](https://github.com/Adoni5/cnv_from_bam/releases/tag/v0.0.2)
* Purely rust based BAM parsing, using noodles.
* Uses a much more sensible number for threading if not provided.
* Allows iteration of BAMS in a directory

### [v0.1.0](https://github.com/Adoni5/cnv_from_bam/releases/tag/v0.0.1)
* Initial release
* Uses `rust-bio/rust-htslib` for BAM parsing. Has to bind C code, is a faff.


## License
Expand Down
83 changes: 69 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,23 @@
use fnv::FnvHashMap;
use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};

use noodles::bam::bai;
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::*;
use rayon::prelude::*;
use std::collections::HashSet;
use std::{env, error::Error, fs, fs::File, io, num::NonZeroUsize, path::PathBuf, thread};

use log::LevelFilter;
use log::{error, info, warn};
use log::{Level, Metadata, Record};
use noodles::bam::bai;
use noodles::{
bam, csi,
sam::{self, record::MappingQuality},
};
use noodles_bgzf as bgzf;
use once_cell::sync::Lazy;
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::*;
use rayon::prelude::*;
use std::collections::HashSet;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::{env, error::Error, fs, fs::File, io, num::NonZeroUsize, path::PathBuf, thread};

/// Number of reads expected in given bin width on the genome
const READS_PER_BIN: usize = 100;
Expand All @@ -73,6 +77,10 @@ const BIN_SIZE: usize = 1000;
/// Excpected ploidy
const PLOIDY: u16 = 2;

/// Global static variable to indicate if the handler is set
static EXIT: Lazy<Arc<AtomicBool>> = Lazy::new(|| Arc::new(AtomicBool::new(false)));
/// Rust based logger
static LOGGER: SimpleLogger = SimpleLogger;
/// Dynamic result type for fancy unwrapping
pub type DynResult<T> = Result<T, Box<dyn Error + 'static>>;
/// Represents a BAM file with its path.
Expand All @@ -81,6 +89,28 @@ pub struct BamFile {
path: PathBuf,
}

/// Setup the log for the library
struct SimpleLogger;

impl log::Log for SimpleLogger {
fn enabled(&self, metadata: &Metadata) -> bool {
metadata.level() <= Level::Info
}

fn log(&self, record: &Record) {
if self.enabled(record.metadata()) {
println!(
"{} ({}) - {} - {}",
record.level(),
record.target(),
record.line().unwrap_or(0),
record.args()
);
}
}

fn flush(&self) {}
}
impl BamFile {
/// Creates a new `BamFile` instance.
///
Expand Down Expand Up @@ -313,6 +343,7 @@ fn _iterate_bam_file(
)));
}
}
// Get total number of reads, mapped and unmapped
// horrendous repetitive, dirty match to try a .csi index, then a .bai index, then no index
let bar = match File::open(format!("{}.csi", bam_file.path.to_str().unwrap()))
.map(csi::Reader::new)
Expand Down Expand Up @@ -399,16 +430,13 @@ fn _iterate_bam_file(
bar.set_draw_target(ProgressDrawTarget::hidden())
}

// Get total number of reads, mapped and unmapped

// number of reads in the bam file that match our criteria
let mut _valid_number_reads: usize = 0;
// Setup progress bar

// Allocate record
let mut record = bam::lazy::Record::default();

while bam_reader.read_lazy_record(&mut record)? != 0 {
while (bam_reader.read_lazy_record(&mut record)? != 0) & !EXIT.load(Ordering::SeqCst) {
if filter(&record, mapq_filter) {
let reference_sequence_name =
reference_sequence_name(&record, &header)?.expect("missing reference sequence ID");
Expand All @@ -423,6 +451,11 @@ fn _iterate_bam_file(
}
bar.inc(1)
}
if EXIT.load(Ordering::SeqCst) {
return Err(PyErr::new::<pyo3::exceptions::PySystemExit, _>(
"SIGINT or SIGTERM intercepted, Terminating function",
));
}
bar.finish();
info!(
"Number of reads matching criteria mapq >{} and aligned: {} for bam file: {:?}",
Expand Down Expand Up @@ -472,12 +505,29 @@ fn iterate_bam_file(
bam_file_path: PathBuf,
_threads: Option<u8>,
mapq_filter: Option<u8>,
log_level: Option<u8>,
) -> PyResult<CnvResult> {
let mapq_filter = mapq_filter.unwrap_or(60);
let mut frequencies: FnvHashMap<String, Vec<u16>> = FnvHashMap::default();
let genome_length = &mut 0_usize;
let valid_number_reads = &mut 0_usize;
let contigs: &mut HashSet<String> = &mut HashSet::new();
let level_filter = match log_level {
Some(level) => match level {
0 => LevelFilter::Off,
40 => LevelFilter::Error,
30 => LevelFilter::Warn,
20 => LevelFilter::Info,
10 => LevelFilter::Debug,
5 => LevelFilter::Trace,
_ => LevelFilter::Info,
},
None => LevelFilter::Info,
};
if let Ok(()) = log::set_logger(&LOGGER) {
log::set_max_level(level_filter)
}

if bam_file_path.is_dir() {
info!("Processing directory: {:?}", &bam_file_path);
// Iterate over all files in the directory
Expand Down Expand Up @@ -568,7 +618,12 @@ fn reference_sequence_name<'a>(
/// A Python module implemented in Rust.
#[pymodule]
fn cnv_from_bam(_py: Python, m: &PyModule) -> PyResult<()> {
pyo3_log::init();
let r = EXIT.clone();
ctrlc::set_handler(move || {
r.store(true, Ordering::SeqCst);
println!("{}[2J", 27 as char);
})
.expect("Error setting Ctrl-C handler");
m.add_function(wrap_pyfunction!(iterate_bam_file, m)?)?;
Ok(())
}
Expand All @@ -581,8 +636,8 @@ mod tests {
fn test_iterate_bam_file() {
// Call the function with the temporary BAM file
let bam_file_path = PathBuf::from("test/test.bam");
let result =
iterate_bam_file(bam_file_path, Some(4), Some(60)).expect("Error processing BAM file");
let result = iterate_bam_file(bam_file_path, Some(4), Some(60), None)
.expect("Error processing BAM file");
info!("{:#?}", result);
// Assert that the function returns a CnvBins instance
// assert_eq!(result.bins, vec![0, 0]);
Expand Down

0 comments on commit 558346d

Please sign in to comment.