From 1bace35a58832f58ea89063f0b1389caae9b8806 Mon Sep 17 00:00:00 2001
From: Matt Hammerly
Date: Wed, 24 Apr 2024 15:31:31 -0700
Subject: [PATCH] add entrypoint for mmapped pyreport parser

---
 Cargo.toml                   |   2 +
 src/parsers/pyreport_shim.rs |  65 ++++++++++++++++
 tests/common/mod.rs          |   2 +-
 tests/test_pyreport_shim.rs  | 139 ++++++++++++++++++++++++++++++-----
 4 files changed, 189 insertions(+), 19 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9162d22..3c8f405 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,8 @@ uuid = { version = "1.8.0", features = ["v4"] }
 # which should use a different hash function.
 seahash = "4.1.0"
 
+memmap2 = "0.9.4"
+
 include_dir = "0.7.3"
 lazy_static = "1.4.0"
 strum = "0.26.1"
diff --git a/src/parsers/pyreport_shim.rs b/src/parsers/pyreport_shim.rs
index 078401b..28b7bef 100644
--- a/src/parsers/pyreport_shim.rs
+++ b/src/parsers/pyreport_shim.rs
@@ -1,3 +1,68 @@
+use std::{fs::File, path::PathBuf};
+
+use memmap2::Mmap;
+
+use crate::{
+    parsers::ReportBuilderCtx,
+    report::{ReportBuilder, SqliteReport, SqliteReportBuilder},
+};
+
 pub mod report_json;
 
 pub mod chunks;
+
+/// Parses the two parts of our Python report class and reshapes the data into a
+/// `SqliteReport`.
+///
+/// Reports in our Python codebase are serialized in two parts:
+/// - Report JSON, which describes the files and sessions in the report
+/// - Chunks file, which describes line-by-line coverage data for each file
+///
+/// The parser for the report JSON inserts a
+/// [`crate::report::models::SourceFile`] for each file
+/// and a [`crate::report::models::Context`] for each session. It returns two
+/// hashmaps: one which maps each file's "chunk index" to the database PK for
+/// the `SourceFile` that was inserted for it, and one which maps each session's
+/// "session_id" to the database PK for the `Context` that was inserted for it.
+///
+/// The parser for the chunks file inserts a
+/// [`crate::report::models::CoverageSample`] (and possibly other records) for
+/// each coverage measurement contained in the chunks file. It uses the
+/// results of the report JSON parser to figure out the appropriate FKs to
+/// associate a measurement with its `SourceFile` and `Context`(s).
+///
+/// TODO: Make this unit testable (currently relying on integration tests)
+pub fn parse_pyreport(
+    report_json_file: &File,
+    chunks_file: &File,
+    out_path: PathBuf,
+) -> Result<SqliteReport> {
+    let report_builder = SqliteReportBuilder::new(out_path);
+
+    // Memory-map the input file so we don't have to read the whole thing into RAM
+    let mmap_handle = unsafe { Mmap::map(report_json_file)? };
+    let buf = unsafe { std::str::from_utf8_unchecked(&mmap_handle[..]) };
+    let mut stream = report_json::ReportOutputStream::<&str, SqliteReport, SqliteReportBuilder> {
+        input: buf,
+        state: ReportBuilderCtx::new(report_builder),
+    };
+    // TODO handle error
+    let (files, sessions) =
+        report_json::parse_report_json(&mut stream).expect("Failed to parse report JSON");
+
+    // Replace our mmap handle so the first one can be unmapped
+    let mmap_handle = unsafe { Mmap::map(chunks_file)? };
+    let buf = unsafe { std::str::from_utf8_unchecked(&mmap_handle[..]) };
+
+    // Move `report_builder` from the report JSON's parse context to this one
+    let chunks_ctx = chunks::ParseCtx::new(stream.state.report_builder, files, sessions);
+    let mut chunks_stream = chunks::ReportOutputStream::<&str, SqliteReport, SqliteReportBuilder> {
+        input: buf,
+        state: chunks_ctx,
+    };
+    // TODO handle error
+    chunks::parse_chunks_file(&mut chunks_stream).expect("Failed to parse chunks file");
+
+    // Build and return the `SqliteReport`
+    Ok(chunks_stream.state.db.report_builder.build())
+}
diff --git a/tests/common/mod.rs b/tests/common/mod.rs
index 8092b7d..7826337 100644
--- a/tests/common/mod.rs
+++ b/tests/common/mod.rs
@@ -8,5 +8,5 @@ pub fn sample_data_path() -> PathBuf {
 }
 
 pub fn read_sample_file(path: &Path) -> String {
-    return read_to_string(sample_data_path().join(path)).ok().unwrap();
+    read_to_string(sample_data_path().join(path)).ok().unwrap()
 }
diff --git a/tests/test_pyreport_shim.rs b/tests/test_pyreport_shim.rs
index 024a51a..321f5cc 100644
--- a/tests/test_pyreport_shim.rs
+++ b/tests/test_pyreport_shim.rs
@@ -1,7 +1,12 @@
-use std::{collections::HashMap, path::Path};
+use std::{
+    collections::HashMap,
+    fs::File,
+    path::{Path, PathBuf},
+};
 
 use codecov_rs::{
     parsers::{
+        pyreport_shim,
         pyreport_shim::{chunks, report_json},
         ReportBuilderCtx,
     },
@@ -18,19 +23,16 @@ type ChunksStream<'a> = chunks::ReportOutputStream<&'a str, SqliteReport, Sqlite
 
 struct Ctx {
     _temp_dir: TempDir,
-    parse_ctx: ReportBuilderCtx,
+    db_file: PathBuf,
 }
 
 fn setup() -> Ctx {
     let temp_dir = TempDir::new().ok().unwrap();
     let db_file = temp_dir.path().to_owned().join("db.sqlite");
-    let report_builder = SqliteReportBuilder::new(db_file);
-    let parse_ctx = ReportBuilderCtx::new(report_builder);
-
     Ctx {
         _temp_dir: temp_dir,
-        parse_ctx,
+        db_file,
     }
 }
 
@@ -42,10 +44,11 @@ fn hash_id(key: &str) -> i64 {
 fn test_parse_report_json() {
     let input = common::read_sample_file(Path::new("codecov-rs-reports-json-d2a9ba1.txt"));
 
-    let ctx = setup();
+    let test_ctx = setup();
+    let parse_ctx = ReportBuilderCtx::new(SqliteReportBuilder::new(test_ctx.db_file));
     let mut buf = ReportJsonStream {
         input: &input,
-        state: ctx.parse_ctx,
+        state: parse_ctx,
     };
 
     let expected_files = vec![
@@ -97,7 +100,8 @@
 #[test]
 fn test_parse_chunks_file() {
     let input = common::read_sample_file(Path::new("codecov-rs-chunks-d2a9ba1.txt"));
-    let mut ctx = setup();
+    let test_ctx = setup();
+    let mut report_builder = SqliteReportBuilder::new(test_ctx.db_file);
 
     // Pretend `parse_report_json` has already run
     let mut report_json_files = HashMap::new();
@@ -109,26 +113,20 @@
     .iter()
     .enumerate()
     {
-        let file = ctx
-            .parse_ctx
-            .report_builder
-            .insert_file(file.to_string())
-            .unwrap();
+        let file = report_builder.insert_file(file.to_string()).unwrap();
         report_json_files.insert(i, file.id);
     }
 
     // Pretend `parse_report_json` has already run
     let mut report_json_sessions = HashMap::new();
-    let session = ctx
-        .parse_ctx
-        .report_builder
+    let session = report_builder
         .insert_context(models::ContextType::Upload, "codecov-rs CI")
         .unwrap();
     report_json_sessions.insert(0, session.id);
 
     // Set up to call the chunks parser
     let chunks_parse_ctx = chunks::ParseCtx::new(
-        ctx.parse_ctx.report_builder,
+        report_builder,
        report_json_files.clone(),
         report_json_sessions.clone(),
     );
@@ -213,3 +211,108 @@
         );
     }
 }
+
+#[test]
+fn test_parse_pyreport() {
+    let report_json_file =
+        File::open(common::sample_data_path().join("codecov-rs-reports-json-d2a9ba1.txt"))
+            .expect("Failed to open report json file");
+    let chunks_file = File::open(common::sample_data_path().join("codecov-rs-chunks-d2a9ba1.txt"))
+        .expect("Failed to open chunks file");
+    let test_ctx = setup();
+
+    let report = pyreport_shim::parse_pyreport(&report_json_file, &chunks_file, test_ctx.db_file)
+        .expect("Failed to parse pyreport");
+
+    let expected_files = vec![
+        models::SourceFile {
+            id: hash_id("src/report.rs"),
+            path: "src/report.rs".to_string(),
+        },
+        models::SourceFile {
+            id: hash_id("src/report/models.rs"),
+            path: "src/report/models.rs".to_string(),
+        },
+        models::SourceFile {
+            id: hash_id("src/report/schema.rs"),
+            path: "src/report/schema.rs".to_string(),
+        },
+    ];
+
+    let expected_sessions = vec![models::Context {
+        id: hash_id("codecov-rs CI"),
+        context_type: models::ContextType::Upload,
+        name: "codecov-rs CI".to_string(),
+    }];
+
+    // Helper function for creating our expected values
+    fn make_sample(source_file_id: i64, line_no: i64, hits: i64) -> models::CoverageSample {
+        models::CoverageSample {
+            id: uuid::Uuid::nil(), // Ignored
+            source_file_id,
+            line_no,
+            coverage_type: models::CoverageType::Line,
+            hits: Some(hits),
+            hit_branches: None,
+            total_branches: None,
+        }
+    }
+    // (start_line, end_line, hits)
+    let covered_lines: [Vec<(i64, i64, i64)>; 3] = [
+        vec![
+            (17, 25, 3),
+            (39, 43, 2),
+            (45, 49, 1),
+            (51, 53, 1),
+            (55, 59, 1),
+            (61, 78, 1),
+        ],
+        vec![
+            (5, 5, 0),
+            (12, 12, 0),
+            (22, 22, 0),
+            (33, 33, 0),
+            (45, 45, 1),
+        ],
+        vec![
+            (3, 3, 0),
+            (10, 16, 0),
+            (18, 27, 0),
+            (29, 39, 0),
+            (41, 48, 0),
+            (50, 50, 0),
+            (51, 52, 5),
+            (53, 54, 6),
+            (55, 56, 5),
+        ],
+    ];
+    let mut expected_coverage_samples = Vec::new();
+    for (i, file) in covered_lines.iter().enumerate() {
+        for (start_line, end_line, hits) in file {
+            for line_no in *start_line..=*end_line {
+                expected_coverage_samples.push(make_sample(expected_files[i].id, line_no, *hits));
+            }
+        }
+    }
+
+    let actual_coverage_samples = report
+        .list_coverage_samples()
+        .expect("Failed to list coverage samples");
+    let actual_contexts = report.list_contexts().expect("Failed to list contexts");
+    assert_eq!(actual_contexts, expected_sessions);
+    assert_eq!(
+        actual_coverage_samples.len(),
+        expected_coverage_samples.len()
+    );
+    for i in 0..actual_coverage_samples.len() {
+        expected_coverage_samples[i].id = actual_coverage_samples[i].id;
+        assert_eq!(actual_coverage_samples[i], expected_coverage_samples[i]);
+
+        assert_eq!(
+            report
+                .list_contexts_for_sample(&actual_coverage_samples[i])
+                .unwrap(),
+            actual_contexts
+        );
+    }
+}
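
A minimal usage sketch for the new `parse_pyreport` entrypoint, mirroring the `test_parse_pyreport` integration test above. The function, its argument order, and its `SqliteReport` return value come from this patch; the `main` wrapper, the input/output paths, and the `expect`-based error handling below are illustrative assumptions only.

use std::{fs::File, path::PathBuf};

use codecov_rs::parsers::pyreport_shim;

fn main() {
    // Hypothetical input paths; substitute a real report JSON file and chunks file.
    let report_json_file =
        File::open("report_json.txt").expect("Failed to open report json file");
    let chunks_file = File::open("chunks.txt").expect("Failed to open chunks file");

    // Destination for the SQLite database the parser builds.
    let out_path = PathBuf::from("report.sqlite");

    // Both inputs are memory-mapped internally; the report JSON parser runs first,
    // and the chunks parser then reuses its file/session IDs and the same builder.
    let _report = pyreport_shim::parse_pyreport(&report_json_file, &chunks_file, out_path)
        .expect("Failed to parse pyreport");
}

Because `parse_pyreport` moves a single `SqliteReportBuilder` from the report JSON parse context into the chunks parse context, both stages write into one output database instead of each constructing its own builder.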