From 1bace35a58832f58ea89063f0b1389caae9b8806 Mon Sep 17 00:00:00 2001
From: Matt Hammerly
Date: Wed, 24 Apr 2024 15:31:31 -0700
Subject: [PATCH] add entrypoint for mmapped pyreport parser

---
 Cargo.toml                   |   2 +
 src/parsers/pyreport_shim.rs |  65 ++++++++++++++++
 tests/common/mod.rs          |   2 +-
 tests/test_pyreport_shim.rs  | 139 ++++++++++++++++++++++++++++++-----
 4 files changed, 189 insertions(+), 19 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9162d22..3c8f405 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,8 @@ uuid = { version = "1.8.0", features = ["v4"] }
 # which should use a different hash function.
 seahash = "4.1.0"
 
+memmap2 = "0.9.4"
+
 include_dir = "0.7.3"
 lazy_static = "1.4.0"
 strum = "0.26.1"
diff --git a/src/parsers/pyreport_shim.rs b/src/parsers/pyreport_shim.rs
index 078401b..28b7bef 100644
--- a/src/parsers/pyreport_shim.rs
+++ b/src/parsers/pyreport_shim.rs
@@ -1,3 +1,68 @@
+use std::{fs::File, path::PathBuf};
+
+use memmap2::Mmap;
+
+use crate::{
+    parsers::ReportBuilderCtx,
+    report::{ReportBuilder, SqliteReport, SqliteReportBuilder},
+};
+
 pub mod report_json;
 
 pub mod chunks;
+
+/// Parses the two parts of our Python report class and reshapes the data into a
+/// `SqliteReport`.
+///
+/// Reports in our Python codebase are serialized in two parts:
+/// - Report JSON, which describes the files and sessions in the report
+/// - Chunks file, which describes line-by-line coverage data for each file
+///
+/// The parser for the report JSON inserts a
+/// [`crate::report::models::SourceFile`] for each file
+/// and a [`crate::report::models::Context`] for each session. It returns two
+/// hashmaps: one which maps each file's "chunk index" to the database PK for
+/// the `SourceFile` that was inserted for it, and one which maps each session's
+/// "session_id" to the database PK for the `Context` that was inserted for it.
+///
+/// The parser for the chunks file inserts a
+/// [`crate::report::models::CoverageSample`] (and possibly other records) for
+/// each coverage measurement contained in the chunks file. It uses the
+/// results of the report JSON parser to figure out the appropriate FKs to
+/// associate a measurement with its `SourceFile` and `Context`(s).
+///
+/// TODO: Make this unit testable (currently relying on integration tests)
+pub fn parse_pyreport(
+    report_json_file: &File,
+    chunks_file: &File,
+    out_path: PathBuf,
+) -> Result<SqliteReport> {
+    let report_builder = SqliteReportBuilder::new(out_path);
+
+    // Memory-map the input file so we don't have to read the whole thing into RAM
+    let mmap_handle = unsafe { Mmap::map(report_json_file)? };
+    let buf = unsafe { std::str::from_utf8_unchecked(&mmap_handle[..]) };
+    let mut stream = report_json::ReportOutputStream::<&str, SqliteReport, SqliteReportBuilder> {
+        input: buf,
+        state: ReportBuilderCtx::new(report_builder),
+    };
+    // TODO handle error
+    let (files, sessions) =
+        report_json::parse_report_json(&mut stream).expect("Failed to parse report JSON");
+
+    // Replace our mmap handle so the first one can be unmapped
+    let mmap_handle = unsafe { Mmap::map(chunks_file)? };
+    let buf = unsafe { std::str::from_utf8_unchecked(&mmap_handle[..]) };
+
+    // Move `report_builder` from the report JSON's parse context to this one
+    let chunks_ctx = chunks::ParseCtx::new(stream.state.report_builder, files, sessions);
+    let mut chunks_stream = chunks::ReportOutputStream::<&str, SqliteReport, SqliteReportBuilder> {
+        input: buf,
+        state: chunks_ctx,
+    };
+    // TODO handle error
+    chunks::parse_chunks_file(&mut chunks_stream).expect("Failed to parse chunks file");
+
+    // Build and return the `SqliteReport`
+    Ok(chunks_stream.state.db.report_builder.build())
+}
diff --git a/tests/common/mod.rs b/tests/common/mod.rs
index 8092b7d..7826337 100644
--- a/tests/common/mod.rs
+++ b/tests/common/mod.rs
@@ -8,5 +8,5 @@ pub fn sample_data_path() -> PathBuf {
 }
 
 pub fn read_sample_file(path: &Path) -> String {
-    return read_to_string(sample_data_path().join(path)).ok().unwrap();
+    read_to_string(sample_data_path().join(path)).ok().unwrap()
 }
diff --git a/tests/test_pyreport_shim.rs b/tests/test_pyreport_shim.rs
index 024a51a..321f5cc 100644
--- a/tests/test_pyreport_shim.rs
+++ b/tests/test_pyreport_shim.rs
@@ -1,7 +1,12 @@
-use std::{collections::HashMap, path::Path};
+use std::{
+    collections::HashMap,
+    fs::File,
+    path::{Path, PathBuf},
+};
 
 use codecov_rs::{
     parsers::{
+        pyreport_shim,
         pyreport_shim::{chunks, report_json},
         ReportBuilderCtx,
     },
@@ -18,19 +23,16 @@ type ChunksStream<'a> = chunks::ReportOutputStream<&'a str, SqliteReport, Sqlite
 
 struct Ctx {
     _temp_dir: TempDir,
-    parse_ctx: ReportBuilderCtx,
+    db_file: PathBuf,
 }
 
 fn setup() -> Ctx {
     let temp_dir = TempDir::new().ok().unwrap();
     let db_file = temp_dir.path().to_owned().join("db.sqlite");
-    let report_builder = SqliteReportBuilder::new(db_file);
-    let parse_ctx = ReportBuilderCtx::new(report_builder);
-
     Ctx {
         _temp_dir: temp_dir,
-        parse_ctx,
+        db_file,
     }
 }
 
@@ -42,10 +44,11 @@ fn hash_id(key: &str) -> i64 {
 fn test_parse_report_json() {
     let input = common::read_sample_file(Path::new("codecov-rs-reports-json-d2a9ba1.txt"));
 
-    let ctx = setup();
+    let test_ctx = setup();
+    let parse_ctx = ReportBuilderCtx::new(SqliteReportBuilder::new(test_ctx.db_file));
     let mut buf = ReportJsonStream {
         input: &input,
-        state: ctx.parse_ctx,
+        state: parse_ctx,
     };
 
     let expected_files = vec![
@@ -97,7 +100,8 @@
 #[test]
 fn test_parse_chunks_file() {
     let input = common::read_sample_file(Path::new("codecov-rs-chunks-d2a9ba1.txt"));
-    let mut ctx = setup();
+    let test_ctx = setup();
+    let mut report_builder = SqliteReportBuilder::new(test_ctx.db_file);
 
     // Pretend `parse_report_json` has already run
     let mut report_json_files = HashMap::new();
@@ -109,26 +113,20 @@
     .iter()
     .enumerate()
     {
-        let file = ctx
-            .parse_ctx
-            .report_builder
-            .insert_file(file.to_string())
-            .unwrap();
+        let file = report_builder.insert_file(file.to_string()).unwrap();
         report_json_files.insert(i, file.id);
     }
 
     // Pretend `parse_report_json` has already run
     let mut report_json_sessions = HashMap::new();
-    let session = ctx
-        .parse_ctx
-        .report_builder
+    let session = report_builder
         .insert_context(models::ContextType::Upload, "codecov-rs CI")
         .unwrap();
     report_json_sessions.insert(0, session.id);
 
     // Set up to call the chunks parser
     let chunks_parse_ctx = chunks::ParseCtx::new(
-        ctx.parse_ctx.report_builder,
+        report_builder,
        report_json_files.clone(),
         report_json_sessions.clone(),
     );
@@ -213,3 +211,108 @@
         );
     }
 }
+
+#[test]
+fn test_parse_pyreport() {
+    let report_json_file =
+        File::open(common::sample_data_path().join("codecov-rs-reports-json-d2a9ba1.txt"))
+            .expect("Failed to open report json file");
+    let chunks_file = File::open(common::sample_data_path().join("codecov-rs-chunks-d2a9ba1.txt"))
+        .expect("Failed to open chunks file");
+    let test_ctx = setup();
+
+    let report = pyreport_shim::parse_pyreport(&report_json_file, &chunks_file, test_ctx.db_file)
+        .expect("Failed to parse pyreport");
+
+    let expected_files = vec![
+        models::SourceFile {
+            id: hash_id("src/report.rs"),
+            path: "src/report.rs".to_string(),
+        },
+        models::SourceFile {
+            id: hash_id("src/report/models.rs"),
+            path: "src/report/models.rs".to_string(),
+        },
+        models::SourceFile {
+            id: hash_id("src/report/schema.rs"),
+            path: "src/report/schema.rs".to_string(),
+        },
+    ];
+
+    let expected_sessions = vec![models::Context {
+        id: hash_id("codecov-rs CI"),
+        context_type: models::ContextType::Upload,
+        name: "codecov-rs CI".to_string(),
+    }];
+
+    // Helper function for creating our expected values
+    fn make_sample(source_file_id: i64, line_no: i64, hits: i64) -> models::CoverageSample {
+        models::CoverageSample {
+            id: uuid::Uuid::nil(), // Ignored
+            source_file_id,
+            line_no,
+            coverage_type: models::CoverageType::Line,
+            hits: Some(hits),
+            hit_branches: None,
+            total_branches: None,
+        }
+    }
+    // (start_line, end_line, hits)
+    let covered_lines: [Vec<(i64, i64, i64)>; 3] = [
+        vec![
+            (17, 25, 3),
+            (39, 43, 2),
+            (45, 49, 1),
+            (51, 53, 1),
+            (55, 59, 1),
+            (61, 78, 1),
+        ],
+        vec![
+            (5, 5, 0),
+            (12, 12, 0),
+            (22, 22, 0),
+            (33, 33, 0),
+            (45, 45, 1),
+        ],
+        vec![
+            (3, 3, 0),
+            (10, 16, 0),
+            (18, 27, 0),
+            (29, 39, 0),
+            (41, 48, 0),
+            (50, 50, 0),
+            (51, 52, 5),
+            (53, 54, 6),
+            (55, 56, 5),
+        ],
+    ];
+    let mut expected_coverage_samples = Vec::new();
+    for (i, file) in covered_lines.iter().enumerate() {
+        for (start_line, end_line, hits) in file {
+            for line_no in *start_line..=*end_line {
+                expected_coverage_samples.push(make_sample(expected_files[i].id, line_no, *hits));
+            }
+        }
+    }
+
+    let actual_coverage_samples = report
+        .list_coverage_samples()
+        .expect("Failed to list coverage samples");
+    let actual_contexts = report.list_contexts().expect("Failed to list contexts");
+    assert_eq!(actual_contexts, expected_sessions);
+    assert_eq!(
+        actual_coverage_samples.len(),
+        expected_coverage_samples.len()
+    );
+    for i in 0..actual_coverage_samples.len() {
+        expected_coverage_samples[i].id = actual_coverage_samples[i].id;
+        assert_eq!(actual_coverage_samples[i], expected_coverage_samples[i]);
+
+        assert_eq!(
+            report
+                .list_contexts_for_sample(&actual_coverage_samples[i])
+                .unwrap(),
+            actual_contexts
+        );
+    }
+}
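
A minimal usage sketch for the new `parse_pyreport` entrypoint, mirroring the `test_parse_pyreport` integration test above. The function, its argument order, and its `SqliteReport` return value come from this patch; the `main` wrapper, the input/output paths, and the `expect`-based error handling below are illustrative assumptions only.

use std::{fs::File, path::PathBuf};

use codecov_rs::parsers::pyreport_shim;

fn main() {
    // Hypothetical input paths; substitute a real report JSON file and chunks file.
    let report_json_file =
        File::open("report_json.txt").expect("Failed to open report json file");
    let chunks_file = File::open("chunks.txt").expect("Failed to open chunks file");

    // Destination for the SQLite database the parser builds.
    let out_path = PathBuf::from("report.sqlite");

    // Both inputs are memory-mapped internally; the report JSON parser runs first,
    // and the chunks parser then reuses its file/session IDs and the same builder.
    let _report = pyreport_shim::parse_pyreport(&report_json_file, &chunks_file, out_path)
        .expect("Failed to parse pyreport");
}

Because `parse_pyreport` moves a single `SqliteReportBuilder` from the report JSON parse context into the chunks parse context, both stages write into one output database instead of each constructing its own builder.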