Start parsing the chunks file with serde

This implements a hand-written parser which scans through the `chunks` file line by line and parses the various headers and line records with serde. The most complex part is parsing the line records. If that complexity becomes unreasonable, a hybrid approach is also possible: use the hand-written parser together with the simpler serde-based `header` parsers, while still falling back to the existing parser-combinator-based parser for the line records.
codecov · Sep 3, 2024 · e0f113e · e0f113e
1 parent 9d330dd
commit e0f113e
Show file tree

Hide file tree

Showing 3 changed files with 427 additions and 4 deletions.
diff --git a/core/benches/pyreport.rs b/core/benches/pyreport.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 
 use codecov_rs::{
-    parsers::pyreport::{chunks, report_json},
+    parsers::pyreport::{chunks, chunks_serde, report_json},
     report::test::{TestReport, TestReportBuilder},
 };
 use divan::Bencher;
@@ -49,7 +49,7 @@ fn simple_chunks() {
     let chunks = &[
         // Header and one chunk with an empty line
         "{}\n<<<<< end_of_header >>>>>\n{}\n",
-        // No header, one chunk with a populated line and an  empty line
+        // No header, one chunk with a populated line and an empty line
         "{}\n[1, null, [[0, 1]]]\n",
         // No header, two chunks, the second having just one empty line
         "{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n",
@@ -96,6 +96,54 @@ fn parse_chunks_file(input: &str, files: HashMap<usize, i64>, sessions: HashMap<
         .unwrap();
 }
 
+/// Benchmarks the serde-based parser against a few tiny, hand-written `chunks`
+/// inputs, feeding each one to `parse_chunks_file_serde` in turn.
+#[divan::bench]
+fn simple_chunks_serde() {
+    // Header and one chunk with an empty line
+    let header_single_chunk: &[u8] = b"{}\n<<<<< end_of_header >>>>>\n{}\n";
+    // No header, one chunk with a populated line and an empty line
+    let single_chunk: &[u8] = b"{}\n[1, null, [[0, 1]]]\n";
+    // No header, two chunks, the second having just one empty line
+    let two_chunks: &[u8] = b"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n";
+    // Header, two chunks, the second having multiple data lines and an empty line
+    let header_two_chunks: &[u8] = b"{}\n<<<<< end_of_header >>>>>\n{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n[1, null, [[0, 1]]]\n[1, null, [[0, 1]]]\n";
+
+    // Inputs are parsed in the same order as they are declared above.
+    [
+        header_single_chunk,
+        single_chunk,
+        two_chunks,
+        header_two_chunks,
+    ]
+    .iter()
+    .copied()
+    .for_each(parse_chunks_file_serde);
+}
+
+// At the time of writing, this ran in <300 ms on the author's machine.
+#[divan::bench(sample_count = 10)]
+fn complex_chunks_serde(bencher: Bencher) {
+    // The fixture is a ~96M `chunks` file. It is loaded once, outside of the
+    // benchmarked closure, so only the parsing itself is measured.
+    let fixture =
+        load_fixture("pyreport/large/worker-c71ddfd4cb1753c7a540e5248c2beaa079fc3341-chunks.txt");
+    let input: &[u8] = &fixture;
+
+    bencher.bench(|| parse_chunks_file_serde(input));
+}
+
+/// Drives a `chunks_serde::Parser` over `input` until it reports the end of
+/// the input, discarding every event it yields.
+///
+/// # Panics
+///
+/// Panics with the parser's error if any step fails. Before panicking, at most
+/// the first 32 bytes of the input that remained before the failing step, and
+/// the parser's `expecting` state, are printed via `dbg!`.
+fn parse_chunks_file_serde(input: &[u8]) {
+    let mut parser = chunks_serde::Parser::new(input);
+    loop {
+        // TODO: these are just for debugging
+        // Snapshot the parser state *before* advancing, so that a failure can
+        // be reported relative to where the failing step started.
+        let rest = parser.rest;
+        let expecting = parser.expecting;
+        match parser.next() {
+            Ok(None) => break,
+            Ok(Some(_)) => {}
+            Err(err) => {
+                // Truncate on the byte level first and decode lossily afterwards:
+                // slicing a `str` at byte 32 yields `None` when that offset is not
+                // a char boundary (which would dump the whole remaining input),
+                // and strict UTF-8 decoding would panic on invalid input, hiding
+                // the actual parser error.
+                let head = &rest[..rest.len().min(32)];
+                let rest = String::from_utf8_lossy(head);
+                dbg!(rest, expecting);
+                panic!("{err}");
+            }
+        }
+    }
+}
+
 #[track_caller]
 fn load_fixture(path: &str) -> Vec<u8> {
     let path = format!("./fixtures/{path}");