Skip to content

Commit

Permalink
Start parsing the chunks file with serde
Browse files Browse the repository at this point in the history
This implements a hand-written parser which scans through the `chunks` file line-by-line, and parses the various headers and line records with serde.

The most complex part here is parsing the line records.
If that complexity becomes unreasonable, a hybrid approach is also possible: the hand-written parser would be used together with the simpler serde-based `header` parsers, while still falling back to the existing parser-combinator based parser for the line records.
  • Loading branch information
Swatinem committed Sep 3, 2024
1 parent 9d330dd commit e0f113e
Show file tree
Hide file tree
Showing 3 changed files with 427 additions and 4 deletions.
52 changes: 50 additions & 2 deletions core/benches/pyreport.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::collections::HashMap;

use codecov_rs::{
parsers::pyreport::{chunks, report_json},
parsers::pyreport::{chunks, chunks_serde, report_json},
report::test::{TestReport, TestReportBuilder},
};
use divan::Bencher;
Expand Down Expand Up @@ -49,7 +49,7 @@ fn simple_chunks() {
let chunks = &[
// Header and one chunk with an empty line
"{}\n<<<<< end_of_header >>>>>\n{}\n",
// No header, one chunk with a populated line and an empty line
// No header, one chunk with a populated line and an empty line
"{}\n[1, null, [[0, 1]]]\n",
// No header, two chunks, the second having just one empty line
"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n",
Expand Down Expand Up @@ -96,6 +96,54 @@ fn parse_chunks_file(input: &str, files: HashMap<usize, i64>, sessions: HashMap<
.unwrap();
}

/// Benchmarks the serde-based parser on a handful of tiny, hand-written
/// `chunks` inputs, feeding each one to `parse_chunks_file_serde` in turn.
#[divan::bench]
fn simple_chunks_serde() {
    const INPUTS: [&[u8]; 4] = [
        // Header and one chunk with an empty line
        b"{}\n<<<<< end_of_header >>>>>\n{}\n",
        // No header, one chunk with a populated line and an empty line
        b"{}\n[1, null, [[0, 1]]]\n",
        // No header, two chunks, the second having just one empty line
        b"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n",
        // Header, two chunks, the second having multiple data lines and an empty line
        b"{}\n<<<<< end_of_header >>>>>\n{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n[1, null, [[0, 1]]]\n[1, null, [[0, 1]]]\n",
    ];

    INPUTS
        .iter()
        .for_each(|input| parse_chunks_file_serde(input));
}

/// Benchmarks the serde-based parser on a large `chunks` fixture.
///
/// The fixture is loaded once, up front; only the call to
/// `parse_chunks_file_serde` sits inside the benchmarked closure.
// this is currently <300 ms on my machine
#[divan::bench(sample_count = 10)]
fn complex_chunks_serde(bencher: Bencher) {
    // this is a ~96M `chunks` file
    const FIXTURE: &str =
        "pyreport/large/worker-c71ddfd4cb1753c7a540e5248c2beaa079fc3341-chunks.txt";

    let fixture = load_fixture(FIXTURE);
    let input: &[u8] = &fixture;

    bencher.bench(|| parse_chunks_file_serde(input));
}

/// Drives a `chunks_serde::Parser` over `input`, calling `next()` until it
/// returns `Ok(None)`.
///
/// Every successfully parsed event is discarded.
///
/// # Panics
///
/// Panics with the parser's error if `next()` returns `Err`. Before panicking,
/// a preview of at most 32 bytes of the input that was still unparsed when the
/// failing call started, together with the parser's `expecting` state, is
/// printed through `dbg!`.
fn parse_chunks_file_serde(input: &[u8]) {
    let mut parser = chunks_serde::Parser::new(input);
    loop {
        // TODO: these are just for debugging
        // Snapshot the parser state *before* advancing, so a failure can be
        // reported relative to where the failing event began.
        let rest = parser.rest;
        let expecting = parser.expecting;
        match parser.next() {
            Ok(None) => break,
            Ok(Some(_)) => {}
            Err(err) => {
                // Cut the preview on the byte level and decode it lossily:
                // - strict UTF-8 decoding could itself panic here and hide `err`;
                // - slicing a `str` at byte 32 fails when that is not a char
                //   boundary, which previously dumped the whole remaining input.
                let preview_len = rest.len().min(32);
                let rest = String::from_utf8_lossy(&rest[..preview_len]);
                dbg!(rest, expecting);
                panic!("{err}");
            }
        }
    }
}

#[track_caller]
fn load_fixture(path: &str) -> Vec<u8> {
let path = format!("./fixtures/{path}");
Expand Down
Loading

0 comments on commit e0f113e

Please sign in to comment.