sqlite->pyreport: stop serializing session_totals, just use null #26

Merged · 1 commit · Aug 22, 2024
src/parsers/pyreport/report_json.rs · 13 changes: 8 additions & 5 deletions
@@ -32,7 +32,7 @@ pub type ReportOutputStream<S, R, B> = Stateful<S, ReportBuilderCtx<R, B>>;
 /// "filename.rs": [
 ///     chunks_index: int,
 ///     file_totals: ReportTotals,
-///     session_totals: SessionTotalsArray,
+///     session_totals: null, // (formerly SessionTotalsArray, but ignored now)
 ///     diff_totals: ReportTotals (probably),
 /// ]
 /// ```
@@ -41,9 +41,12 @@ pub type ReportOutputStream<S, R, B> = Stateful<S, ReportBuilderCtx<R, B>>;
 /// - [`ReportTotals`](https://github.com/codecov/shared/blob/e97a9f422a6e224b315d6dc3821f9f5ebe9b2ddd/shared/reports/types.py#L30-L45)
 /// - [`SessionTotalsArray`](https://github.com/codecov/shared/blob/e97a9f422a6e224b315d6dc3821f9f5ebe9b2ddd/shared/reports/types.py#L263-L272)
 ///
-/// `SessionTotalsArray` will normally be a dict mapping a session ID to a
-/// `SessionTotals` (which is just a type alias for `ReportTotals`) but there is
-/// a legacy format.
+/// `SessionTotalsArray` no longer exists, but older reports may still have
+/// it. It's a dict mapping a session ID to a `SessionTotals` (which is just
+/// a type alias for `ReportTotals`), plus a "meta" key with extra information
+/// such as how many sessions there are in the map. There's an even older
+/// format which is just a flat list. In any case, we ignore the field now
+/// and always write out `null`.
 ///
 /// Input example:
 /// ```notrust
@@ -64,7 +67,7 @@ pub type ReportOutputStream<S, R, B> = Stateful<S, ReportBuilderCtx<R, B>>;
 ///             0,  # > complexity_total
 ///             0   # > diff
 ///         ],
-///         { # session totals
+///         { # session totals (usually null nowadays)
 ///             "0": [ # > key: session id
 ///                 0,  # > files
 ///                 45, # > lines
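
The two legacy `session_totals` shapes that the updated doc comment describes can be sketched like this (illustrative only; the exact `"meta"` layout and the abbreviated totals values are assumptions, not taken from this diff):

```rust
use serde_json::json;

fn main() {
    // Newer legacy shape: a dict mapping session id -> SessionTotals, plus a
    // "meta" key recording how many sessions the map holds (layout assumed).
    let dict_form = json!({
        "0": [0, 45, 35, 10], // abbreviated ReportTotals-style list
        "meta": { "session_count": 1 }
    });

    // Even older legacy shape: a flat list indexed by session id.
    let list_form = json!([[0, 45, 35, 10]]);

    // After this PR, the field is ignored on input and always written as null.
    let written = json!(null);

    println!("{dict_form}\n{list_form}\n{written}");
}
```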
src/report/pyreport/queries/files_to_report_json.sql · 111 changes: 38 additions & 73 deletions
@@ -1,13 +1,25 @@
 -- Determine whether each `coverage_sample` record is a hit/miss/partial/skip.
 -- Normalize complexity fields.
 with samples_categorized as (
     select
         coverage_sample.raw_upload_id,
         coverage_sample.local_sample_id,
         coverage_sample.source_file_id,
         coverage_sample.line_no,
         coverage_sample.coverage_type,
-        iif(coverage_sample.hits > 0 or coverage_sample.hit_branches >= coverage_sample.total_branches, 1, 0) as hit,
-        iif(coverage_sample.hits = 0 or coverage_sample.hit_branches = 0, 1, 0) as miss,
-        iif(coverage_sample.hit_branches > 0 and coverage_sample.hit_branches < coverage_sample.total_branches, 1, 0) as partial,
+        iif(
+            coverage_sample.hits > 0 or coverage_sample.hit_branches >= coverage_sample.total_branches,
+            2, -- hit
+            iif(
+                coverage_sample.hits = 0 or coverage_sample.hit_branches = 0,
+                0, -- miss
+                iif(
+                    coverage_sample.hit_branches > 0 and coverage_sample.hit_branches < coverage_sample.total_branches,
+                    1, -- partial
+                    -1 -- skipped
+                )
+            )
+        ) as coverage_status,
+        -- If a pyreport only has total_complexity, it will basically swap total_complexity and hit_complexity_paths
+        -- when pre-computing its totals/statistics. This logic performs that swap here.
         iif(method_data.hit_complexity_paths is null, method_data.total_complexity, method_data.hit_complexity_paths) as hit_complexity_paths,
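
A rough Rust mirror of what the nested `iif()` computes, for readers who find SQL tri-state logic hard to scan (a sketch, not code from this crate; `Option` stands in for SQL NULL, where any comparison involving NULL is not true):

```rust
/// skip < miss < partial < hit, so a plain max() later picks the
/// "most covered" status recorded for a line.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
enum CoverageStatus {
    Skipped = -1,
    Miss = 0,
    Partial = 1,
    Hit = 2,
}

fn categorize(hits: Option<i64>, hit_branches: Option<i64>, total_branches: Option<i64>) -> CoverageStatus {
    let gt0 = |v: Option<i64>| v.map_or(false, |x| x > 0);
    let eq0 = |v: Option<i64>| v.map_or(false, |x| x == 0);
    let full = hit_branches.zip(total_branches).map_or(false, |(h, t)| h >= t);
    let short = hit_branches.zip(total_branches).map_or(false, |(h, t)| h < t);

    if gt0(hits) || full {
        CoverageStatus::Hit
    } else if eq0(hits) || eq0(hit_branches) {
        CoverageStatus::Miss
    } else if gt0(hit_branches) && short {
        CoverageStatus::Partial
    } else {
        CoverageStatus::Skipped // e.g. every column NULL
    }
}

/// The complexity normalization from the same CTE: if only total_complexity
/// was reported, pyreport treats it as hit_complexity_paths, so swap it in.
fn hit_complexity_paths(hit: Option<i64>, total: Option<i64>) -> Option<i64> {
    hit.or(total) // iif(hit is null, total, hit)
}

fn main() {
    assert_eq!(categorize(Some(3), None, None), CoverageStatus::Hit);
    assert_eq!(categorize(None, Some(1), Some(2)), CoverageStatus::Partial);
    assert_eq!(categorize(None, None, None), CoverageStatus::Skipped);
    assert_eq!(hit_complexity_paths(None, Some(4)), Some(4));
}
```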
@@ -20,6 +32,8 @@ on
     method_data.raw_upload_id = coverage_sample.raw_upload_id
     and method_data.local_sample_id = coverage_sample.local_sample_id
 ),
+-- Compute the chunks file index of each `source_file` record. Must match the
+-- corresponding logic in `samples_to_chunks.sql`.
 source_files_with_index as (
     select
         row_number() over (order by source_file.id) - 1 as chunk_index,
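
The chunks-index rule the new comment references is simple enough to state in Rust (hypothetical helper, not from the crate): files ranked by `source_file.id` get consecutive zero-based indices, exactly `row_number() over (order by source_file.id) - 1`:

```rust
// Hypothetical illustration of the chunk-index assignment above.
fn chunk_indices(mut source_file_ids: Vec<i64>) -> Vec<(usize, i64)> {
    source_file_ids.sort_unstable();
    source_file_ids.into_iter().enumerate().collect() // (chunk_index, id)
}

fn main() {
    // Ids need not be dense; the index is just the rank in id order.
    assert_eq!(chunk_indices(vec![12, 3, 7]), vec![(0, 3), (1, 7), (2, 12)]);
}
```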
@@ -28,91 +42,42 @@ select
 from
     source_file
 ),
-file_sessions_flattened as (
+-- Each (source_file, line) has potentially many samples from different sessions
+-- and this CTE flattens them into a single record per (source_file, line).
+file_lines_flattened as (
     select
         samples_categorized.source_file_id,
         samples_categorized.line_no,
         samples_categorized.coverage_type,
-        max(samples_categorized.hit) as hit,
-        max(samples_categorized.miss) as miss,
-        max(samples_categorized.partial) as partial,
+        -- We want to pick the "most covered" status for this line. Since 2 is hit,
+        -- 1 is partial, 0 is miss, and -1 is skip, a simple `max()` does the trick.
+        -- If it was ever recorded as a full hit, that will override any partials
+        -- or misses, and so on.
+        max(samples_categorized.coverage_status) as coverage_status,
         max(samples_categorized.hit_complexity_paths) as hit_complexity_paths,
         max(samples_categorized.total_complexity) as total_complexity
     from
         samples_categorized
     group by
         1, 2, 3
-),
-file_totals as (
-    select
-        file_sessions_flattened.source_file_id,
-        count(*) as file_lines,
-        sum(file_sessions_flattened.hit) as file_hits,
-        sum(file_sessions_flattened.miss) as file_misses,
-        sum(file_sessions_flattened.partial) as file_partials,
-        sum(iif(file_sessions_flattened.coverage_type = 'b', 1, 0)) as file_branches,
-        sum(iif(file_sessions_flattened.coverage_type = 'm', 1, 0)) as file_methods,
-        coalesce(sum(file_sessions_flattened.hit_complexity_paths), 0) as file_hit_complexity_paths,
-        coalesce(sum(file_sessions_flattened.total_complexity), 0) as file_total_complexity
-    from
-        file_sessions_flattened
-    group by
-        1
-),
-session_indices as (
-    select
-        cast(row_number() over (order by raw_upload.id) - 1 as text) as session_index,
-        raw_upload.id as raw_upload_id
-    from
-        raw_upload
-),
-file_session_totals as (
-    select
-        session_indices.session_index,
-        session_indices.raw_upload_id,
-        samples_categorized.source_file_id,
-        count(*) as file_session_lines,
-        sum(samples_categorized.hit) as file_session_hits,
-        sum(samples_categorized.miss) as file_session_misses,
-        sum(samples_categorized.partial) as file_session_partials,
-        coalesce(sum(samples_categorized.hit_complexity_paths), 0) as file_session_hit_complexity_paths,
-        coalesce(sum(samples_categorized.total_complexity), 0) as file_session_total_complexity
-    from
-        samples_categorized
-    left join
-        session_indices
-    on
-        session_indices.raw_upload_id = samples_categorized.raw_upload_id
-    group by
-        1, 2, 3
 )
 select
     source_files_with_index.chunk_index,
     source_files_with_index.id,
     source_files_with_index.path,
-    file_totals.file_lines,
-    file_totals.file_hits,
-    file_totals.file_misses,
-    file_totals.file_partials,
-    file_totals.file_branches,
-    file_totals.file_methods,
-    file_totals.file_hit_complexity_paths,
-    file_totals.file_total_complexity,
-    file_session_totals.session_index,
-    file_session_totals.file_session_lines,
-    file_session_totals.file_session_hits,
-    file_session_totals.file_session_misses,
-    file_session_totals.file_session_partials,
-    file_session_totals.file_session_hit_complexity_paths,
-    file_session_totals.file_session_total_complexity
+    count(*) as file_lines,
+    sum(iif(file_lines_flattened.coverage_status = 2, 1, 0)) as file_hits,
+    sum(iif(file_lines_flattened.coverage_status = 0, 1, 0)) as file_misses,
+    sum(iif(file_lines_flattened.coverage_status = 1, 1, 0)) as file_partials,
+    sum(iif(file_lines_flattened.coverage_type = 'b', 1, 0)) as file_branches,
+    sum(iif(file_lines_flattened.coverage_type = 'm', 1, 0)) as file_methods,
+    coalesce(sum(file_lines_flattened.hit_complexity_paths), 0) as file_hit_complexity_paths,
+    coalesce(sum(file_lines_flattened.total_complexity), 0) as file_total_complexity
 from
-    source_files_with_index
+    file_lines_flattened
 left join
-    file_totals
-on
-    source_files_with_index.id = file_totals.source_file_id
-left join
-    file_session_totals
+    source_files_with_index
 on
-    source_files_with_index.id = file_session_totals.source_file_id;
-
+    file_lines_flattened.source_file_id = source_files_with_index.id
+group by
+    1, 2, 3
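
Putting the rewritten query together, a compact sketch (illustrative, reusing the -1/0/1/2 status encoding) of what `file_lines_flattened` plus the final aggregation compute for one file: flatten each line's per-session statuses with `max()`, then count statuses per file:

```rust
// (lines, hits, misses, partials) for one file, given each line's recorded
// statuses across all sessions that touched it.
fn file_totals(lines: &[Vec<i8>]) -> (usize, usize, usize, usize) {
    let flattened: Vec<i8> = lines
        .iter()
        .filter_map(|statuses| statuses.iter().copied().max()) // most covered wins
        .collect();
    let count = |status: i8| flattened.iter().filter(|&&s| s == status).count();
    (flattened.len(), count(2), count(0), count(1))
}

fn main() {
    // Line 1 was a miss in one session and a hit in another; line 2 was a
    // partial in its only session. The old query also kept per-session rows
    // to feed session_totals; since that field is now always null, only this
    // flattened per-file view is needed.
    assert_eq!(file_totals(&[vec![0, 2], vec![1]]), (2, 1, 0, 1));
}
```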