Skip to content

Commit

Permalink
Set file_size attribute directly from file metadata (#576)
Browse files Browse the repository at this point in the history
DataFusion's PartitionedFile struct relies on a correct file URL and
file size. The file size is used to calculate the offset for reading
the Parquet footer, so a wrong file size can cause queries to fail.

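This PR fixes that by setting file_size directly from the file's metadata instead of summing the row groups' compressed sizes, which does not account for the footer and other bytes outside the row groups. Fixes #575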
  • Loading branch information
trueleo authored Dec 15, 2023
1 parent f289529 commit 7c83641
Showing 1 changed file with 2 additions and 3 deletions.
5 changes: 2 additions & 3 deletions server/src/catalog/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ pub fn create_from_parquet_file(
};

let file = std::fs::File::open(fs_file_path)?;
manifest_file.file_size = file.metadata()?.len();

let file = parquet::file::serialized_reader::SerializedFileReader::new(file)?;
let file_meta = file.metadata().file_metadata();
let row_groups = file.metadata().row_groups();
Expand All @@ -105,9 +107,6 @@ pub fn create_from_parquet_file(
manifest_file.ingestion_size = row_groups
.iter()
.fold(0, |acc, x| acc + x.total_byte_size() as u64);
manifest_file.file_size = row_groups
.iter()
.fold(0, |acc, x| acc + x.compressed_size() as u64);

let columns = column_statistics(row_groups);
manifest_file.columns = columns.into_values().collect();
Expand Down

0 comments on commit 7c83641

Please sign in to comment.