From 9d7f4d8849d68457819f009ca246b898d1354189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Mon, 25 Jul 2022 16:28:32 +0200 Subject: [PATCH] Fix compilation of parquet-tools (#161) --- parquet-tools/Cargo.toml | 2 +- parquet-tools/src/lib/dump.rs | 26 ++++++++++++----------- parquet-tools/src/lib/meta.rs | 40 ++++++++++++----------------------- src/page/mod.rs | 2 +- 4 files changed, 30 insertions(+), 40 deletions(-) diff --git a/parquet-tools/Cargo.toml b/parquet-tools/Cargo.toml index c50db4441..982d5fcd3 100644 --- a/parquet-tools/Cargo.toml +++ b/parquet-tools/Cargo.toml @@ -15,5 +15,5 @@ name = "parquet_tools" path = "src/main.rs" [dependencies] -parquet2 = { version = "0.1", path = "../" } +parquet2 = { version = "0.14", path = "../" } clap = {version = "2.33", features = ["yaml"]} diff --git a/parquet-tools/src/lib/dump.rs b/parquet-tools/src/lib/dump.rs index 59339963e..ee2a7972e 100644 --- a/parquet-tools/src/lib/dump.rs +++ b/parquet-tools/src/lib/dump.rs @@ -1,11 +1,12 @@ //! Subcommand `dump`. 
This subcommand shows the parquet metadata information use parquet2::{ - read::{ - get_page_iterator, read_metadata, BinaryPageDict, CompressedDataPage, DictPage, - FixedLenByteArrayPageDict, PrimitivePageDict, + page::{ + BinaryPageDict, DataPageHeader, DictPage, FixedLenByteArrayPageDict, PrimitivePageDict, }, + read::{get_page_iterator, read_metadata}, schema::types::PhysicalType, }; + use std::{fs::File, io::Write, path::Path, sync::Arc}; use crate::{Result, SEPARATOR}; @@ -47,8 +48,9 @@ where writeln!(writer, "{}", SEPARATOR)?; for column in &columns { - let column_meta = group.column(column); - let iter = get_page_iterator(column_meta, &mut file)?; + let column_meta = &group.columns()[*column]; + let iter = + get_page_iterator(column_meta, &mut file, None, Vec::with_capacity(4 * 1024))?; for (page_ind, page) in iter.enumerate() { let page = page?; writeln!( @@ -58,16 +60,16 @@ where column, page.uncompressed_size() )?; - let (dict, msg_type) = match page { - CompressedDataPage::V1(page_v1) => { - if let Some(dict) = page_v1.dictionary_page { + let (dict, msg_type) = match page.header() { + DataPageHeader::V1(_) => { + if let Some(dict) = page.dictionary_page { (dict, "PageV1") } else { continue; } } - CompressedDataPage::V2(page_v2) => { - if let Some(dict) = page_v2.dictionary_page { + DataPageHeader::V2(_) => { + if let Some(dict) = page.dictionary_page { (dict, "PageV2") } else { continue; @@ -127,7 +129,7 @@ where if let Some(res) = dict.as_any().downcast_ref::<BinaryPageDict>() { for (i, pair) in res.offsets().windows(2).enumerate().take(sample_size) { let bytes = &res.values()[pair[0] as usize..pair[1] as usize]; - let msg = std::str::from_utf8(&bytes).unwrap_or("").to_string(); + let msg = String::from_utf8_lossy(bytes); writeln!(writer, "Value: {:<10}\t{:?}", i, msg)?; } @@ -141,7 +143,7 @@ where .enumerate() .take(sample_size) { - let msg = std::str::from_utf8(&bytes).unwrap_or("").to_string(); + let msg = String::from_utf8_lossy(bytes); writeln!(writer, "Value: 
{:<10}\t{:?}", i, msg)?; } diff --git a/parquet-tools/src/lib/meta.rs b/parquet-tools/src/lib/meta.rs index 16b60896a..6e000c764 100644 --- a/parquet-tools/src/lib/meta.rs +++ b/parquet-tools/src/lib/meta.rs @@ -1,11 +1,13 @@ //! Subcommand `meta`. This subcommand shows the parquet metadata information use parquet2::{ read::read_metadata, - schema::{types::ParquetType, Statistics}, + schema::{types::ParquetType, types::PrimitiveType}, + statistics::Statistics, }; use std::{fs::File, io::Write, path::Path}; use crate::{Result, SEPARATOR}; +use std::sync::Arc; // Shows meta data from the file. If the `extra` flag is available, then // extra data that the file may contain is presented @@ -73,8 +75,8 @@ where writer, "{:4}: {:27}{:?} {:?} DO:{} RC:{} SZ:{}/{}/{:.2} ENC:{:?}{}", index, - c.column_descriptor().name(), - c.column_type(), + c.descriptor().path_in_schema.join("."), + c.physical_type(), c.compression(), c.data_page_offset(), c.num_values(), @@ -83,7 +85,7 @@ where c.uncompressed_size() as f32 / c.compressed_size() as f32, c.column_encoding(), if show_stats { - statistics_str(c.column_statistics()) + statistics_str(&c.statistics().transpose().unwrap()) } else { "".to_string() }, @@ -97,17 +99,17 @@ where // String creator to print information from ParquetType fn parquet_type_str(parquet_type: &ParquetType) -> String { match parquet_type { - ParquetType::PrimitiveType { + ParquetType::PrimitiveType(PrimitiveType { field_info, logical_type, converted_type, physical_type, - } => { + }) => { format!( "{:27} {:?} {:?} P:{:?} L:{:?} C:{:?}", - field_info.name(), - field_info.repetition(), - field_info.id(), + &field_info.name, + field_info.repetition, + field_info.id, physical_type, logical_type, converted_type, @@ -121,11 +123,7 @@ fn parquet_type_str(parquet_type: &ParquetType) -> String { } => { format!( "{:27} {:?} {:?} L:{:?} C:{:?}", - ":", - field_info.repetition(), - field_info.id(), - logical_type, - converted_type, + ":", field_info.repetition, 
field_info.id, logical_type, converted_type, ) } } @@ -133,21 +131,11 @@ fn parquet_type_str(parquet_type: &ParquetType) -> String { // Creates a string showing the column statistics. // The max and min data are PLAIN encoded as Vec<u8> -fn statistics_str(statistics: &Option<Statistics>) -> String { +fn statistics_str(statistics: &Option<Arc<dyn Statistics>>) -> String { match statistics { None => "".to_string(), Some(stats) => { - let max = stats.max_value.as_ref().map_or("".to_string(), |v| { - std::str::from_utf8(&v).unwrap_or("").to_string() - }); - - let min = stats.min_value.as_ref().map_or("".to_string(), |v| { - std::str::from_utf8(&v).unwrap_or("").to_string() - }); - - let null = stats.null_count.as_ref().map_or(0, |v| *v); - - format!("ST:[max: {} min: {} null: {}]", max, min, null) + format!("ST:{:?}", stats.as_ref()) } } } diff --git a/src/page/mod.rs b/src/page/mod.rs index ccf475899..299fdb78b 100644 --- a/src/page/mod.rs +++ b/src/page/mod.rs @@ -25,7 +25,7 @@ pub struct CompressedDataPage { pub(crate) buffer: Vec<u8>, compression: Compression, uncompressed_page_size: usize, - pub(crate) dictionary_page: Option<Arc<dyn DictPage>>, + pub dictionary_page: Option<Arc<dyn DictPage>>, pub(crate) descriptor: Descriptor, // The offset and length in rows