Skip to content

Commit

Permalink
Fix compilation of parquet-tools (#161)
Browse files (browse the repository at this point in the history)
  • Loading branch information
jhorstmann authored Jul 25, 2022
1 parent 82b1115 commit 9d7f4d8
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 40 deletions.
2 changes: 1 addition & 1 deletion parquet-tools/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ name = "parquet_tools"
path = "src/main.rs"

[dependencies]
parquet2 = { version = "0.1", path = "../" }
parquet2 = { version = "0.14", path = "../" }
clap = {version = "2.33", features = ["yaml"]}
26 changes: 14 additions & 12 deletions parquet-tools/src/lib/dump.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
//! Subcommand `dump`. This subcommand shows the parquet metadata information
use parquet2::{
read::{
get_page_iterator, read_metadata, BinaryPageDict, CompressedDataPage, DictPage,
FixedLenByteArrayPageDict, PrimitivePageDict,
page::{
BinaryPageDict, DataPageHeader, DictPage, FixedLenByteArrayPageDict, PrimitivePageDict,
},
read::{get_page_iterator, read_metadata},
schema::types::PhysicalType,
};

use std::{fs::File, io::Write, path::Path, sync::Arc};

use crate::{Result, SEPARATOR};
Expand Down Expand Up @@ -47,8 +48,9 @@ where
writeln!(writer, "{}", SEPARATOR)?;

for column in &columns {
let column_meta = group.column(column);
let iter = get_page_iterator(column_meta, &mut file)?;
let column_meta = &group.columns()[*column];
let iter =
get_page_iterator(column_meta, &mut file, None, Vec::with_capacity(4 * 1024))?;
for (page_ind, page) in iter.enumerate() {
let page = page?;
writeln!(
Expand All @@ -58,16 +60,16 @@ where
column,
page.uncompressed_size()
)?;
let (dict, msg_type) = match page {
CompressedDataPage::V1(page_v1) => {
if let Some(dict) = page_v1.dictionary_page {
let (dict, msg_type) = match page.header() {
DataPageHeader::V1(_) => {
if let Some(dict) = page.dictionary_page {
(dict, "PageV1")
} else {
continue;
}
}
CompressedDataPage::V2(page_v2) => {
if let Some(dict) = page_v2.dictionary_page {
DataPageHeader::V2(_) => {
if let Some(dict) = page.dictionary_page {
(dict, "PageV2")
} else {
continue;
Expand Down Expand Up @@ -127,7 +129,7 @@ where
if let Some(res) = dict.as_any().downcast_ref::<BinaryPageDict>() {
for (i, pair) in res.offsets().windows(2).enumerate().take(sample_size) {
let bytes = &res.values()[pair[0] as usize..pair[1] as usize];
let msg = std::str::from_utf8(&bytes).unwrap_or("").to_string();
let msg = String::from_utf8_lossy(bytes);

writeln!(writer, "Value: {:<10}\t{:?}", i, msg)?;
}
Expand All @@ -141,7 +143,7 @@ where
.enumerate()
.take(sample_size)
{
let msg = std::str::from_utf8(&bytes).unwrap_or("").to_string();
let msg = String::from_utf8_lossy(bytes);

writeln!(writer, "Value: {:<10}\t{:?}", i, msg)?;
}
Expand Down
40 changes: 14 additions & 26 deletions parquet-tools/src/lib/meta.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
//! Subcommand `meta`. This subcommand shows the parquet metadata information
use parquet2::{
read::read_metadata,
schema::{types::ParquetType, Statistics},
schema::{types::ParquetType, types::PrimitiveType},
statistics::Statistics,
};
use std::{fs::File, io::Write, path::Path};

use crate::{Result, SEPARATOR};
use std::sync::Arc;

// Shows meta data from the file. If the `extra` flag is available, then
// extra data that the file may contain is presented
Expand Down Expand Up @@ -73,8 +75,8 @@ where
writer,
"{:4}: {:27}{:?} {:?} DO:{} RC:{} SZ:{}/{}/{:.2} ENC:{:?}{}",
index,
c.column_descriptor().name(),
c.column_type(),
c.descriptor().path_in_schema.join("."),
c.physical_type(),
c.compression(),
c.data_page_offset(),
c.num_values(),
Expand All @@ -83,7 +85,7 @@ where
c.uncompressed_size() as f32 / c.compressed_size() as f32,
c.column_encoding(),
if show_stats {
statistics_str(c.column_statistics())
statistics_str(&c.statistics().transpose().unwrap())
} else {
"".to_string()
},
Expand All @@ -97,17 +99,17 @@ where
// String creator to print information from ParquetType
fn parquet_type_str(parquet_type: &ParquetType) -> String {
match parquet_type {
ParquetType::PrimitiveType {
ParquetType::PrimitiveType(PrimitiveType {
field_info,
logical_type,
converted_type,
physical_type,
} => {
}) => {
format!(
"{:27} {:?} {:?} P:{:?} L:{:?} C:{:?}",
field_info.name(),
field_info.repetition(),
field_info.id(),
&field_info.name,
field_info.repetition,
field_info.id,
physical_type,
logical_type,
converted_type,
Expand All @@ -121,33 +123,19 @@ fn parquet_type_str(parquet_type: &ParquetType) -> String {
} => {
format!(
"{:27} {:?} {:?} L:{:?} C:{:?}",
":",
field_info.repetition(),
field_info.id(),
logical_type,
converted_type,
":", field_info.repetition, field_info.id, logical_type, converted_type,
)
}
}
}

// Creates a string showing the column statistics.
// The max and min data are PLAIN encoded as Vec<u8>
fn statistics_str(statistics: &Option<Statistics>) -> String {
fn statistics_str(statistics: &Option<Arc<dyn Statistics>>) -> String {
match statistics {
None => "".to_string(),
Some(stats) => {
let max = stats.max_value.as_ref().map_or("".to_string(), |v| {
std::str::from_utf8(&v).unwrap_or("").to_string()
});

let min = stats.min_value.as_ref().map_or("".to_string(), |v| {
std::str::from_utf8(&v).unwrap_or("").to_string()
});

let null = stats.null_count.as_ref().map_or(0, |v| *v);

format!("ST:[max: {} min: {} null: {}]", max, min, null)
format!("ST:{:?}", stats.as_ref())
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/page/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub struct CompressedDataPage {
pub(crate) buffer: Vec<u8>,
compression: Compression,
uncompressed_page_size: usize,
pub(crate) dictionary_page: Option<Arc<dyn DictPage>>,
pub dictionary_page: Option<Arc<dyn DictPage>>,
pub(crate) descriptor: Descriptor,

// The offset and length in rows
Expand Down

0 comments on commit 9d7f4d8

Please sign in to comment.