-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d2a6cff
commit 5ec7e09
Showing
48 changed files
with
1,002 additions
and
292 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
[package] | ||
name = "s3" | ||
version = "0.1.0" | ||
edition = "2018" | ||
|
||
[dependencies] | ||
parquet2 = { path = "../../" } | ||
rust-s3 = { version = "0.27.0-rc4", features = ["blocking", "futures"] } | ||
futures = "0.3" | ||
tokio = { version = "1.0.0", features = ["macros", "rt-multi-thread"] } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
use std::sync::Arc; | ||
|
||
use futures::{ | ||
future::BoxFuture, | ||
pin_mut, | ||
StreamExt | ||
}; | ||
use parquet2::{ | ||
error::Result, | ||
read::{get_page_stream, read_metadata_async}, | ||
statistics::BinaryStatistics, | ||
}; | ||
use s3::Bucket; | ||
|
||
mod stream; | ||
use stream::{RangedStreamer, SeekOutput}; | ||
|
||
#[tokio::main] | ||
async fn main() -> Result<()> { | ||
let bucket_name = "ursa-labs-taxi-data"; | ||
let region = "us-east-2".parse().unwrap(); | ||
let bucket = Bucket::new_public(bucket_name, region).unwrap(); | ||
let path = "2009/01/data.parquet".to_string(); | ||
|
||
let (data, _) = bucket.head_object(&path).await.unwrap(); | ||
let length = data.content_length.unwrap() as usize; | ||
|
||
let range_get = std::sync::Arc::new(move |start: u64, length: usize| { | ||
let bucket = bucket.clone(); | ||
let path = path.clone(); | ||
Box::pin(async move { | ||
let bucket = bucket.clone(); | ||
let path = path.clone(); | ||
let (mut data, _) = bucket | ||
.get_object_range(&path, start, Some(start + length as u64)) | ||
.await | ||
.map_err(|x| std::io::Error::new(std::io::ErrorKind::Other, x.to_string()))?; | ||
|
||
data.truncate(length); | ||
Ok(SeekOutput { start, data }) | ||
}) as BoxFuture<'static, std::io::Result<SeekOutput>> | ||
}); | ||
|
||
let mut reader = RangedStreamer::new(length, 1024 * 1024, range_get); | ||
|
||
let metadata = read_metadata_async(&mut reader).await?; | ||
|
||
// metadata | ||
println!("{}", metadata.num_rows); | ||
|
||
// * first row group | ||
// * first column | ||
// * do not skip any pages | ||
let pages = get_page_stream(&metadata, 0, 0, &mut reader, vec![], Arc::new(|_, _| true)).await?; | ||
|
||
pin_mut!(pages); // needed for iteration | ||
|
||
let first_page = pages.next().await.unwrap()?; | ||
// the page statistics | ||
// first unwrap: they exist | ||
let a = first_page.statistics().unwrap()?; | ||
let a = a.as_any().downcast_ref::<BinaryStatistics>().unwrap(); | ||
println!("{:?}", a.min_value); | ||
println!("{:?}", a.max_value); | ||
println!("{:?}", a.null_count); | ||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
// Special thanks to Alice for the help: https://users.rust-lang.org/t/63019/6 | ||
use std::io::{Result, SeekFrom}; | ||
use std::pin::Pin; | ||
|
||
use futures::{ | ||
future::BoxFuture, | ||
io::{AsyncRead, AsyncSeek}, | ||
Future, | ||
}; | ||
|
||
pub struct RangedStreamer { | ||
pos: u64, | ||
length: u64, // total size | ||
state: State, | ||
range_get: F, | ||
min_request_size: usize, // requests have at least this size | ||
} | ||
|
||
enum State { | ||
HasChunk(SeekOutput), | ||
Seeking(BoxFuture<'static, std::io::Result<SeekOutput>>), | ||
} | ||
|
||
/// A contiguous run of bytes together with the offset it starts at.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SeekOutput {
    /// Offset of the first byte of `data`.
    pub start: u64,
    /// The bytes themselves.
    pub data: Vec<u8>,
}
|
||
pub type F = std::sync::Arc< | ||
dyn Fn(u64, usize) -> BoxFuture<'static, std::io::Result<SeekOutput>> + Send + Sync, | ||
>; | ||
|
||
impl RangedStreamer { | ||
pub fn new(length: usize, min_request_size: usize, range_get: F) -> Self { | ||
let length = length as u64; | ||
Self { | ||
pos: 0, | ||
length, | ||
state: State::HasChunk(SeekOutput { | ||
start: 0, | ||
data: vec![], | ||
}), | ||
range_get, | ||
min_request_size, | ||
} | ||
} | ||
} | ||
|
||
/// Returns whether `test_interval` lies entirely inside `a`.
///
/// Both arguments are `(start, length)` pairs. An interval whose end does not fit in a `usize`
/// is never considered included, instead of overflowing.
fn range_includes(a: (usize, usize), test_interval: (usize, usize)) -> bool {
    let (a_start, a_len) = a;
    let (test_start, test_len) = test_interval;
    if test_start < a_start {
        return false;
    }
    let test_end = match test_start.checked_add(test_len) {
        Some(end) => end,
        // The tested interval extends past the largest representable offset.
        None => return false,
    };
    // Saturating keeps an (unrealistically) huge `a` from wrapping around to a small end.
    let a_end = a_start.saturating_add(a_len);
    test_end <= a_end
}
|
||
impl AsyncRead for RangedStreamer { | ||
fn poll_read( | ||
mut self: std::pin::Pin<&mut Self>, | ||
cx: &mut std::task::Context<'_>, | ||
buf: &mut [u8], | ||
) -> std::task::Poll<Result<usize>> { | ||
let requested_range = (self.pos as usize, buf.len()); | ||
let min_request_size = self.min_request_size; | ||
match &mut self.state { | ||
State::HasChunk(output) => { | ||
let existing_range = (output.start as usize, output.data.len()); | ||
if range_includes(existing_range, requested_range) { | ||
let offset = requested_range.0 - existing_range.0; | ||
buf.copy_from_slice(&output.data[offset..offset + buf.len()]); | ||
self.pos += buf.len() as u64; | ||
std::task::Poll::Ready(Ok(buf.len())) | ||
} else { | ||
let start = requested_range.0 as u64; | ||
let length = std::cmp::max(min_request_size, requested_range.1); | ||
let future = (self.range_get)(start, length); | ||
self.state = State::Seeking(Box::pin(future)); | ||
self.poll_read(cx, buf) | ||
} | ||
} | ||
State::Seeking(ref mut future) => match Pin::new(future).poll(cx) { | ||
std::task::Poll::Ready(v) => { | ||
match v { | ||
Ok(output) => self.state = State::HasChunk(output), | ||
Err(e) => return std::task::Poll::Ready(Err(e)), | ||
}; | ||
self.poll_read(cx, buf) | ||
} | ||
std::task::Poll::Pending => std::task::Poll::Pending, | ||
}, | ||
} | ||
} | ||
} | ||
|
||
impl AsyncSeek for RangedStreamer { | ||
fn poll_seek( | ||
mut self: std::pin::Pin<&mut Self>, | ||
_: &mut std::task::Context<'_>, | ||
pos: std::io::SeekFrom, | ||
) -> std::task::Poll<Result<u64>> { | ||
match pos { | ||
SeekFrom::Start(pos) => self.pos = pos, | ||
SeekFrom::End(pos) => self.pos = (self.length as i64 + pos) as u64, | ||
SeekFrom::Current(pos) => self.pos = (self.pos as i64 + pos) as u64, | ||
}; | ||
std::task::Poll::Ready(Ok(self.pos)) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.