Skip to content

Commit

Permalink
Merge #206
Browse files Browse the repository at this point in the history
206: stream reading features r=frewsxcv a=michaelkirk

- [x] I agree to follow the project's [code of conduct](https://github.com/georust/geo/blob/master/CODE_OF_CONDUCT.md).
- [x] I added an entry to `CHANGES.md` if knowledge of this change could be valuable to users.
---

This greatly reduces the peak memory usage needed to parse a GeoJSON FeatureCollection by utilizing the FeatureIterator's trick of seeking to the `"features": [` array without actually parsing the FeatureCollection. Note there are [some robustness issues](#207) with this approach, some of which can be improved, but overall I think it's worth the trade-off.

There is a small (0-1%) CPU regression in our deserialization bench due to this change, but the peak memory profile of the `stream_reader_writer` examples decreased 90% - from 2.22MiB to 237Kib.

Before:
<img width="1624" alt="Screen Shot 2022-09-02 at 2 42 04 PM" src="https://user-images.githubusercontent.com/217057/188239657-ade76677-f3e2-4750-b71d-bed0effbe215.png">

After:
<img width="1624" alt="Screen Shot 2022-09-02 at 2 40 29 PM" src="https://user-images.githubusercontent.com/217057/188239645-5274e0ff-71b2-4da6-8ac9-1f72d288c2bb.png">

Notice that overall memory allocations stayed the same (see #88 (comment)), but the peak usage is much lower since we never hold the entire data structure in memory at once.


Co-authored-by: Michael Kirk <michael.code@endoftheworl.de>
  • Loading branch information
bors[bot] and michaelkirk authored Sep 6, 2022
2 parents 0d92bd2 + 8be5030 commit 2f8ca02
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 84 deletions.
5 changes: 3 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@
geojson::ser::to_feature_collection_string(&my_structs).unwrap();
```
* PR: <https://github.com/georust/geojson/pull/199>
* Added `geojson::{FeatureReader, FeatureWriter}` to stream the reading/writing of your custom struct to and from GeoJSON.
* PR: <https://github.com/georust/geojson/pull/199>
* Added `geojson::{FeatureReader, FeatureWriter}` to stream the reading/writing of your custom struct to and from GeoJSON, greatly reducing the memory required to process a FeatureCollection.
* PR: <https://github.com/georust/geojson/pull/199>
* PR: <https://github.com/georust/geojson/pull/205>
* PR: <https://github.com/georust/geojson/pull/206>
* Added IntoIter implementation for FeatureCollection.
* <https://github.com/georust/geojson/pull/196>
* Added `geojson::Result<T>`.
Expand Down
82 changes: 11 additions & 71 deletions src/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,21 +150,17 @@ pub fn deserialize_feature_collection<'de, T>(
where
T: Deserialize<'de>,
{
let mut deserializer = serde_json::Deserializer::from_reader(feature_collection_reader);

// PERF: rather than deserializing the entirety of the `features:` array into memory here, it'd
// be nice to stream the features. However, I ran into difficulty while trying to return any
// borrowed reference from the visitor methods (e.g. MapAccess)
let visitor = FeatureCollectionVisitor::new();
let objects = deserializer.deserialize_map(visitor)?;

Ok(objects.into_iter().map(|feature_value| {
let deserializer = feature_value.into_deserializer();
let visitor = FeatureVisitor::new();
let record: T = deserializer.deserialize_map(visitor)?;

Ok(record)
}))
#[allow(deprecated)]
let iter = crate::FeatureIterator::new(feature_collection_reader).map(
|feature_value: Result<JsonValue>| {
let deserializer = feature_value?.into_deserializer();
let visitor = FeatureVisitor::new();
let record: T = deserializer.deserialize_map(visitor)?;

Ok(record)
},
);
Ok(iter)
}

/// Build a `Vec` of structs from a GeoJson `&str`.
Expand Down Expand Up @@ -295,62 +291,6 @@ where
Ok(deserializer.deserialize_map(visitor)?)
}

struct FeatureCollectionVisitor;

impl FeatureCollectionVisitor {
fn new() -> Self {
Self
}
}

impl<'de> serde::de::Visitor<'de> for FeatureCollectionVisitor {
type Value = Vec<JsonValue>;

fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result {
write!(formatter, "a valid GeoJSON Feature object")
}

fn visit_map<A>(self, mut map_access: A) -> std::result::Result<Self::Value, A::Error>
where
A: serde::de::MapAccess<'de>,
{
let mut has_feature_collection_type = false;
let mut features = None;
while let Some((key, value)) = map_access.next_entry::<String, JsonValue>()? {
if key == "type" {
if value == JsonValue::String("FeatureCollection".to_string()) {
has_feature_collection_type = true;
} else {
return Err(Error::custom("invalid type for feature collection"));
}
} else if key == "features" {
if let JsonValue::Array(value) = value {
if features.is_some() {
return Err(Error::custom(
"Encountered more than one list of `features`",
));
}
features = Some(value);
} else {
return Err(Error::custom("`features` had unexpected value"));
}
} else {
log::warn!("foreign members are not handled by FeatureCollection deserializer")
}
}

if let Some(features) = features {
if has_feature_collection_type {
Ok(features)
} else {
Err(Error::custom("No `type` field was found"))
}
} else {
Err(Error::custom("No `features` field was found"))
}
}
}

struct FeatureVisitor<D> {
_marker: PhantomData<D>,
}
Expand Down
26 changes: 15 additions & 11 deletions src/feature_iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

use crate::{Feature, Result};

use serde::Deserialize;
use std::io;
use std::marker::PhantomData;

Expand All @@ -32,10 +33,11 @@ use std::marker::PhantomData;
/// Based on example code found at <https://github.com/serde-rs/serde/issues/903#issuecomment-297488118>.
///
/// [GeoJSON Format Specification § 3.3](https://datatracker.ietf.org/doc/html/rfc7946#section-3.3)
pub struct FeatureIterator<R> {
pub struct FeatureIterator<'de, R, D = Feature> {
reader: R,
state: State,
marker: PhantomData<Feature>,
output: PhantomData<D>,
lifetime: PhantomData<&'de ()>,
}

#[derive(Debug, Copy, Clone)]
Expand All @@ -45,17 +47,18 @@ enum State {
AfterFeatures,
}

impl<R> FeatureIterator<R> {
impl<'de, R, D> FeatureIterator<'de, R, D> {
pub fn new(reader: R) -> Self {
FeatureIterator {
reader,
state: State::BeforeFeatures,
marker: PhantomData,
output: PhantomData,
lifetime: PhantomData,
}
}
}

impl<R> FeatureIterator<R>
impl<'de, R, D> FeatureIterator<'de, R, D>
where
R: io::Read,
{
Expand Down Expand Up @@ -98,11 +101,12 @@ where
}
}

impl<R> Iterator for FeatureIterator<R>
impl<'de, R, D> Iterator for FeatureIterator<'de, R, D>
where
R: io::Read,
D: Deserialize<'de>,
{
type Item = Result<Feature>;
type Item = Result<D>;

fn next(&mut self) -> Option<Self::Item> {
match self.seek_to_next_feature() {
Expand All @@ -124,9 +128,9 @@ where

#[cfg(test)]
mod tests {
use crate::FeatureIterator;
use crate::Geometry;
use crate::Value;
use super::*;
use crate::{Geometry, Value};

use std::io::BufReader;

fn fc() -> &'static str {
Expand Down Expand Up @@ -179,7 +183,7 @@ mod tests {

#[test]
fn stream_read_test() {
let mut fi = FeatureIterator::new(BufReader::new(fc().as_bytes()));
let mut fi = FeatureIterator::<_, Feature>::new(BufReader::new(fc().as_bytes()));
assert_eq!(
Geometry {
bbox: None,
Expand Down

0 comments on commit 2f8ca02

Please sign in to comment.