From 0d605a3b66cc5ae0716a2bd1234d81c33be8c0c9 Mon Sep 17 00:00:00 2001 From: Michael Kirk Date: Tue, 10 May 2022 15:12:14 -0700 Subject: [PATCH 1/3] csv writer --- geozero-cli/src/main.rs | 6 +- geozero/src/csv/csv_writer.rs | 408 ++++++++++++++++++++++++++++++++++ geozero/src/csv/mod.rs | 27 +++ geozero/src/lib.rs | 2 + 4 files changed, 442 insertions(+), 1 deletion(-) create mode 100644 geozero/src/csv/csv_writer.rs diff --git a/geozero-cli/src/main.rs b/geozero-cli/src/main.rs index e96f623f..848f401b 100644 --- a/geozero-cli/src/main.rs +++ b/geozero-cli/src/main.rs @@ -1,6 +1,6 @@ use clap::Parser; use flatgeobuf::*; -use geozero::csv::CsvReader; +use geozero::csv::{CsvReader, CsvWriter}; use geozero::error::Result; use geozero::geojson::{GeoJsonReader, GeoJsonWriter}; use geozero::svg::SvgWriter; @@ -119,6 +119,10 @@ fn process(args: Cli) -> Result<()> { let mut processor = WktWriter::new(&mut fout); transform(args, &mut processor)?; } + Some("csv") => { + let mut processor = CsvWriter::new(&mut fout); + transform(args, &mut processor)?; + } _ => panic!("Unkown output file extension"), } Ok(()) diff --git a/geozero/src/csv/csv_writer.rs b/geozero/src/csv/csv_writer.rs new file mode 100644 index 00000000..2427083d --- /dev/null +++ b/geozero/src/csv/csv_writer.rs @@ -0,0 +1,408 @@ +use crate::error::Result; +use crate::{ColumnValue, CoordDimensions, FeatureProcessor, GeomProcessor, PropertyProcessor}; +use buffering_wkt_writer::BufferingWktWriter; + +use std::io::Write; + +pub struct CsvWriter<'w, W: Write> { + csv: csv::Writer<&'w mut W>, + headers: Vec, + has_written_first_record: bool, + current_row_props: Vec, + wkt_writer: BufferingWktWriter, +} + +impl<'w, W: Write> CsvWriter<'w, W> { + pub fn new(out: &'w mut W) -> Self { + let wkt_writer = BufferingWktWriter::new(); + Self { + csv: csv::Writer::from_writer(out), + headers: vec!["geometry".to_string()], + has_written_first_record: false, + current_row_props: vec![], + wkt_writer, + } + } + + pub fn set_dims(&mut self, dims: CoordDimensions) { + self.wkt_writer.dims = dims; + } +} + +impl FeatureProcessor for CsvWriter<'_, W> { + fn dataset_begin(&mut self, _name: Option<&str>) -> Result<()> { + debug_assert_eq!(self.headers, &["geometry"]); + Ok(()) + } + fn dataset_end(&mut self) -> Result<()> { + self.headers = vec!["geometry".to_string()]; + Ok(()) + } + fn feature_begin(&mut self, _idx: u64) -> Result<()> { + debug_assert!(self.current_row_props.is_empty()); + Ok(()) + } + + fn feature_end(&mut self, _idx: u64) -> Result<()> { + if !self.has_written_first_record { + self.has_written_first_record = true; + self.csv.write_record(self.headers.clone())?; + } + + let geom = self.wkt_writer.bytes(); + self.csv.write_field(geom)?; + self.wkt_writer.clear(); + + for field in &self.current_row_props { + self.csv.write_field(field)?; + } + self.csv.write_record(None::<&[u8]>)?; + self.current_row_props.clear(); + + Ok(()) + } + fn properties_begin(&mut self) -> Result<()> { + debug_assert!(self.current_row_props.is_empty()); + Ok(()) + } + fn properties_end(&mut self) -> Result<()> { + Ok(()) + } + fn geometry_begin(&mut self) -> Result<()> { + debug_assert!(self.wkt_writer.bytes().is_empty()); + Ok(()) + } + fn geometry_end(&mut self) -> Result<()> { + Ok(()) + } +} + +impl PropertyProcessor for CsvWriter<'_, W> { + fn property(&mut self, i: usize, colname: &str, colval: &ColumnValue) -> Result { + // TODO: support mis-ordered properties? + if self.has_written_first_record { + assert_eq!( + colname, + &self.headers[i + 1], + "CSV features must all have the same column names" + ); + } else { + self.headers.push(colname.to_string()); + } + + // TODO: support non-string colval + self.current_row_props.push(colval.to_string()); + Ok(false) + } +} + +impl GeomProcessor for CsvWriter<'_, W> { + fn dimensions(&self) -> CoordDimensions { + self.wkt_writer.dimensions() + } + fn xy(&mut self, x: f64, y: f64, idx: usize) -> Result<()> { + self.wkt_writer.xy(x, y, idx) + } + fn coordinate( + &mut self, + x: f64, + y: f64, + z: Option, + m: Option, + t: Option, + tm: Option, + idx: usize, + ) -> Result<()> { + self.wkt_writer.coordinate(x, y, z, m, t, tm, idx) + } + fn point_begin(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.point_begin(idx) + } + fn point_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.point_end(idx) + } + fn empty_point(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.empty_point(idx) + } + fn multipoint_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.multipoint_begin(size, idx) + } + fn multipoint_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.multipoint_end(idx) + } + fn linestring_begin(&mut self, tagged: bool, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.linestring_begin(tagged, size, idx) + } + fn linestring_end(&mut self, tagged: bool, idx: usize) -> Result<()> { + self.wkt_writer.linestring_end(tagged, idx) + } + fn multilinestring_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.multilinestring_begin(size, idx) + } + fn multilinestring_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.multilinestring_end(idx) + } + fn polygon_begin(&mut self, tagged: bool, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.polygon_begin(tagged, size, idx) + } + fn polygon_end(&mut self, tagged: bool, idx: usize) -> Result<()> { + self.wkt_writer.polygon_end(tagged, idx) + } + fn multipolygon_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.multipolygon_begin(size, idx) + } + fn multipolygon_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.multipolygon_end(idx) + } + fn geometrycollection_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.geometrycollection_begin(size, idx) + } + fn geometrycollection_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.geometrycollection_end(idx) + } + fn circularstring_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.circularstring_begin(size, idx) + } + fn circularstring_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.circularstring_end(idx) + } + fn compoundcurve_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.compoundcurve_begin(size, idx) + } + fn compoundcurve_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.compoundcurve_end(idx) + } + fn curvepolygon_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.curvepolygon_begin(size, idx) + } + fn curvepolygon_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.curvepolygon_end(idx) + } + fn multicurve_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.multicurve_begin(size, idx) + } + fn multicurve_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.multicurve_end(idx) + } + fn multisurface_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.multisurface_begin(size, idx) + } + fn multisurface_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.multisurface_end(idx) + } + fn triangle_begin(&mut self, tagged: bool, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.triangle_begin(tagged, size, idx) + } + fn triangle_end(&mut self, tagged: bool, idx: usize) -> Result<()> { + self.wkt_writer.triangle_end(tagged, idx) + } + fn polyhedralsurface_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.polyhedralsurface_begin(size, idx) + } + fn polyhedralsurface_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.polyhedralsurface_end(idx) + } + fn tin_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer.tin_begin(size, idx) + } + fn tin_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer.tin_end(idx) + } +} + +// Writing Wkt is delegated to the WktWriter, and it's output buffered to Vec, to be eventually +// incorporated in the output of the CSV writer. Is there a better way? +mod buffering_wkt_writer { + use crate::error::Result; + use crate::{wkt::WktWriter, CoordDimensions, GeomProcessor}; + + #[derive(Default)] + pub(crate) struct BufferingWktWriter { + buffer: Vec, + pub(crate) dims: CoordDimensions, + } + + impl BufferingWktWriter { + pub(crate) fn new() -> Self { + Self::default() + } + + pub(crate) fn clear(&mut self) { + self.buffer.clear() + } + + pub(crate) fn bytes(&self) -> &[u8] { + &self.buffer + } + + fn wkt_writer(&mut self) -> WktWriter<'_, Vec> { + WktWriter::new(&mut self.buffer) + } + } + + impl GeomProcessor for BufferingWktWriter { + fn dimensions(&self) -> CoordDimensions { + self.dims + } + fn xy(&mut self, x: f64, y: f64, idx: usize) -> Result<()> { + self.wkt_writer().xy(x, y, idx) + } + fn coordinate( + &mut self, + x: f64, + y: f64, + z: Option, + m: Option, + t: Option, + tm: Option, + idx: usize, + ) -> Result<()> { + self.wkt_writer().coordinate(x, y, z, m, t, tm, idx) + } + fn point_begin(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().point_begin(idx) + } + fn point_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().point_end(idx) + } + fn empty_point(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().empty_point(idx) + } + fn multipoint_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().multipoint_begin(size, idx) + } + fn multipoint_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().multipoint_end(idx) + } + fn linestring_begin(&mut self, tagged: bool, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().linestring_begin(tagged, size, idx) + } + fn linestring_end(&mut self, tagged: bool, idx: usize) -> Result<()> { + self.wkt_writer().linestring_end(tagged, idx) + } + fn multilinestring_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().multilinestring_begin(size, idx) + } + fn multilinestring_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().multilinestring_end(idx) + } + fn polygon_begin(&mut self, tagged: bool, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().polygon_begin(tagged, size, idx) + } + fn polygon_end(&mut self, tagged: bool, idx: usize) -> Result<()> { + self.wkt_writer().polygon_end(tagged, idx) + } + fn multipolygon_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().multipolygon_begin(size, idx) + } + fn multipolygon_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().multipolygon_end(idx) + } + fn geometrycollection_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().geometrycollection_begin(size, idx) + } + fn geometrycollection_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().geometrycollection_end(idx) + } + fn circularstring_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().circularstring_begin(size, idx) + } + fn circularstring_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().circularstring_end(idx) + } + fn compoundcurve_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().compoundcurve_begin(size, idx) + } + fn compoundcurve_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().compoundcurve_end(idx) + } + fn curvepolygon_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().curvepolygon_begin(size, idx) + } + fn curvepolygon_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().curvepolygon_end(idx) + } + fn multicurve_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().multicurve_begin(size, idx) + } + fn multicurve_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().multicurve_end(idx) + } + fn multisurface_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().multisurface_begin(size, idx) + } + fn multisurface_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().multisurface_end(idx) + } + fn triangle_begin(&mut self, tagged: bool, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().triangle_begin(tagged, size, idx) + } + fn triangle_end(&mut self, tagged: bool, idx: usize) -> Result<()> { + self.wkt_writer().triangle_end(tagged, idx) + } + fn polyhedralsurface_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().polyhedralsurface_begin(size, idx) + } + fn polyhedralsurface_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().polyhedralsurface_end(idx) + } + fn tin_begin(&mut self, size: usize, idx: usize) -> Result<()> { + self.wkt_writer().tin_begin(size, idx) + } + fn tin_end(&mut self, idx: usize) -> Result<()> { + self.wkt_writer().tin_end(idx) + } + } +} + +#[cfg(test)] +mod tests { + use crate::ProcessToCsv; + + #[test] + fn geojson_to_csv() { + let input_geojson = serde_json::json!({ + "type": "FeatureCollection", + "name": "", + "features": [ + { + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": [-122.329051, 47.6069] + }, + "properties": { + "address": "904 7th Av", + "type": "Car Fire", + "datetime": "05/22/2019 12:55:00 PM", + "incident number": "F190051945" + } + }, + { + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": [-122.266529, 47.515984] + }, + "properties": { + "address": "9610 53rd Av S", + "type": "Aid Response", + "datetime": "05/22/2019 12:55:00 PM", + "incident number": "F190051946" + } + } + ] + }); + + let expected_output = r#"geometry,address,datetime,incident number,type +POINT(-122.329051 47.6069),904 7th Av,05/22/2019 12:55:00 PM,F190051945,Car Fire +POINT(-122.266529 47.515984),9610 53rd Av S,05/22/2019 12:55:00 PM,F190051946,Aid Response +"#; + + let actual_output = crate::geojson::GeoJson(&input_geojson.to_string()) + .to_csv() + .unwrap(); + + assert_eq!(expected_output, actual_output); + } +} diff --git a/geozero/src/csv/mod.rs b/geozero/src/csv/mod.rs index bdf993c8..8f74269e 100644 --- a/geozero/src/csv/mod.rs +++ b/geozero/src/csv/mod.rs @@ -1,4 +1,31 @@ //! CSV conversions. pub(crate) mod csv_reader; +pub(crate) mod csv_writer; pub use csv_reader::*; +pub use csv_writer::*; + +pub(crate) mod conversion { + use super::csv_writer::*; + use crate::error::Result; + use crate::GeozeroDatasource; + + /// Consume features into CSV + pub trait ProcessToCsv { + /// Consume features into CSV String. + fn to_csv(&mut self) -> Result; + } + + impl ProcessToCsv for T { + fn to_csv(&mut self) -> Result { + let mut out: Vec = Vec::new(); + { + let mut p = CsvWriter::new(&mut out); + self.process(&mut p)?; + } + String::from_utf8(out).map_err(|_| { + crate::error::GeozeroError::Geometry("Invalid UTF-8 encoding".to_string()) + }) + } + } +} diff --git a/geozero/src/lib.rs b/geozero/src/lib.rs index 0e0a76af..4a8c35f9 100644 --- a/geozero/src/lib.rs +++ b/geozero/src/lib.rs @@ -43,6 +43,8 @@ pub use property_processor::*; #[cfg(feature = "with-csv")] pub mod csv; +#[cfg(feature = "with-csv")] +pub use crate::csv::conversion::*; #[cfg(feature = "with-gdal")] pub mod gdal; From 094995ccc220fc1529d9a826434c96e07369419b Mon Sep 17 00:00:00 2001 From: Michael Kirk Date: Thu, 12 May 2022 16:25:28 -0700 Subject: [PATCH 2/3] CONTROVERSIAL: Geometry should have index 0 if its the first geometry in a feature. Revert this and run a test to see why I want this. I'm wondering if there are unintended consequences elsewhere though. --- geozero/src/geojson/geojson_reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geozero/src/geojson/geojson_reader.rs b/geozero/src/geojson/geojson_reader.rs index 9d107a43..5105ac5b 100644 --- a/geozero/src/geojson/geojson_reader.rs +++ b/geozero/src/geojson/geojson_reader.rs @@ -96,7 +96,7 @@ fn process_geojson(gj: &GeoGeoJson, processor: &mut P) -> R } if let Some(ref geometry) = feature.geometry { processor.geometry_begin()?; - process_geojson_geom_n(geometry, idx, processor)?; + process_geojson_geom_n(geometry, 0, processor)?; processor.geometry_end()?; } processor.feature_end(idx as u64)?; From 31656d7ee0756c519e8a43cfe5569b74395c22c7 Mon Sep 17 00:00:00 2001 From: Michael Kirk Date: Thu, 12 May 2022 17:19:52 -0700 Subject: [PATCH 3/3] alphabetize by extension --- geozero-cli/src/main.rs | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/geozero-cli/src/main.rs b/geozero-cli/src/main.rs index 848f401b..7b8384c4 100644 --- a/geozero-cli/src/main.rs +++ b/geozero-cli/src/main.rs @@ -57,6 +57,17 @@ fn transform(args: Cli, processor: &mut P) -> Result<()> { let pathin = Path::new(&args.input); let mut filein = BufReader::new(File::open(pathin)?); match pathin.extension().and_then(OsStr::to_str) { + Some("csv") => { + let geometry_column_name = args + .csv_geometry_column + .expect("must specify --csv-geometry-column= when parsing CSV"); + let mut ds = CsvReader::new(&geometry_column_name, &mut filein); + GeozeroDatasource::process(&mut ds, processor)?; + } + Some("json") | Some("geojson") => { + let mut ds = GeoJsonReader(&mut filein); + GeozeroDatasource::process(&mut ds, processor)?; + } Some("fgb") => { let ds = FgbReader::open(&mut filein)?; let mut ds = if let Some(bbox) = &args.extent { @@ -66,21 +77,10 @@ fn transform(args: Cli, processor: &mut P) -> Result<()> { }; ds.process_features(processor)?; } - Some("json") | Some("geojson") => { - let mut ds = GeoJsonReader(&mut filein); - GeozeroDatasource::process(&mut ds, processor)?; - } Some("wkt") => { let mut ds = WktReader(&mut filein); GeozeroDatasource::process(&mut ds, processor)?; } - Some("csv") => { - let geometry_column_name = args - .csv_geometry_column - .expect("must specify --csv-geometry-column= when parsing CSV"); - let mut ds = CsvReader::new(&geometry_column_name, &mut filein); - GeozeroDatasource::process(&mut ds, processor)?; - } _ => panic!("Unkown input file extension"), }; Ok(()) @@ -89,6 +89,15 @@ fn transform(args: Cli, processor: &mut P) -> Result<()> { fn process(args: Cli) -> Result<()> { let mut fout = BufWriter::new(File::create(&args.dest)?); match args.dest.extension().and_then(OsStr::to_str) { + Some("csv") => { + let mut processor = CsvWriter::new(&mut fout); + transform(args, &mut processor)?; + } + Some("fgb") => { + let mut fgb = FgbWriter::create("fgb", GeometryType::Unknown)?; + transform(args, &mut fgb)?; + fgb.write(&mut fout)?; + } Some("json") | Some("geojson") => { let mut processor = GeoJsonWriter::new(&mut fout); transform(args, &mut processor)?; @@ -110,19 +119,10 @@ fn process(args: Cli) -> Result<()> { } transform(args, &mut processor)?; } - Some("fgb") => { - let mut fgb = FgbWriter::create("fgb", GeometryType::Unknown)?; - transform(args, &mut fgb)?; - fgb.write(&mut fout)?; - } Some("wkt") => { let mut processor = WktWriter::new(&mut fout); transform(args, &mut processor)?; } - Some("csv") => { - let mut processor = CsvWriter::new(&mut fout); - transform(args, &mut processor)?; - } _ => panic!("Unkown output file extension"), } Ok(())