Improved RAM usage between 30 - 50%, improved pandas.DataFrame to xlsx speed between 30 - 50%.
carlvoller · Sep 4, 2024 · 349f662 · 349f662
1 parent 8555377
commit 349f662
Show file tree

Hide file tree

Showing 11 changed files with 232 additions and 92 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,5 @@
 *.xlsx
 .DS_STORE
 __pycache__
-profile.json
+profile.json
+*.so
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "excel-rs"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 
 [profile.release]
@@ -15,6 +15,7 @@ csv = "1"
 pyo3 = { version = "0.21", features = ["extension-module"] }
 anyhow = "1.0.86"
 zip = { version = "2.2.0", default-features = false, features = ["deflate-zlib-ng"] }
+numpy = "0.21"
 
 [lib]
 name = "excel_rs"

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "py-excel-rs"
-version = "0.2.1"
+version = "0.3.0"
 description = "Some performant utility functions to convert common data structures to XLSX"
 dependencies = ["pandas", "numpy"]
 requires-python = ">=3.7"

diff --git a/python/py_excel_rs/df_to_xlsx.py b/python/py_excel_rs/df_to_xlsx.py
@@ -1,15 +1,11 @@
-from io import BytesIO
-
 import pandas as pd
+import numpy as np
 
 from py_excel_rs import _excel_rs
 
 def csv_to_xlsx(buf: bytes) -> bytes:
     return _excel_rs.export_to_xlsx(buf)
 
 def df_to_xlsx(df: pd.DataFrame) -> bytes:
-    buf = BytesIO()
-    df.to_csv(buf, index=False)
-
-    buf.seek(0)
-    return _excel_rs.export_to_xlsx(buf.read())
+    py_list = np.vstack((df.keys().to_numpy(), df.to_numpy(dtype='object')))
+    return _excel_rs.py_2d_to_xlsx(py_list)
diff --git a/src/export_to_xlsx.rs b/src/export_to_xlsx.rs
@@ -3,6 +3,7 @@ use std::io::Cursor;
 use super::xlsx::WorkBook as NewWorkBook;
 
 use anyhow::Result;
+use numpy::ndarray::Array2;
 
 pub fn export_to_custom_xlsx(x: &[u8]) -> Result<Vec<u8>> {
     let output_buffer = vec![];
@@ -21,9 +22,27 @@ pub fn export_to_custom_xlsx(x: &[u8]) -> Result<Vec<u8>> {
         row += 1;
     }
 
-    workbook.write_worksheet(worksheet)?;
+    worksheet.close()?;
 
     let final_buffer = workbook.finish()?;
 
     Ok(final_buffer.into_inner())
 }
+
+/// Serialises a 2-D array of cell strings into an in-memory XLSX workbook.
+///
+/// Every row of `x` is written, in order, to a single worksheet named
+/// "Sheet 1" using 1-based row numbers. The bytes of the finished workbook
+/// are returned.
+///
+/// # Errors
+///
+/// Propagates any error raised while writing a row, closing the worksheet
+/// or finishing the workbook.
+pub fn export_ndarray_to_custom_xlsx(x: Array2<String>) -> Result<Vec<u8>> {
+    let mut workbook = NewWorkBook::new(Cursor::new(Vec::new()));
+    let mut worksheet = workbook.get_worksheet(String::from("Sheet 1"));
+
+    // Row numbers start at 1; the open-ended range pairs one with each row.
+    for (row_num, cells) in (1..).zip(x.rows()) {
+        let cell_bytes = cells.iter().map(|cell| cell.as_bytes()).collect();
+        worksheet.write_row(row_num, cell_bytes)?;
+    }
+
+    worksheet.close()?;
+
+    Ok(workbook.finish()?.into_inner())
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,7 +1,8 @@
 mod export_to_xlsx;
 mod xlsx;
 
-use export_to_xlsx::export_to_custom_xlsx;
+use export_to_xlsx::{export_to_custom_xlsx, export_ndarray_to_custom_xlsx};
+use numpy::PyReadonlyArray2;
 use pyo3::{prelude::*, types::PyBytes};
 
 #[pymodule]
@@ -17,5 +18,37 @@ fn _excel_rs<'py>(m: &Bound<'py, PyModule>) -> PyResult<()> {
         PyBytes::new_bound(py, &xlsx_bytes)
     }
 
+    /// Converts a 2-D NumPy array of Python objects into XLSX bytes.
+    ///
+    /// Each cell is turned into text as follows:
+    /// * a value that extracts as `str` is used verbatim;
+    /// * otherwise a value that extracts as a float is formatted with
+    ///   `to_string`, except NaN, which becomes an empty cell;
+    /// * anything else becomes an empty cell.
+    ///
+    /// Raises `RuntimeError` if the workbook cannot be written, instead of
+    /// panicking across the Python boundary.
+    #[pyfn(m)]
+    #[pyo3(name = "py_2d_to_xlsx")]
+    fn py_2d_to_xlsx<'py>(
+        py: Python<'py>,
+        list: PyReadonlyArray2<'py, PyObject>,
+    ) -> PyResult<Bound<'py, PyBytes>> {
+        // `map` borrows each element; `mapv` would clone every `PyObject`
+        // just to read it once.
+        let cells = list.as_array().map(|cell| {
+            if let Ok(text) = cell.extract::<String>(py) {
+                text
+            } else {
+                match cell.extract::<f64>(py) {
+                    Ok(number) if !number.is_nan() => number.to_string(),
+                    // NaN and non-string, non-numeric values become empty cells.
+                    _ => String::new(),
+                }
+            }
+        });
+
+        let xlsx_bytes = export_ndarray_to_custom_xlsx(cells)
+            .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;
+
+        Ok(PyBytes::new_bound(py, &xlsx_bytes))
+    }
+
     Ok(())
 }
diff --git a/src/main.rs b/src/main.rs
@@ -22,7 +22,7 @@ fn convert_csv_to_xlsx(filename: &str) -> Result<()> {
 }
 fn main() {
     let now = Instant::now();
-    match convert_csv_to_xlsx("original.csv") {
+    match convert_csv_to_xlsx("cavs.csv") {
         Ok(_) => (),
         Err(e) => panic!("{e}"),
     }

diff --git a/src/xlsx/format.rs b/src/xlsx/format.rs
@@ -2,8 +2,6 @@ use std::io::{Result, Seek, Write};
 
 use zip::{write::SimpleFileOptions, ZipWriter};
 
-use super::sheet::Sheet;
-
 pub struct XlsxFormatter<W: Write + Seek> {
     pub zip_writer: ZipWriter<W>,
 }
@@ -13,20 +11,20 @@ impl<W: Write + Seek> XlsxFormatter<W> {
         XlsxFormatter { zip_writer }
     }
 
-    pub fn write_sheet(&mut self, sheet: Sheet) -> Result<()> {
-        let sheet_id = sheet.id;
-        let sheet_buf = sheet.close().ok().unwrap();
+    // pub fn write_sheet(&mut self, sheet: Sheet) -> Result<()> {
+    //     let sheet_id = sheet.id;
+    //     let sheet_buf = sheet.close().ok().unwrap();
 
-        let options = SimpleFileOptions::default()
-            .compression_method(zip::CompressionMethod::Deflated)
-            .compression_level(Some(1));
-        self.zip_writer
-            .start_file(format!("xl/worksheets/sheet{}.xml", sheet_id), options)?;
+    //     let options = SimpleFileOptions::default()
+    //         .compression_method(zip::CompressionMethod::Deflated)
+    //         .compression_level(Some(1));
+    //     self.zip_writer
+    //         .start_file(format!("xl/worksheets/sheet{}.xml", sheet_id), options)?;
 
-        self.zip_writer.write_all(&sheet_buf)?;
+    //     self.zip_writer.write_all(&sheet_buf)?;
 
-        Ok(())
-    }
+    //     Ok(())
+    // }
 
     pub fn finish(mut self, num_of_sheets: u16) -> Result<W> {
         let options = SimpleFileOptions::default();