Skip to content

Commit

Permalink
Improved RAM usage between 30 - 50%, improved pandas.DataFrame to xlsx speed between 30 - 50%.
Browse files Browse the repository at this point in the history
  • Loading branch information
carlvoller committed Sep 4, 2024
1 parent 8555377 commit 349f662
Show file tree
Hide file tree
Showing 11 changed files with 232 additions and 92 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
*.xlsx
.DS_STORE
__pycache__
profile.json
profile.json
*.so
80 changes: 79 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "excel-rs"
version = "0.2.1"
version = "0.3.0"
edition = "2021"

[profile.release]
Expand All @@ -15,6 +15,7 @@ csv = "1"
pyo3 = { version = "0.21", features = ["extension-module"] }
anyhow = "1.0.86"
zip = { version = "2.2.0", default-features = false, features = ["deflate-zlib-ng"] }
numpy = "0.21"

[lib]
name = "excel_rs"
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "py-excel-rs"
version = "0.2.1"
version = "0.3.0"
description = "Some performant utility functions to convert common data structures to XLSX"
dependencies = ["pandas", "numpy"]
requires-python = ">=3.7"
Expand Down
10 changes: 3 additions & 7 deletions python/py_excel_rs/df_to_xlsx.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
from io import BytesIO

import pandas as pd
import numpy as np

from py_excel_rs import _excel_rs

def csv_to_xlsx(buf: bytes) -> bytes:
return _excel_rs.export_to_xlsx(buf)

def df_to_xlsx(df: pd.DataFrame) -> bytes:
buf = BytesIO()
df.to_csv(buf, index=False)

buf.seek(0)
return _excel_rs.export_to_xlsx(buf.read())
py_list = np.vstack((df.keys().to_numpy(), df.to_numpy(dtype='object')))
return _excel_rs.py_2d_to_xlsx(py_list)
21 changes: 20 additions & 1 deletion src/export_to_xlsx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::io::Cursor;
use super::xlsx::WorkBook as NewWorkBook;

use anyhow::Result;
use numpy::ndarray::Array2;

pub fn export_to_custom_xlsx(x: &[u8]) -> Result<Vec<u8>> {
let output_buffer = vec![];
Expand All @@ -21,9 +22,27 @@ pub fn export_to_custom_xlsx(x: &[u8]) -> Result<Vec<u8>> {
row += 1;
}

workbook.write_worksheet(worksheet)?;
worksheet.close()?;

let final_buffer = workbook.finish()?;

Ok(final_buffer.into_inner())
}

/// Writes a 2-D array of already-stringified cells into an in-memory XLSX workbook.
///
/// The rows of `x` are written in order, numbered from 1, to the worksheet
/// obtained from `get_worksheet("Sheet 1")`. Each cell is handed to the
/// worksheet as its raw UTF-8 bytes.
///
/// # Errors
///
/// Propagates any failure from writing a row, closing the worksheet, or
/// finishing the workbook.
pub fn export_ndarray_to_custom_xlsx(x: Array2<String>) -> Result<Vec<u8>> {
    let mut book = NewWorkBook::new(Cursor::new(Vec::new()));
    let mut sheet = book.get_worksheet(String::from("Sheet 1"));

    // Pair each row with its 1-based row number instead of keeping a manual counter.
    for (row_idx, cells) in (1..).zip(x.rows()) {
        sheet.write_row(row_idx, cells.iter().map(|cell| cell.as_bytes()).collect())?;
    }

    sheet.close()?;

    // `finish` hands back the underlying cursor; unwrap it to get the raw bytes.
    Ok(book.finish()?.into_inner())
}
35 changes: 34 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
mod export_to_xlsx;
mod xlsx;

use export_to_xlsx::export_to_custom_xlsx;
use export_to_xlsx::{export_to_custom_xlsx, export_ndarray_to_custom_xlsx};
use numpy::PyReadonlyArray2;
use pyo3::{prelude::*, types::PyBytes};

#[pymodule]
Expand All @@ -17,5 +18,37 @@ fn _excel_rs<'py>(m: &Bound<'py, PyModule>) -> PyResult<()> {
PyBytes::new_bound(py, &xlsx_bytes)
}

#[pyfn(m)]
#[pyo3(name = "py_2d_to_xlsx")]
fn py_2d_to_xlsx<'py>(
py: Python<'py>,
list: PyReadonlyArray2<'py, PyObject>,
) -> Bound<'py, PyBytes> {
let ndarray = list.as_array();

let ndarray_str = ndarray.mapv(|x| {
if let Ok(inner_str) = x.extract::<String>(py) {
inner_str
} else {
if let Ok(inner_num) = x.extract::<f64>(py) {
if inner_num.is_nan() {
String::from("")
} else {
inner_num.to_string()
}
} else {
String::from("")
}
}
});

let xlsx_bytes = match export_ndarray_to_custom_xlsx(ndarray_str) {
Ok(b) => b,
Err(e) => panic!("{e}"),
};

PyBytes::new_bound(py, &xlsx_bytes)
}

Ok(())
}
2 changes: 1 addition & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ fn convert_csv_to_xlsx(filename: &str) -> Result<()> {
}
fn main() {
let now = Instant::now();
match convert_csv_to_xlsx("original.csv") {
match convert_csv_to_xlsx("cavs.csv") {
Ok(_) => (),
Err(e) => panic!("{e}"),
}
Expand Down
24 changes: 11 additions & 13 deletions src/xlsx/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ use std::io::{Result, Seek, Write};

use zip::{write::SimpleFileOptions, ZipWriter};

use super::sheet::Sheet;

pub struct XlsxFormatter<W: Write + Seek> {
pub zip_writer: ZipWriter<W>,
}
Expand All @@ -13,20 +11,20 @@ impl<W: Write + Seek> XlsxFormatter<W> {
XlsxFormatter { zip_writer }
}

pub fn write_sheet(&mut self, sheet: Sheet) -> Result<()> {
let sheet_id = sheet.id;
let sheet_buf = sheet.close().ok().unwrap();
// pub fn write_sheet(&mut self, sheet: Sheet) -> Result<()> {
// let sheet_id = sheet.id;
// let sheet_buf = sheet.close().ok().unwrap();

let options = SimpleFileOptions::default()
.compression_method(zip::CompressionMethod::Deflated)
.compression_level(Some(1));
self.zip_writer
.start_file(format!("xl/worksheets/sheet{}.xml", sheet_id), options)?;
// let options = SimpleFileOptions::default()
// .compression_method(zip::CompressionMethod::Deflated)
// .compression_level(Some(1));
// self.zip_writer
// .start_file(format!("xl/worksheets/sheet{}.xml", sheet_id), options)?;

self.zip_writer.write_all(&sheet_buf)?;
// self.zip_writer.write_all(&sheet_buf)?;

Ok(())
}
// Ok(())
// }

pub fn finish(mut self, num_of_sheets: u16) -> Result<W> {
let options = SimpleFileOptions::default();
Expand Down
Loading

0 comments on commit 349f662

Please sign in to comment.