Skip to content

Commit

Permalink
Added support for typed dataframe conversions
Browse files Browse the repository at this point in the history
  • Loading branch information
carlvoller committed Sep 29, 2024
1 parent d19a625 commit 6dc2389
Show file tree
Hide file tree
Showing 12 changed files with 348 additions and 16 deletions.
10 changes: 6 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@ members = ["cli-excel-rs", "crates/*", "py-excel-rs"]
resolver = "2"

[workspace.package]
version = "0.5.1"
version = "0.5.2"
authors = ["Carl Voller"]
edition = "2021"
homepage = "https://github.com/carlvoller/excel-rs"
license = "MIT"
repository = "https://github.com/carlvoller/excel-rs"

[workspace.dependencies]
excel-rs-xlsx = { version = "0.5.1", path = "crates/excel-rs-xlsx", default-features = false }
excel-rs-csv = { version = "0.5.1", path = "crates/excel-rs-csv", default-features = false }
excel-rs-postgres = { version = "0.5.1", path = "crates/excel-rs-postgres", default-features = false }
excel-rs-xlsx = { version = "0.5.2", path = "crates/excel-rs-xlsx", default-features = false }
excel-rs-csv = { version = "0.5.2", path = "crates/excel-rs-csv", default-features = false }
excel-rs-postgres = { version = "0.5.2", path = "crates/excel-rs-postgres", default-features = false }

[profile.release]
opt-level = 3
Expand Down
12 changes: 10 additions & 2 deletions benchmarks/test-py-excel-rs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import py_excel_rs
import datetime
import pandas as pd

f = open('organizations-1000000.csv', 'rb')
xlsx = py_excel_rs.csv_to_xlsx(f.read())
# f = open('organizations-1000000.csv', 'rb')
# xlsx = py_excel_rs.csv_to_xlsx(f.read())


data = [[datetime.datetime.now(), "hello", 10, 10.888]]
df = pd.DataFrame(data, columns=["Date", "hi", "number1", "float2"])

xlsx = py_excel_rs.df_to_xlsx(df, should_infer_types=True)

with open('report.xlsx', 'wb') as f:
f.write(xlsx)
1 change: 1 addition & 0 deletions crates/excel-rs-xlsx/src/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ impl<W: Write + Seek> XlsxFormatter<W> {
</cellStyleXfs>
<cellXfs count="1">
<xf numFmtId="0" fontId="0" fillId="0" borderId="0" xfId="0"/>
<xf numFmtId="14" borderId="0" fillId="0" fontId="0" xfId="0"/>
</cellXfs>
<cellStyles count="1">
<cellStyle name="Normal" xfId="0" builtinId="0"/>
Expand Down
1 change: 1 addition & 0 deletions crates/excel-rs-xlsx/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
mod format;
pub mod workbook;
pub mod sheet;
pub mod typed_sheet;

pub use workbook::WorkBook;

Expand Down
204 changes: 204 additions & 0 deletions crates/excel-rs-xlsx/src/typed_sheet.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
use std::{
collections::VecDeque,
io::{Seek, Write},
};

use anyhow::Result;
use zip::{write::SimpleFileOptions, ZipWriter};

/// Streaming writer for one worksheet whose cells carry an explicit type.
///
/// Rows are serialised to SpreadsheetML and written into the ZIP entry that
/// `new` starts on `sheet_buf`.
pub struct TypedSheet<'a, W: Write + Seek> {
/// ZIP archive writer that receives the worksheet XML.
pub sheet_buf: &'a mut ZipWriter<W>,
/// Sheet name supplied by the caller; stored but not read by the code in this module.
pub _name: String,
// pub id: u16,
// pub is_closed: bool,
/// Cache of column letters (`A`, `B`, ..., `AA`, ...) indexed by zero-based column number.
col_num_to_letter: Vec<Vec<u8>>,
/// One-based number of the row most recently written; 0 before the first row.
current_row_num: u32,
}

impl<'a, W: Write + Seek> TypedSheet<'a, W> {
/// Starts the ZIP entry `xl/worksheets/sheet{id}.xml` on `writer` and emits
/// the worksheet prologue up to and including the opening `<sheetData>` tag.
///
/// * `name` - sheet name, stored on the returned value.
/// * `id` - sheet number used in the entry's file name.
/// * `writer` - archive the worksheet XML is appended to.
///
/// The signature has no way to report failure, so errors from starting the
/// entry or writing the prologue are discarded here.
pub fn new(name: String, id: u16, writer: &'a mut ZipWriter<W>) -> Self {
    let options = SimpleFileOptions::default()
        .compression_method(zip::CompressionMethod::Deflated)
        .compression_level(Some(1))
        .large_file(true);

    writer
        .start_file(format!("xl/worksheets/sheet{}.xml", id), options)
        .ok();

    // Writes Sheet Header.
    // `write_all` rather than `write`: a single `write` call is allowed to
    // accept only a prefix of the buffer, which would truncate the prologue.
    writer.write_all(b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\">\n<sheetData>\n").ok();

    TypedSheet {
        sheet_buf: writer,
        // id,
        _name: name,
        // is_closed: false,
        col_num_to_letter: Vec::with_capacity(64),
        current_row_num: 0,
    }
}

// TODO: Use ShortVec over Vec for cell ID
/// Serialises one row of cells and appends it to the worksheet.
///
/// The first row written is treated as the header: every cell in it is
/// emitted with `t="str"` and `types` is ignored. For every later row the
/// cell in column `i` uses `types[i]` as the content of its `t` attribute;
/// columns without an entry in `types` fall back to `str`.
///
/// * `data` - raw cell contents, one slice per column. The five XML special
///   characters are replaced by entities; all other bytes are copied verbatim.
/// * `types` - per-column cell type strings for non-header rows.
///
/// # Errors
/// Returns an error if a cell reference cannot be built or if writing to the
/// underlying ZIP entry fails.
pub fn write_row(&mut self, data: Vec<&[u8]>, types: &Vec<&str>) -> Result<()> {
    self.current_row_num += 1;
    let is_header = self.current_row_num == 1;

    let mut final_vec: Vec<u8> = Vec::with_capacity(512 * data.len());

    // TODO: Proper Error Handling
    let (row_in_chars_arr, digits) = self.num_to_bytes(self.current_row_num);

    final_vec.extend_from_slice(b"<row r=\"");
    final_vec.extend_from_slice(&row_in_chars_arr[9 - digits..]);
    final_vec.extend_from_slice(b"\">");

    for (col, datum) in data.into_iter().enumerate() {
        let (ref_id, pos) = self.ref_id(col, (row_in_chars_arr, digits))?;

        // The cell text is written inline, so the fallback must be `str`;
        // `s` would make the value an index into the shared-strings table.
        let col_type = if is_header {
            "str"
        } else {
            types.get(col).copied().unwrap_or("str")
        };

        final_vec.extend_from_slice(b"<c r=\"");
        final_vec.extend_from_slice(&ref_id[..pos]);
        final_vec.extend_from_slice(b"\" t=\"");
        final_vec.extend_from_slice(col_type.as_bytes());
        final_vec.extend_from_slice(b"\"><v>");

        // Copy the plain stretches between special characters and splice
        // the matching entity in place of each special character.
        let (entities, positions) = self.escape_in_place(datum);
        let mut copied_up_to = 0;
        for (entity, position) in entities.into_iter().zip(positions) {
            final_vec.extend_from_slice(&datum[copied_up_to..position]);
            final_vec.extend_from_slice(entity);
            copied_up_to = position + 1;
        }
        final_vec.extend_from_slice(&datum[copied_up_to..]);
        final_vec.extend_from_slice(b"</v></c>");
    }

    final_vec.extend_from_slice(b"</row>");

    // `write_all`, not `write`: a compressing writer may accept only part of
    // a large buffer per call, and the remainder of the row would be lost.
    self.sheet_buf.write_all(&final_vec)?;

    Ok(())
}

/// Scans `bytes` for the XML special characters `<`, `>`, `'`, `&` and `"`.
///
/// Despite the name, the input is not modified. The result is two parallel
/// queues in order of occurrence: the entity that replaces each special
/// character and the index in `bytes` at which that character sits.
fn escape_in_place(&self, bytes: &[u8]) -> (VecDeque<&[u8]>, VecDeque<usize>) {
    let mut entities: VecDeque<&[u8]> = VecDeque::new();
    let mut positions: VecDeque<usize> = VecDeque::new();

    for (index, &byte) in bytes.iter().enumerate() {
        let entity: &'static [u8] = match byte {
            b'<' => b"&lt;",
            b'>' => b"&gt;",
            b'\'' => b"&apos;",
            b'&' => b"&amp;",
            b'"' => b"&quot;",
            // Ordinary byte: nothing to record.
            _ => continue,
        };
        entities.push_back(entity);
        positions.push_back(index);
    }

    (entities, positions)
}

/// Writes the closing `</sheetData>` and `</worksheet>` tags.
///
/// # Errors
/// Returns an error if the underlying writer fails.
pub fn close(&mut self) -> Result<()> {
    // `write_all` guarantees the whole trailer is written; a plain `write`
    // may stop after a prefix and still report success.
    self.sheet_buf.write_all(b"\n</sheetData>\n</worksheet>\n")?;
    Ok(())
}

/// Renders `n` as ASCII decimal digits, right-aligned in a 9-byte buffer.
///
/// Returns the buffer and the number of digits written. The digits occupy
/// the last `digits` bytes; the unused leading bytes stay zero. `n == 0`
/// produces zero digits.
///
/// # Panics
/// Panics if `n` needs more than nine decimal digits.
fn num_to_bytes(&self, n: u32) -> ([u8; 9], usize) {
    // Convert from number to string manually
    let mut row_in_chars_arr: [u8; 9] = [0; 9];
    let mut remaining = n;
    // Index one past the slot that receives the next digit. Decrementing
    // *before* each store means the cursor never drops below zero, so a
    // nine-digit value no longer underflows after its last digit.
    let mut next = row_in_chars_arr.len();

    while remaining > 0 {
        assert!(
            next > 0,
            "row number {} does not fit in nine decimal digits",
            n
        );
        next -= 1;
        row_in_chars_arr[next] = b'0' + (remaining % 10) as u8;
        remaining /= 10;
    }

    let digits = row_in_chars_arr.len() - next;
    (row_in_chars_arr, digits)
}

fn ref_id(&mut self, col: usize, row: ([u8; 9], usize)) -> Result<([u8; 12], usize)> {
let mut final_arr: [u8; 12] = [0; 12];
let letter = self.col_to_letter(col);

let mut pos: usize = 0;
for c in letter {
final_arr[pos] = *c;
pos += 1;
}

let (row_in_chars_arr, digits) = row;

for i in 0..digits {
final_arr[pos] = row_in_chars_arr[(8 - digits) + i + 1];
pos += 1;
}

Ok((final_arr, pos))
}

/// Returns the spreadsheet column letters for a zero-based column index
/// (`0 -> A`, `25 -> Z`, `26 -> AA`, ...), caching every result.
///
/// The cache is filled for every index up to and including `col`, so the
/// lookup is valid even when columns are not requested in ascending order.
/// The first request for a large `col` therefore costs one cache entry per
/// preceding column.
fn col_to_letter(&mut self, col: usize) -> &[u8] {
    while self.col_num_to_letter.len() <= col {
        // The next missing index is always the current cache length, which
        // keeps slot `i` holding the letters for column `i`.
        let mut remaining = self.col_num_to_letter.len();
        let mut letters = Vec::with_capacity(3);

        // Bijective base-26: emit the least significant letter first.
        loop {
            letters.push(b'A' + (remaining % 26) as u8);
            if remaining < 26 {
                break;
            }
            remaining = remaining / 26 - 1;
        }

        letters.reverse();
        self.col_num_to_letter.push(letters);
    }

    &self.col_num_to_letter[col]
}
}
6 changes: 6 additions & 0 deletions crates/excel-rs-xlsx/src/workbook.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use anyhow::Result;
use zip::ZipWriter;

use super::sheet::Sheet;
use super::typed_sheet::TypedSheet;

pub struct WorkBook<W: Write + Seek> {
formatter: XlsxFormatter<W>,
Expand All @@ -24,6 +25,11 @@ impl<W: Write + Seek> WorkBook<W> {
self.num_of_sheets += 1;
Sheet::new(name, self.num_of_sheets, &mut self.formatter.zip_writer)
}

/// Reserves the next sheet number and returns a `TypedSheet` that writes
/// into this workbook's ZIP writer under that number.
pub fn get_typed_worksheet(&mut self, name: String) -> TypedSheet<W> {
    self.num_of_sheets += 1;
    let sheet_id = self.num_of_sheets;
    let zip_writer = &mut self.formatter.zip_writer;
    TypedSheet::new(name, sheet_id, zip_writer)
}

pub fn finish(self) -> Result<W> {
let result = self.formatter.finish(self.num_of_sheets)?;
Expand Down
5 changes: 3 additions & 2 deletions py-excel-rs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ description = "python ffi for excel-rs"
excel-rs-xlsx = { workspace = true }
excel-rs-csv = { workspace = true }
excel-rs-postgres = { workspace = true }
pyo3 = { version = "0.21", features = ["extension-module"] }
pyo3 = { version = "0.21", features = ["chrono", "extension-module"] }
numpy = "0.21"
chrono = "0.4.38"

[lib]
name = "excel_rs"
crate-type = ["cdylib"]
crate-type = ["cdylib"]
25 changes: 24 additions & 1 deletion py-excel-rs/py_excel_rs/df_to_xlsx.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,36 @@
import pandas as pd
import numpy as np
from enum import Enum

from py_excel_rs import _excel_rs

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_numeric_dtype as is_numeric

class CellTypes(Enum):
    """Per-column cell type strings handed to the Rust writer.

    ``df_to_xlsx`` passes the ``.value`` of one member per dataframe column.
    NOTE(review): the Rust side presumably inserts the value verbatim between
    the quotes of the cell's ``t="..."`` attribute - confirm against the
    extension module.
    """

    # The embedded quotes close the type attribute and append ` s="1"`,
    # i.e. a numeric cell that also carries style index 1.
    # NOTE(review): style 1 looks like the date-format `xf` entry - confirm.
    Date = "n\" s=\"1"
    String = "str"
    Number = "n"
    # Same value as `String`, so Enum treats this member as an alias of it.
    Formula = "str"
    Boolean = "b"

def csv_to_xlsx(buf: bytes) -> bytes:
    """Convert CSV content to an XLSX workbook via the Rust extension.

    Args:
        buf: CSV data, passed unchanged to ``_excel_rs.csv_to_xlsx``.

    Returns:
        The value produced by the extension (annotated as the workbook bytes).
    """
    workbook = _excel_rs.csv_to_xlsx(buf)
    return workbook

def df_to_xlsx(df: pd.DataFrame) -> bytes:
def df_to_xlsx(df: pd.DataFrame, should_infer_types: bool = False) -> bytes:
    """Convert a dataframe to an XLSX workbook via the Rust extension.

    Args:
        df: Dataframe to export. Its column labels become the first row.
        should_infer_types: When true, derive one cell type per column from
            ``df.dtypes`` and use the typed writer; otherwise use the untyped
            writer.

    Returns:
        The value produced by the extension (annotated as the workbook bytes).
    """
    # Column labels stacked on top of the cell values as one 2-D object array.
    header = df.keys().to_numpy()
    body = df.to_numpy(dtype='object')
    py_list = np.vstack((header, body))

    if not should_infer_types:
        return _excel_rs.py_2d_to_xlsx(py_list)

    def cell_type_for(dtype) -> CellTypes:
        # Datetime is tested first; everything that is neither datetime nor
        # numeric is written as a string.
        # NOTE(review): is_numeric_dtype also accepts bool columns, so those
        # are tagged as Number - confirm the Rust side serialises them as such.
        if is_datetime(dtype):
            return CellTypes.Date
        if is_numeric(dtype):
            return CellTypes.Number
        return CellTypes.String

    type_values = [cell_type_for(dtype).value for dtype in df.dtypes]
    return _excel_rs.typed_py_2d_to_xlsx(py_list, type_values)

def pg_to_xlsx(query: str, conn_string: str) -> bytes:
Expand Down
2 changes: 1 addition & 1 deletion py-excel-rs/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "py-excel-rs"
version = "0.5.1"
version = "0.5.2"
description = "Some performant utility functions to convert common data structures to XLSX"
dependencies = ["pandas", "numpy"]
requires-python = ">=3.7"
Expand Down
Loading

0 comments on commit 6dc2389

Please sign in to comment.