Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Auto filter freeze [DNM] #3

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ repository = "https://github.com/carlvoller/excel-rs"
excel-rs-xlsx = { version = "0.5.3", path = "crates/excel-rs-xlsx", default-features = false }
excel-rs-csv = { version = "0.5.3", path = "crates/excel-rs-csv", default-features = false }
excel-rs-postgres = { version = "0.5.3", path = "crates/excel-rs-postgres", default-features = false }
chrono = "0.4"

[profile.release]
opt-level = 3
Expand Down
44 changes: 35 additions & 9 deletions cli-excel-rs/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::{fs::File, io::{Cursor, Read, Write}};

use clap::{arg, Command};
use excel_rs_csv::{bytes_to_csv, get_headers, get_next_record};
use excel_rs_xlsx::WorkBook;
use excel_rs_xlsx::{WorkBook, typed_sheet::{TYPE_STRING}};

fn cli() -> Command {
Command::new("excel-rs")
Expand All @@ -13,7 +13,8 @@ fn cli() -> Command {
Command::new("csv")
.about("Convert a csv file to xlsx")
.arg(arg!(--in <FILE> "csv file to convert"))
.arg(arg!(--out <FILE> "xlsx output file name")),
.arg(arg!(--out <FILE> "xlsx output file name"))
.arg(arg!(--filter "Freeze the top row and add auto-filters")),
)
}

Expand All @@ -25,30 +26,55 @@ fn main() {
let input = sub_matches.get_one::<String>("in").expect("required");
let out = sub_matches.get_one::<String>("out").expect("required");

let apply_filter = sub_matches.get_flag("filter");

let mut f = File::open(input).expect("input csv file not found");
let mut data: Vec<u8> = Vec::new();

f.read_to_end(&mut data).expect(&format!("Unable to read file {input}"));

let output_buffer = vec![];
let mut workbook = WorkBook::new(Cursor::new(output_buffer));
let mut worksheet = workbook.get_worksheet(String::from("Sheet 1"));
let mut worksheet = workbook.get_typed_worksheet(String::from("Sheet 1"));

// Apply filters first if requested
if apply_filter {
worksheet.freeze_top_row();
worksheet.add_auto_filter();
}

// Initialize the sheet before writing any rows
worksheet.init_sheet().expect("Failed to initialize worksheet");

let mut reader = bytes_to_csv(data.as_slice());
let headers = get_headers(&mut reader);

if headers.is_some() {
let headers_to_bytes = headers.unwrap().iter().to_owned().collect();
if let Err(e) = worksheet.write_row(headers_to_bytes) {
// Write headers with string types if present
if let Some(headers) = headers {
let headers_to_bytes = headers.iter().to_owned().collect();
let header_types = vec![TYPE_STRING; headers.len()];
if let Err(e) = worksheet.write_row(headers_to_bytes, &header_types) {
panic!("{e}");
}
}

while let Some(record) = get_next_record(&mut reader) {
let row_data = record.iter().to_owned().collect();
if let Err(e) = worksheet.write_row(row_data) {
// Get first data row to infer types
if let Some(record) = get_next_record(&mut reader) {
let row_data: Vec<&[u8]> = record.iter().to_owned().collect();
// Infer types from this row
let types = worksheet.infer_row_types(&row_data);
// Write the row using inferred types
if let Err(e) = worksheet.write_row(row_data, &types) {
panic!("{e}");
}

// Write remaining rows using the same types
while let Some(record) = get_next_record(&mut reader) {
let row_data = record.iter().to_owned().collect();
if let Err(e) = worksheet.write_row(row_data, &types) {
panic!("{e}");
}
}
}

if let Err(e) = worksheet.close() {
Expand Down
1 change: 1 addition & 0 deletions crates/excel-rs-xlsx/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ zip = { version = "2.2.0", default-features = false, features = [
"deflate-zlib-ng",
] }
anyhow = "1.0.86"
chrono = { workspace = true }
83 changes: 73 additions & 10 deletions crates/excel-rs-xlsx/src/sheet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ pub struct Sheet<'a, W: Write + Seek> {
// pub id: u16,
// pub is_closed: bool,
col_num_to_letter: Vec<Vec<u8>>,
current_row_num: u32
current_row_num: u32,
has_auto_filter: bool,
sheet_data_started: bool, // Add this to track if we've started sheetData
freeze_top_row: bool, // Add this to track if we should freeze the top row
}


impl<'a, W: Write + Seek> Sheet<'a, W> {
pub fn new(name: String, id: u16, writer: &'a mut ZipWriter<W>) -> Self {
let options = SimpleFileOptions::default()
Expand All @@ -27,22 +29,59 @@ impl<'a, W: Write + Seek> Sheet<'a, W> {
.start_file(format!("xl/worksheets/sheet{}.xml", id), options)
.ok();

// Writes Sheet Header
writer.write(b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\">\n<sheetData>\n").ok();

writer.write(b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n\
<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" \
xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\">\n").ok();

Sheet {
sheet_buf: writer,
// id,
_name: name,
// is_closed: false,
col_num_to_letter: Vec::with_capacity(64),
current_row_num: 0
current_row_num: 0,
has_auto_filter: false,
sheet_data_started: false,
freeze_top_row: false,
}
}

/// Requests that the first row be frozen.
///
/// Only sets the `freeze_top_row` flag; no XML is written here. `init_sheet`
/// reads the flag and emits the `<sheetViews>` pane, so call this before
/// `init_sheet` for it to take effect.
pub fn freeze_top_row(&mut self) {
self.freeze_top_row = true;
}

/// Writes the `<sheetViews>` element that freezes the first row (pane split
/// at `ySplit="1"`, scrolling area starting at `A2`).
///
/// Returns `Ok(())` without writing anything when `sheet_data_started` is
/// already set, because `<sheetViews>` has to be emitted before `<sheetData>`.
///
/// # Errors
/// Propagates any I/O error reported by the underlying zip writer.
fn write_sheet_views(&mut self) -> Result<()> {
    if self.sheet_data_started {
        return Ok(()); // Can't write sheetViews after sheetData has started
    }

    // `write_all` instead of `write`: a bare `write` may accept only part of
    // the buffer, and its returned byte count was being discarded.
    self.sheet_buf.write_all(
        b"<sheetViews>\n\
        <sheetView tabSelected=\"1\" workbookViewId=\"0\" zoomScale=\"100\">\n\
        <pane ySplit=\"1\" xSplit=\"0\" topLeftCell=\"A2\" activePane=\"bottomLeft\" state=\"frozen\" />\n\
        <selection pane=\"topLeft\" />\n\
        <selection pane=\"bottomLeft\" activeCell=\"A2\" sqref=\"A2\" />\n\
        </sheetView>\n\
        </sheetViews>\n",
    )?;

    // NOTE(review): the flag is raised here although only `<sheetViews>` has
    // been written so far; `TypedSheet` raises it in `init_sheet` after
    // `<sheetData>` instead. Kept as-is to preserve behaviour; confirm which
    // of the two is intended.
    self.sheet_data_started = true;

    Ok(())
}

/// Finishes the worksheet preamble and opens `<sheetData>`.
///
/// Emits `<sheetViews>` first when `freeze_top_row` was requested, then the
/// `<sheetData>` start tag. The constructor no longer writes `<sheetData>`,
/// so this has to run before the first `write_row`.
///
/// Calling it again after `<sheetData>` has been opened is a no-op, so the
/// start tag can never be written twice.
///
/// # Errors
/// Propagates any I/O error reported by the underlying zip writer.
pub fn init_sheet(&mut self) -> Result<()> {
    // Checked before `write_sheet_views`, which may raise the flag itself.
    if self.sheet_data_started {
        return Ok(());
    }

    // Write sheetViews if requested
    if self.freeze_top_row {
        self.write_sheet_views()?;
    }

    // `write_all` guarantees the whole tag is written; `write` may stop short.
    self.sheet_buf.write_all(b"<sheetData>\n")?;
    // Record the state on every path, not only when the top row is frozen.
    self.sheet_data_started = true;
    Ok(())
}

// TODO: Use ShortVec over Vec for cell ID
pub fn write_row(&mut self, data: Vec<&[u8]>) -> Result<()> {
self.current_row_num += 1;
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is done in typed_sheet.rs but not here. Not incrementing this keeps the next write a little smaller, especially in my 500K row file. But for me, not having that there consistently crashed Excel (probably due to my other changes). This could explain some of the slowdown I'm seeing.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I just realised I might've missed this line, which coincidentally helped speed up the writing significantly. While making this library I've stumbled across many unusual and unexpected behaviours of Excel. It seems that if you only add data and no other modifiers to an xlsx sheet file, Excel will actually fix the row numbers for you automatically, which is why I didn't catch this bug myself.

With that being said, there are a number of "excel bugs" I exploited to get this to write as quickly as possible (such as forcing all types to be "str"). I understand that this is a dealbreaker for a number of use cases which is why typed_sheet was created to solve this. In typed_sheet, you accept a (somewhat slight) performance hit in exchange for accurate typing of cell data.

Honestly, I'm inclined to keep this behaviour for the normal sheet.rs. As such, if you're having trouble working around the invalid or missing row numbers, I would recommend you build this feature on top of typed_sheet.rs instead. I have a use case that requires me to write millions of rows and hundreds of columns, and simply adding the row numbers adds close to 45 seconds in my tests.

All the best and I hope this helps clear up some confusion!


let mut final_vec = Vec::with_capacity(512 * data.len());

// TODO: Proper Error Handling
Expand Down Expand Up @@ -115,16 +154,40 @@ impl<'a, W: Write + Seek> Sheet<'a, W> {
}

pub fn close(&mut self) -> Result<()> {
self.sheet_buf.write(b"\n</sheetData>\n</worksheet>\n")?;
// Close sheetData
self.sheet_buf.write(b"</sheetData>\n")?;

// Write autoFilter if requested
if self.has_auto_filter {
let num_columns = self.col_num_to_letter.len();
if num_columns > 0 {
let last_col_letter = self.col_to_letter(num_columns - 1);
let auto_filter_range = format!("A1:{}1", String::from_utf8_lossy(last_col_letter));
self.sheet_buf.write(format!("<autoFilter ref=\"{}\"/>\n", auto_filter_range).as_bytes())?;
}
}

// Close worksheet
self.sheet_buf.write(b"</worksheet>")?;
Ok(())
}

/// Requests an auto-filter on the header row.
///
/// Only sets the `has_auto_filter` flag. `close` reads it and, when
/// `col_num_to_letter` is non-empty, writes `<autoFilter ref="A1:<last column>1"/>`
/// after `</sheetData>`.
pub fn add_auto_filter(&mut self) {
self.has_auto_filter = true;
}

fn num_to_bytes(&self, n: u32) -> ([u8; 9], usize) {
// Convert from number to string manually
let mut row_in_chars_arr: [u8; 9] = [0; 9];
let mut row = n;
let mut char_pos = 8;
let mut digits = 0;

if row == 0 {
row_in_chars_arr[8] = b'0';
return (row_in_chars_arr, 1);
}

while row > 0 {
row_in_chars_arr[char_pos] = b'0' + (row % 10) as u8;
row = row / 10;
Expand Down
92 changes: 81 additions & 11 deletions crates/excel-rs-xlsx/src/typed_sheet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,20 @@ use std::{

use anyhow::Result;
use zip::{write::SimpleFileOptions, ZipWriter};
use chrono::NaiveDateTime;

// Cell type tags accepted in the `types` argument of `write_row` and returned
// by `infer_row_types`. Presumably these are written verbatim into each
// cell's `t` attribute; confirm against `write_row`.
// The explicit `'static` is dropped: it is implied for `const` references.

/// Type tag for numeric cells.
pub const TYPE_NUMBER: &str = "n";
/// Type tag for date cells.
pub const TYPE_DATE: &str = "d";
/// Type tag for string cells.
pub const TYPE_STRING: &str = "str";

/// Writer for a single worksheet part (`xl/worksheets/sheet{id}.xml`) whose
/// rows are written together with a per-column cell type.
pub struct TypedSheet<'a, W: Write + Seek> {
/// Zip archive writer that receives the worksheet XML.
pub sheet_buf: &'a mut ZipWriter<W>,
/// Sheet name handed to the constructor.
pub _name: String,
// pub id: u16,
// pub is_closed: bool,
/// Column letters as bytes, indexed by column number; presumably a cache
/// filled by `col_to_letter`. `close` uses its length as the column count
/// for the auto-filter range.
col_num_to_letter: Vec<Vec<u8>>,
/// Number of the most recently written row; starts at 0 and is incremented
/// at the top of `write_row`.
current_row_num: u32,
/// Set by `add_auto_filter`; makes `close` emit an `<autoFilter>` element.
has_auto_filter: bool,
/// Set by `init_sheet` once the `<sheetData>` start tag has been written.
sheet_data_started: bool,
/// Set by `freeze_top_row`; makes `init_sheet` emit a frozen-pane `<sheetViews>`.
freeze_top_row: bool,
}

impl<'a, W: Write + Seek> TypedSheet<'a, W> {
Expand All @@ -26,26 +32,59 @@ impl<'a, W: Write + Seek> TypedSheet<'a, W> {
.start_file(format!("xl/worksheets/sheet{}.xml", id), options)
.ok();

// Writes Sheet Header
writer.write(b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\">\n<sheetData>\n").ok();
writer.write(b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n\
<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" \
xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\">\n").ok();

TypedSheet {
sheet_buf: writer,
// id,
_name: name,
// is_closed: false,
col_num_to_letter: Vec::with_capacity(64),
current_row_num: 0,
has_auto_filter: false,
sheet_data_started: false,
freeze_top_row: false,
}
}

/// Requests that the first row be frozen.
///
/// Only sets the `freeze_top_row` flag; `init_sheet` reads it and writes the
/// `<sheetViews>` pane, so call this before `init_sheet`.
pub fn freeze_top_row(&mut self) {
self.freeze_top_row = true;
}

/// Requests an auto-filter on the header row.
///
/// Only sets the `has_auto_filter` flag; `close` reads it and, when
/// `col_num_to_letter` is non-empty, writes `<autoFilter ref="A1:<last column>1"/>`
/// after `</sheetData>`.
pub fn add_auto_filter(&mut self) {
self.has_auto_filter = true;
}

/// Writes the `<sheetViews>` element that freezes the first row (pane split
/// at `ySplit="1"`, scrolling area starting at `A2`).
///
/// Returns `Ok(())` without writing anything when `sheet_data_started` is
/// already set, because `<sheetViews>` has to be emitted before `<sheetData>`.
///
/// # Errors
/// Propagates any I/O error reported by the underlying zip writer.
fn write_sheet_views(&mut self) -> Result<()> {
    if self.sheet_data_started {
        return Ok(());
    }

    // `write_all` instead of `write`: a bare `write` may accept only part of
    // the buffer, and its returned byte count was being discarded.
    self.sheet_buf.write_all(
        b"<sheetViews>\n\
        <sheetView tabSelected=\"1\" workbookViewId=\"0\" zoomScale=\"100\">\n\
        <pane ySplit=\"1\" xSplit=\"0\" topLeftCell=\"A2\" activePane=\"bottomLeft\" state=\"frozen\" />\n\
        <selection pane=\"topLeft\" />\n\
        <selection pane=\"bottomLeft\" activeCell=\"A2\" sqref=\"A2\" />\n\
        </sheetView>\n\
        </sheetViews>\n",
    )?;

    Ok(())
}

/// Finishes the worksheet preamble and opens `<sheetData>`.
///
/// Emits `<sheetViews>` first when `freeze_top_row` was requested, then the
/// `<sheetData>` start tag. The constructor does not write `<sheetData>`, so
/// this has to run before the first `write_row`.
///
/// Calling it again after `<sheetData>` has been opened is a no-op, so the
/// start tag can never be written twice.
///
/// # Errors
/// Propagates any I/O error reported by the underlying zip writer.
pub fn init_sheet(&mut self) -> Result<()> {
    if self.sheet_data_started {
        return Ok(());
    }

    if self.freeze_top_row {
        self.write_sheet_views()?;
    }

    // `write_all` guarantees the whole tag is written; `write` may stop short.
    self.sheet_buf.write_all(b"<sheetData>\n")?;
    self.sheet_data_started = true;
    Ok(())
}

// TODO: Use ShortVec over Vec for cell ID
pub fn write_row(&mut self, data: Vec<&[u8]>, types: &Vec<&str>) -> Result<()> {
pub fn write_row(&mut self, data: Vec<&[u8]>, types: &Vec<&'static str>) -> Result<()> {
self.current_row_num += 1;

let mut final_vec = Vec::with_capacity(512 * data.len());

// TODO: Proper Error Handling
let (row_in_chars_arr, digits) = self.num_to_bytes(self.current_row_num);

final_vec.write(b"<row r=\"")?;
Expand Down Expand Up @@ -108,6 +147,27 @@ impl<'a, W: Write + Seek> TypedSheet<'a, W> {
Ok(())
}

pub fn infer_row_types(&self, data: &[&[u8]]) -> Vec<&'static str> {
data.iter()
.map(|field| {
let s = String::from_utf8_lossy(field);
if s.parse::<i64>().is_ok() {
TYPE_NUMBER
} else if s.parse::<f64>().is_ok() {
TYPE_NUMBER
} else if let Ok(_) = NaiveDateTime::parse_from_str(&s, "%Y-%m-%d") {
TYPE_DATE
} else if let Ok(_) = NaiveDateTime::parse_from_str(&s, "%m/%d/%Y") {
TYPE_DATE
} else if let Ok(_) = NaiveDateTime::parse_from_str(&s, "%d/%m/%Y") {
TYPE_DATE
} else {
TYPE_STRING
}
})
.collect()
}

fn escape_in_place(&self, bytes: &[u8]) -> (VecDeque<&[u8]>, VecDeque<usize>) {
let mut special_chars: VecDeque<&[u8]> = VecDeque::new();
let mut special_char_pos: VecDeque<usize> = VecDeque::new();
Expand Down Expand Up @@ -142,12 +202,22 @@ impl<'a, W: Write + Seek> TypedSheet<'a, W> {
}

pub fn close(&mut self) -> Result<()> {
self.sheet_buf.write(b"\n</sheetData>\n</worksheet>\n")?;
self.sheet_buf.write(b"</sheetData>\n")?;

if self.has_auto_filter {
let num_columns = self.col_num_to_letter.len();
if num_columns > 0 {
let last_col_letter = self.col_to_letter(num_columns - 1);
let auto_filter_range = format!("A1:{}1", String::from_utf8_lossy(last_col_letter));
self.sheet_buf.write(format!("<autoFilter ref=\"{}\"/>\n", auto_filter_range).as_bytes())?;
}
}

self.sheet_buf.write(b"</worksheet>")?;
Ok(())
}

fn num_to_bytes(&self, n: u32) -> ([u8; 9], usize) {
// Convert from number to string manually
let mut row_in_chars_arr: [u8; 9] = [0; 9];
let mut row = n;
let mut char_pos = 8;
Expand Down
Loading