Skip to content

Commit

Permalink
split: fix bug with large arguments to -C
Browse files Browse the repository at this point in the history
Fix the behavior of `split -C` when given large arguments. Before this
commit, bytes were being greedily assigned to a chunk too aggressively,
leading to a situation where a split happened in the middle of a line
even though the entire line could have fit within the next chunk. This
was appearing for large arguments to `-C` and long lines that extended
beyond the size of the read buffer. This commit fixes the behavior.

Fixes #7026
  • Loading branch information
jfinkels committed Jan 12, 2025
1 parent da40b35 commit 03c2e53
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 216 deletions.
325 changes: 109 additions & 216 deletions src/uu/split/src/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -919,204 +919,6 @@ impl Write for LineChunkWriter<'_> {
}
}

/// Write lines to each sequential output files, limited by bytes.
///
/// This struct maintains an underlying writer representing the
/// current chunk of the output. On each call to [`write`], it writes
/// as many lines as possible to the current chunk without exceeding
/// the specified byte limit. If a single line has more bytes than the
/// limit, then fill an entire single chunk with those bytes and
/// handle the remainder of the line as if it were its own distinct
/// line. As many new underlying writers are created as needed to
/// write all the data in the input buffer.
struct LineBytesChunkWriter<'a> {
/// Parameters for creating the underlying writer for each new chunk.
settings: &'a Settings,

/// The maximum number of bytes allowed for a single chunk of output.
chunk_size: u64,

/// Running total of number of chunks that have been completed.
num_chunks_written: usize,

/// Remaining capacity in number of bytes in the current chunk.
///
/// This number starts at `chunk_size` and decreases as lines are
/// written. Once it reaches zero, a writer for a new chunk is
/// initialized and this number gets reset to `chunk_size`.
num_bytes_remaining_in_current_chunk: usize,

/// The underlying writer for the current chunk.
///
/// Once the number of bytes written to this writer exceeds
/// `chunk_size`, a new writer is initialized and assigned to this
/// field.
inner: BufWriter<Box<dyn Write>>,

/// Iterator that yields filenames for each chunk.
filename_iterator: FilenameIterator<'a>,
}

impl<'a> LineBytesChunkWriter<'a> {
fn new(chunk_size: u64, settings: &'a Settings) -> UResult<Self> {
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?;
let filename = filename_iterator
.next()
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
if settings.verbose {
println!("creating file {}", filename.quote());
}
let inner = settings.instantiate_current_writer(&filename, true)?;
Ok(LineBytesChunkWriter {
settings,
chunk_size,
num_bytes_remaining_in_current_chunk: usize::try_from(chunk_size).unwrap(),
num_chunks_written: 0,
inner,
filename_iterator,
})
}
}

impl Write for LineBytesChunkWriter<'_> {
/// Write as many lines to a chunk as possible without
/// exceeding the byte limit. If a single line has more bytes
/// than the limit, then fill an entire single chunk with those
/// bytes and handle the remainder of the line as if it were
/// its own distinct line.
///
/// For example: if the `chunk_size` is 8 and the input is:
///
/// ```text
/// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
/// ```
///
/// then the output gets broken into chunks like this:
///
/// ```text
/// chunk 0 chunk 1 chunk 2 chunk 3
///
/// 0 1 2
/// 01234567 89 01234 56789 012 345 6
/// |------| |-------| |--------| |---|
/// aaaaaaaa a\nbbbb\n cccc\ndd\n ee\n
/// ```
///
/// Implements `--line-bytes=SIZE`
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
// The total number of bytes written during the loop below.
//
// It is necessary to keep this running total because we may
// be making multiple calls to `write()` on multiple different
// underlying writers and we want the final reported number of
// bytes written to reflect the total number of bytes written
// to all of the underlying writers.
let mut total_bytes_written = 0;

// Loop until we have written all bytes in the input buffer
// (or an IO error occurs).
loop {
// If the buffer is empty, then we are done writing.
if buf.is_empty() {
return Ok(total_bytes_written);
}

// If we have filled the current chunk with bytes, then
// start a new chunk and initialize its corresponding
// writer.
if self.num_bytes_remaining_in_current_chunk == 0 {
self.num_chunks_written += 1;
let filename = self.filename_iterator.next().ok_or_else(|| {
std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
})?;
if self.settings.verbose {
println!("creating file {}", filename.quote());
}
self.inner = self.settings.instantiate_current_writer(&filename, true)?;
self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
}

// Find the first separator (default - newline character) in the buffer.
let sep = self.settings.separator;
match memchr::memchr(sep, buf) {
// If there is no separator character and the buffer is
// not empty, then write as many bytes as we can and
// then move on to the next chunk if necessary.
None => {
let end = self.num_bytes_remaining_in_current_chunk;

// This is ugly but here to match GNU behavior. If the input
// doesn't end with a separator, pretend that it does for handling
// the second to last segment chunk. See `line-bytes.sh`.
if end == buf.len()
&& self.num_bytes_remaining_in_current_chunk
< self.chunk_size.try_into().unwrap()
&& buf[buf.len() - 1] != sep
{
self.num_bytes_remaining_in_current_chunk = 0;
} else {
let num_bytes_written = custom_write(
&buf[..end.min(buf.len())],
&mut self.inner,
self.settings,
)?;
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
total_bytes_written += num_bytes_written;
buf = &buf[num_bytes_written..];
}
}

// If there is a separator character and the line
// (including the separator character) will fit in the
// current chunk, then write the entire line and
// continue to the next iteration. (See chunk 1 in the
// example comment above.)
Some(i) if i < self.num_bytes_remaining_in_current_chunk => {
let num_bytes_written =
custom_write(&buf[..=i], &mut self.inner, self.settings)?;
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
total_bytes_written += num_bytes_written;
buf = &buf[num_bytes_written..];
}

// If there is a separator character, the line
// (including the separator character) will not fit in
// the current chunk, *and* no other lines have been
// written to the current chunk, then write as many
// bytes as we can and continue to the next
// iteration. (See chunk 0 in the example comment
// above.)
Some(_)
if self.num_bytes_remaining_in_current_chunk
== self.chunk_size.try_into().unwrap() =>
{
let end = self.num_bytes_remaining_in_current_chunk;
let num_bytes_written =
custom_write(&buf[..end], &mut self.inner, self.settings)?;
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
total_bytes_written += num_bytes_written;
buf = &buf[num_bytes_written..];
}

// If there is a separator character, the line
// (including the separator character) will not fit in
// the current chunk, and at least one other line has
// been written to the current chunk, then signal to
// the next iteration that a new chunk needs to be
// created and continue to the next iteration of the
// loop to try writing the line there.
Some(_) => {
self.num_bytes_remaining_in_current_chunk = 0;
}
}
}
}

fn flush(&mut self) -> std::io::Result<()> {
self.inner.flush()
}
}

/// Output file parameters
struct OutFile {
filename: String,
Expand Down Expand Up @@ -1629,6 +1431,114 @@ where
Ok(())
}

/// Like `std::io::Lines`, but includes the line ending character.
///
/// This struct is generally created by calling `lines_with_sep` on a
/// reader.
pub struct LinesWithSep<R> {
inner: R,
separator: u8,
}

impl<R> Iterator for LinesWithSep<R>
where
R: BufRead,
{
type Item = std::io::Result<Vec<u8>>;

/// Read bytes from a buffer up to the requested number of lines.
fn next(&mut self) -> Option<Self::Item> {
let mut buf = vec![];
match self.inner.read_until(self.separator, &mut buf) {
Ok(0) => None,
Ok(_) => Some(Ok(buf)),
Err(e) => Some(Err(e)),
}
}
}

/// Like `std::str::lines` but includes the line ending character.
///
/// The `separator` defines the character to interpret as the line
/// ending. For the usual notion of "line", set this to `b'\n'`.
pub fn lines_with_sep<R>(reader: R, separator: u8) -> LinesWithSep<R>
where
R: BufRead,
{
LinesWithSep {
inner: reader,
separator,
}
}

fn line_bytes<R>(settings: &Settings, reader: &mut R, chunk_size: usize) -> UResult<()>
where
R: BufRead,
{
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?;

// Initialize the writer just to satisfy the compiler. It is going
// to be overwritten for sure at the beginning of the loop below
// because we start with `remaining == 0`, indicating that a new
// chunk should start.
let mut writer: BufWriter<Box<dyn Write>> =
BufWriter::new(Box::new(std::io::Cursor::new(vec![])));

let mut remaining = 0;
for line in lines_with_sep(reader, settings.separator) {
let line = line?;
let mut line = &line[..];
loop {
if remaining == 0 {
let filename = filename_iterator
.next()
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
if settings.verbose {
println!("creating file {}", filename.quote());
}
writer = settings.instantiate_current_writer(&filename, true)?;
remaining = chunk_size;
}

// Special case: if this is the last line and it doesn't end
// with a newline character, then count its length as though
// it did end with a newline. If that puts it over the edge
// of this chunk, continue to the next chunk.
if line.len() == remaining
&& remaining < chunk_size
&& line[line.len() - 1] != settings.separator
{
remaining = 0;
continue;
}

// If the entire line fits in this chunk, write it and
// continue to the next line.
if line.len() <= remaining {
custom_write_all(line, &mut writer, settings)?;
remaining -= line.len();
break;
}

// If the line is too large to fit in *any* chunk and we are
// at the start of a new chunk, write as much as we can of
// it and pass the remainder along to the next chunk.
if line.len() > chunk_size && remaining == chunk_size {
custom_write_all(&line[..chunk_size], &mut writer, settings)?;
line = &line[chunk_size..];
remaining = 0;
continue;
}

// If the line is too large to fit in *this* chunk, but
// might otherwise fit in the next chunk, then just continue
// to the next chunk and let it be handled there.
remaining = 0;
}
}
Ok(())
}

#[allow(clippy::cognitive_complexity)]
fn split(settings: &Settings) -> UResult<()> {
let r_box = if settings.input == "-" {
Expand Down Expand Up @@ -1701,23 +1611,6 @@ fn split(settings: &Settings) -> UResult<()> {
},
}
}
Strategy::LineBytes(chunk_size) => {
let mut writer = LineBytesChunkWriter::new(chunk_size, settings)?;
match std::io::copy(&mut reader, &mut writer) {
Ok(_) => Ok(()),
Err(e) => match e.kind() {
// TODO Since the writer object controls the creation of
// new files, we need to rely on the `std::io::Result`
// returned by its `write()` method to communicate any
// errors to this calling scope. If a new file cannot be
// created because we have exceeded the number of
// allowable filenames, we use `ErrorKind::Other` to
// indicate that. A special error message needs to be
// printed in that case.
ErrorKind::Other => Err(USimpleError::new(1, format!("{e}"))),
_ => Err(uio_error!(e, "input/output error")),
},
}
}
Strategy::LineBytes(chunk_size) => line_bytes(settings, &mut reader, chunk_size as usize),
}
}
17 changes: 17 additions & 0 deletions tests/by-util/test_split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1973,3 +1973,20 @@ fn test_split_separator_same_multiple() {
.args(&["-t:", "-t:", "-t,", "fivelines.txt"])
.fails();
}

#[test]
fn test_long_lines() {
let (at, mut ucmd) = at_and_ucmd!();
let line1 = format!("{:131070}\n", "");
let line2 = format!("{:1}\n", "");
let line3 = format!("{:131071}\n", "");
let infile = [line1, line2, line3].concat();
ucmd.args(&["-C", "131072"])
.pipe_in(infile)
.succeeds()
.no_output();
assert_eq!(at.read("xaa").len(), 131_071);
assert_eq!(at.read("xab").len(), 2);
assert_eq!(at.read("xac").len(), 131_072);
assert!(!at.plus("xad").exists());
}

0 comments on commit 03c2e53

Please sign in to comment.