Skip to content

Commit

Permalink
Fix a bug that wrong parse result with :skip_lines
Browse files Browse the repository at this point in the history
fix GH-296

This is caused by a bug of nested InputsScanner#keep_start/keep_back.
It may back duplicated data when an internal scanner accepts multiple
keep_starts. :skip_lines may cause this situation when a line includes
"\n" and the row separator is "\r\n".

Reported by Ryo Tsukamoto. Thanks!!!
  • Loading branch information
kou committed Mar 22, 2024
1 parent 3ae9194 commit 86de331
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 15 deletions.
9 changes: 9 additions & 0 deletions lib/csv/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,15 @@ def keep_back
end
# trace(__method__, :repos, start, buffer)
@scanner.pos = start
last_scanner, last_start, last_buffer = @keeps.last
# Drop the last buffer when the last buffer is the same data
# in the last keep. If we keep it, we have duplicated data
# by the next keep_back.
if last_scanner == @scanner and
last_buffer and
last_buffer == last_scanner.string.byteslice(last_start, start)
@keeps.last[2] = nil
end
end
read_chunk if @scanner.eos?
end
Expand Down
14 changes: 14 additions & 0 deletions test/csv/parse/test_inputs_scanner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,20 @@
class TestCSVParseInputsScanner < Test::Unit::TestCase
include CSVHelper

def test_scan_keep_nested_back
input = CSV::Parser::UnoptimizedStringIO.new("abcdef")
scanner = CSV::Parser::InputsScanner.new([input],
Encoding::UTF_8,
nil)
scanner.keep_start
assert_equal("abc", scanner.scan_all(/[a-c]+/))
scanner.keep_start
assert_equal("def", scanner.scan_all(/[d-f]+/))
scanner.keep_back
scanner.keep_back
assert_equal("abcdef", scanner.scan_all(/[a-f]+/))
end

def test_scan_keep_over_chunks_nested_back
input = CSV::Parser::UnoptimizedStringIO.new("abcdefghijklmnl")
scanner = CSV::Parser::InputsScanner.new([input],
Expand Down
48 changes: 33 additions & 15 deletions test/csv/parse/test_skip_lines.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ def test_default
assert_nil(csv.skip_lines)
end

def parse(data, **options)
# We use Tempfile here to use CSV::Parser::InputsScanner.
Tempfile.open(["csv-", ".csv"]) do |file|
file.print(data)
file.close
CSV.open(file, **options) do |csv|
csv.read
end
end
end

def test_regexp
csv = <<-CSV
1
Expand All @@ -22,7 +33,7 @@ def test_regexp
["1"],
["4"],
],
CSV.parse(csv, :skip_lines => /\A\s*#/))
parse(csv, :skip_lines => /\A\s*#/))
end

def test_regexp_quoted
Expand All @@ -37,7 +48,7 @@ def test_regexp_quoted
["#3"],
["4"],
],
CSV.parse(csv, :skip_lines => /\A\s*#/))
parse(csv, :skip_lines => /\A\s*#/))
end

def test_string
Expand All @@ -51,7 +62,7 @@ def test_string
["1"],
["4"],
],
CSV.parse(csv, :skip_lines => "."))
parse(csv, :skip_lines => "."))
end

class RegexStub
Expand Down Expand Up @@ -88,7 +99,7 @@ def test_matchable
["1"],
["3"],
],
CSV.parse(csv, :skip_lines => Matchable.new(/\A#/)))
parse(csv, :skip_lines => Matchable.new(/\A#/)))
end

def test_multibyte_data
Expand All @@ -98,29 +109,36 @@ def test_multibyte_data
value = "\u3042\u3044\u3046"
with_chunk_size("5") do
assert_equal([[value], [value]],
CSV.parse("#{value}\n#{value}\n",
:skip_lines => /\A#/))
parse("#{value}\n#{value}\n",
:skip_lines => /\A#/))
end
end

def test_empty_line_and_liberal_parsing
assert_equal([["a", "b"]],
CSV.parse("a,b\n",
:liberal_parsing => true,
:skip_lines => /^$/))
parse("a,b\n",
:liberal_parsing => true,
:skip_lines => /^$/))
end

def test_crlf
assert_equal([["a", "b"]],
CSV.parse("a,b\r\n,\r\n",
:skip_lines => /^,+$/))
parse("a,b\r\n,\r\n",
:skip_lines => /^,+$/))
end

def test_crlf_strip_no_last_crlf
assert_equal([["a"], ["b"]],
CSV.parse("a\r\nb",
row_sep: "\r\n",
skip_lines: /^ *$/,
strip: true))
parse("a\r\nb",
row_sep: "\r\n",
skip_lines: /^ *$/,
strip: true))
end

def test_crlf_quoted_lf
assert_equal([["\n", ""]],
parse("\"\n\",\"\"\r\n",
row_sep: "\r\n",
skip_lines: /not matched/))
end
end

0 comments on commit 86de331

Please sign in to comment.