From e75132e36ce7b240eded202bff4c631621027999 Mon Sep 17 00:00:00 2001 From: Vladimir Kochnev Date: Thu, 11 Jul 2024 21:15:32 +0000 Subject: [PATCH] Avoid dynamic parse method dispatch for faster access (#311) On some benchmarks it seems to make a difference: - `quoted` from `benchmark/parse.yaml` - `quote_char_nil` from `benchmark/parse_quote_char_nil.yaml` ``` N_ROWS=5000 rake benchmark:parse benchmark:parse_liberal_parsing benchmark:parse_quote_char_nil benchmark:parse_strip ``` ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/vladimirkochnev/.asdf/installs/ruby/3.3.3/bin/ruby -v -S benchmark-driver /Users/vladimirkochnev/code/csv/benchmark/parse.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23] Calculating ------------------------------------- csv 3.3.0 master unquoted 22.147 22.131 i/s - 100.000 times in 4.515187s 4.518589s quoted 11.517 12.997 i/s - 100.000 times in 8.682986s 7.694298s mixed 14.097 13.964 i/s - 100.000 times in 7.093660s 7.161389s include_col_sep 5.214 5.188 i/s - 100.000 times in 19.178537s 19.277059s include_row_sep 5.195 5.101 i/s - 100.000 times in 19.250419s 19.605061s encode_utf-8 16.030 15.984 i/s - 100.000 times in 6.238449s 6.256427s encode_sjis 16.546 16.376 i/s - 100.000 times in 6.043603s 6.106603s Comparison: unquoted csv 3.3.0: 22.1 i/s master: 22.1 i/s - 1.00x slower quoted master: 13.0 i/s csv 3.3.0: 11.5 i/s - 1.13x slower mixed csv 3.3.0: 14.1 i/s master: 14.0 i/s - 1.01x slower include_col_sep csv 3.3.0: 5.2 i/s master: 5.2 i/s - 1.01x slower include_row_sep csv 3.3.0: 5.2 i/s master: 5.1 i/s - 1.02x slower encode_utf-8 csv 3.3.0: 16.0 i/s master: 16.0 i/s - 1.00x slower encode_sjis csv 3.3.0: 16.5 i/s master: 16.4 i/s - 1.01x slower ``` ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/vladimirkochnev/.asdf/installs/ruby/3.3.3/bin/ruby -v -S benchmark-driver /Users/vladimirkochnev/code/csv/benchmark/parse_liberal_parsing.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23] Calculating ------------------------------------- csv 3.3.0 master unquoted 8.132 8.250 i/s - 100.000 times in 12.297793s 12.121689s unquoted_backslash_quote 3.868 3.866 i/s - 100.000 times in 25.849956s 25.869413s quoted 3.642 3.638 i/s - 100.000 times in 27.454032s 27.484247s quoted_double_quote_outside_quote 2.277 2.202 i/s - 100.000 times in 43.921488s 45.419138s quoted_backslash_quote 1.801 1.803 i/s - 100.000 times in 55.522265s 55.464641s include_col_sep 3.644 3.633 i/s - 100.000 times in 27.440353s 27.528626s include_row_sep 3.629 3.614 i/s - 100.000 times in 27.559354s 27.670274s encode_utf-8 8.149 8.136 i/s - 100.000 times in 12.270936s 12.290646s encode_sjis 8.527 8.425 i/s - 100.000 times in 11.727969s 11.868855s Comparison: unquoted master: 8.2 i/s csv 3.3.0: 8.1 i/s - 1.01x slower unquoted_backslash_quote csv 3.3.0: 3.9 i/s master: 3.9 i/s - 1.00x slower quoted csv 3.3.0: 3.6 i/s master: 3.6 i/s - 1.00x slower quoted_double_quote_outside_quote csv 3.3.0: 2.3 i/s master: 2.2 i/s - 1.03x slower quoted_backslash_quote master: 1.8 i/s csv 3.3.0: 1.8 i/s - 1.00x slower include_col_sep csv 3.3.0: 3.6 i/s master: 3.6 i/s - 1.00x slower include_row_sep csv 3.3.0: 3.6 i/s master: 3.6 i/s - 1.00x slower encode_utf-8 csv 3.3.0: 8.1 i/s master: 8.1 i/s - 1.00x slower encode_sjis csv 3.3.0: 8.5 i/s master: 8.4 i/s - 1.01x slower ``` ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/vladimirkochnev/.asdf/installs/ruby/3.3.3/bin/ruby -v -S benchmark-driver /Users/vladimirkochnev/code/csv/benchmark/parse_quote_char_nil.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23] Calculating ------------------------------------- csv 3.3.0 master without_quote_char 22.840 22.844 i/s - 100.000 times in 4.378284s 4.377488s quote_char_nil 32.370 43.729 i/s - 100.000 times in 3.089285s 2.286831s col_sep_space 12.135 12.106 i/s - 100.000 times in 8.240368s 8.260030s Comparison: without_quote_char master: 22.8 i/s csv 3.3.0: 22.8 i/s - 1.00x slower quote_char_nil master: 43.7 i/s csv 3.3.0: 32.4 i/s - 1.35x slower col_sep_space csv 3.3.0: 12.1 i/s master: 12.1 i/s - 1.00x slower ``` ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/vladimirkochnev/.asdf/installs/ruby/3.3.3/bin/ruby -v -S benchmark-driver /Users/vladimirkochnev/code/csv/benchmark/parse_strip.yaml ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin23] Calculating ------------------------------------- csv 3.3.0 master default 13.132 13.043 i/s - 100.000 times in 7.615051s 7.667227s no_quote_strip 8.955 8.957 i/s - 100.000 times in 11.167272s 11.164189s Comparison: default csv 3.3.0: 13.1 i/s master: 13.0 i/s - 1.01x slower no_quote_strip master: 9.0 i/s csv 3.3.0: 9.0 i/s - 1.00x slower ``` --- benchmark/convert_nil.yaml | 2 ++ benchmark/parse.yaml | 2 ++ benchmark/parse_liberal_parsing.yaml | 2 ++ benchmark/parse_quote_char_nil.yaml | 2 ++ benchmark/parse_strip.yaml | 2 ++ benchmark/read.yaml | 2 ++ benchmark/shift.yaml | 2 ++ benchmark/write.yaml | 2 ++ lib/csv/parser.rb | 23 ++++++++++------------- 9 files changed, 26 insertions(+), 13 deletions(-) diff --git a/benchmark/convert_nil.yaml b/benchmark/convert_nil.yaml index f32c6f10..76613806 100644 --- a/benchmark/convert_nil.yaml +++ b/benchmark/convert_nil.yaml @@ -4,6 +4,8 @@ contexts: csv: 3.0.1 - gems: csv: 3.0.2 + - gems: + csv: 3.3.0 - name: "master" prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml index 25ccaf24..4289995f 100644 --- a/benchmark/parse.yaml +++ b/benchmark/parse.yaml @@ -4,6 +4,8 @@ contexts: csv: 3.0.1 - gems: csv: 3.0.2 + - gems: + csv: 3.3.0 - name: "master" prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) diff --git a/benchmark/parse_liberal_parsing.yaml b/benchmark/parse_liberal_parsing.yaml index dcbf5985..41307e11 100644 --- a/benchmark/parse_liberal_parsing.yaml +++ b/benchmark/parse_liberal_parsing.yaml @@ -2,6 +2,8 @@ loop_count: 100 contexts: - gems: csv: 3.0.2 + - gems: + csv: 3.3.0 - name: "master" prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) diff --git a/benchmark/parse_quote_char_nil.yaml b/benchmark/parse_quote_char_nil.yaml index f92fd33b..f9cb8283 100644 --- a/benchmark/parse_quote_char_nil.yaml +++ b/benchmark/parse_quote_char_nil.yaml @@ -1,5 +1,7 @@ loop_count: 100 contexts: + - gems: + csv: 3.3.0 - name: "master" prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) diff --git a/benchmark/parse_strip.yaml b/benchmark/parse_strip.yaml index a0230fd1..fe9dbeaa 100644 --- a/benchmark/parse_strip.yaml +++ b/benchmark/parse_strip.yaml @@ -1,5 +1,7 @@ loop_count: 100 contexts: + - gems: + csv: 3.3.0 - name: "master" prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) diff --git a/benchmark/read.yaml b/benchmark/read.yaml index b06dbe16..15fc26f5 100644 --- a/benchmark/read.yaml +++ b/benchmark/read.yaml @@ -4,6 +4,8 @@ contexts: csv: 3.0.1 - gems: csv: 3.0.2 + - gems: + csv: 3.3.0 - name: "master" prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) diff --git a/benchmark/shift.yaml b/benchmark/shift.yaml index eb6fd80e..8011a4fa 100644 --- a/benchmark/shift.yaml +++ b/benchmark/shift.yaml @@ -4,6 +4,8 @@ contexts: csv: 3.0.1 - gems: csv: 3.0.2 + - gems: + csv: 3.3.0 - name: "master" prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) diff --git a/benchmark/write.yaml b/benchmark/write.yaml index 5b7d1943..019927c2 100644 --- a/benchmark/write.yaml +++ b/benchmark/write.yaml @@ -4,6 +4,8 @@ contexts: csv: 3.0.1 - gems: csv: 3.0.2 + - gems: + csv: 3.3.0 - name: "master" prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) diff --git a/lib/csv/parser.rb b/lib/csv/parser.rb index d7fcd10e..eda820b7 100644 --- a/lib/csv/parser.rb +++ b/lib/csv/parser.rb @@ -409,13 +409,7 @@ def parse(&block) begin @scanner ||= build_scanner - if quote_character.nil? - parse_no_quote(&block) - elsif @need_robust_parsing - parse_quotable_robust(&block) - else - parse_quotable_loose(&block) - end + __send__(@parse_method, &block) rescue InvalidEncoding if @scanner ignore_broken_line @@ -459,7 +453,6 @@ def prepare end def prepare_variable - @need_robust_parsing = false @encoding = @options[:encoding] liberal_parsing = @options[:liberal_parsing] if liberal_parsing @@ -472,7 +465,6 @@ def prepare_variable @double_quote_outside_quote = false @backslash_quote = false end - @need_robust_parsing = true else @liberal_parsing = false @backslash_quote = false @@ -554,7 +546,6 @@ def prepare_strip @rstrip_value = Regexp.new(@escaped_strip + "+\\z".encode(@encoding)) end - @need_robust_parsing = true elsif @strip strip_values = " \t\f\v" @escaped_strip = strip_values.encode(@encoding) @@ -562,7 +553,6 @@ def prepare_strip @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding)) @rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding)) end - @need_robust_parsing = true end end @@ -808,6 +798,13 @@ def adjust_headers(headers, quoted_fields) def prepare_parser @may_quoted = may_quoted? + if @quote_character.nil? + @parse_method = :parse_no_quote + elsif @liberal_parsing or @strip + @parse_method = :parse_quotable_robust + else + @parse_method = :parse_quotable_loose + end end def may_quoted? @@ -987,7 +984,7 @@ def parse_quotable_loose(&block) quoted_fields = [] elsif line.include?(@cr) or line.include?(@lf) @scanner.keep_back - @need_robust_parsing = true + @parse_method = :parse_quotable_robust return parse_quotable_robust(&block) else row = line.split(@split_column_separator, -1) @@ -1011,7 +1008,7 @@ def parse_quotable_loose(&block) row[i] = column[1..-2] else @scanner.keep_back - @need_robust_parsing = true + @parse_method = :parse_quotable_robust return parse_quotable_robust(&block) end validate_field_size(row[i])