From cd0c2a67c482c441ac7f0a07c0f81573d6b6072f Mon Sep 17 00:00:00 2001 From: Hiroshi SHIBATA Date: Fri, 26 Aug 2022 14:53:21 +0900 Subject: [PATCH] Merge csv-3.2.4 --- lib/csv.rb | 24 ++++++--- lib/csv/fields_converter.rb | 5 +- lib/csv/parser.rb | 78 ++++++++++++++++------------- lib/csv/row.rb | 2 +- lib/csv/version.rb | 2 +- lib/csv/writer.rb | 10 ++-- test/csv/parse/test_convert.rb | 59 ++++++++++++++++++++++ test/csv/test_data_converters.rb | 84 ++++++++++++++++++++++++++++++++ test/csv/test_encodings.rb | 31 ++++++++++++ 9 files changed, 244 insertions(+), 51 deletions(-) diff --git a/lib/csv.rb b/lib/csv.rb index 31e46d91ed..20cfda4b41 100644 --- a/lib/csv.rb +++ b/lib/csv.rb @@ -95,14 +95,11 @@ require "stringio" require_relative "csv/fields_converter" require_relative "csv/input_record_separator" -require_relative "csv/match_p" require_relative "csv/parser" require_relative "csv/row" require_relative "csv/table" require_relative "csv/writer" -using CSV::MatchP if CSV.const_defined?(:MatchP) - # == \CSV # # === In a Hurry? @@ -866,8 +863,9 @@ class CSV # index:: The zero-based index of the field in its row. # line:: The line of the data source this row is from. # header:: The header for the column, when available. + # quoted?:: True or false, whether the original value is quoted or not. # - FieldInfo = Struct.new(:index, :line, :header) + FieldInfo = Struct.new(:index, :line, :header, :quoted?) # A Regexp used to find and convert some common Date formats. DateMatcher = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2},?\s+\d{2,4} | @@ -875,10 +873,9 @@ class CSV # A Regexp used to find and convert some common DateTime formats. DateTimeMatcher = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2}\s+\d{1,2}:\d{1,2}:\d{1,2},?\s+\d{2,4} | - \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2} | - # ISO-8601 + # ISO-8601 and RFC-3339 (space instead of T) recognized by DateTime.parse \d{4}-\d{2}-\d{2} - (?:T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?(?:[+-]\d{2}(?::\d{2})|Z)?)?)? + (?:[T\s]\d{2}:\d{2}(?::\d{2}(?:\.\d+)?(?:[+-]\d{2}(?::\d{2})|Z)?)?)? )\z /x # The encoding used by all converters. @@ -1893,8 +1890,19 @@ class CSV raise ArgumentError.new("Cannot parse nil as CSV") if data.nil? if data.is_a?(String) + if encoding + if encoding.is_a?(String) + data_external_encoding, data_internal_encoding = encoding.split(":", 2) + if data_internal_encoding + data = data.encode(data_internal_encoding, data_external_encoding) + else + data = data.dup.force_encoding(data_external_encoding) + end + else + data = data.dup.force_encoding(encoding) + end + end @io = StringIO.new(data) - @io.set_encoding(encoding || data.encoding) else @io = data end diff --git a/lib/csv/fields_converter.rb b/lib/csv/fields_converter.rb index b206118d99..d15977d379 100644 --- a/lib/csv/fields_converter.rb +++ b/lib/csv/fields_converter.rb @@ -44,7 +44,7 @@ class CSV @converters.empty? end - def convert(fields, headers, lineno) + def convert(fields, headers, lineno, quoted_fields) return fields unless need_convert? fields.collect.with_index do |field, index| @@ -63,7 +63,8 @@ class CSV else header = nil end - field = converter[field, FieldInfo.new(index, lineno, header)] + quoted = quoted_fields[index] + field = converter[field, FieldInfo.new(index, lineno, header, quoted)] end break unless field.is_a?(String) # short-circuit pipeline for speed end diff --git a/lib/csv/parser.rb b/lib/csv/parser.rb index 7fe5f0d3ab..afb3131cd5 100644 --- a/lib/csv/parser.rb +++ b/lib/csv/parser.rb @@ -2,15 +2,10 @@ require "strscan" -require_relative "delete_suffix" require_relative "input_record_separator" -require_relative "match_p" require_relative "row" require_relative "table" -using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix) -using CSV::MatchP if CSV.const_defined?(:MatchP) - class CSV # Note: Don't use this class directly. This is an internal class. class Parser @@ -763,9 +758,10 @@ class CSV case headers when Array @raw_headers = headers + quoted_fields = [false] * @raw_headers.size @use_headers = true when String - @raw_headers = parse_headers(headers) + @raw_headers, quoted_fields = parse_headers(headers) @use_headers = true when nil, false @raw_headers = nil @@ -775,21 +771,28 @@ class CSV @use_headers = true end if @raw_headers - @headers = adjust_headers(@raw_headers) + @headers = adjust_headers(@raw_headers, quoted_fields) else @headers = nil end end def parse_headers(row) - CSV.parse_line(row, - col_sep: @column_separator, - row_sep: @row_separator, - quote_char: @quote_character) + quoted_fields = [] + converter = lambda do |field, info| + quoted_fields << info.quoted? + field + end + headers = CSV.parse_line(row, + col_sep: @column_separator, + row_sep: @row_separator, + quote_char: @quote_character, + converters: [converter]) + [headers, quoted_fields] end - def adjust_headers(headers) - adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno) + def adjust_headers(headers, quoted_fields) + adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno, quoted_fields) adjusted_headers.each {|h| h.freeze if h.is_a? String} adjusted_headers end @@ -933,9 +936,11 @@ class CSV if line.empty? next if @skip_blanks row = [] + quoted_fields = [] else line = strip_value(line) row = line.split(@split_column_separator, -1) + quoted_fields = [false] * row.size if @max_field_size row.each do |column| validate_field_size(column) @@ -949,7 +954,7 @@ class CSV end end @last_line = original_line - emit_row(row, &block) + emit_row(row, quoted_fields, &block) end end @@ -971,25 +976,30 @@ class CSV next end row = [] + quoted_fields = [] elsif line.include?(@cr) or line.include?(@lf) @scanner.keep_back @need_robust_parsing = true return parse_quotable_robust(&block) else row = line.split(@split_column_separator, -1) + quoted_fields = [] n_columns = row.size i = 0 while i < n_columns column = row[i] if column.empty? + quoted_fields << false row[i] = nil else n_quotes = column.count(@quote_character) if n_quotes.zero? + quoted_fields << false # no quote elsif n_quotes == 2 and column.start_with?(@quote_character) and column.end_with?(@quote_character) + quoted_fields << true row[i] = column[1..-2] else @scanner.keep_back @@ -1004,13 +1014,14 @@ class CSV @scanner.keep_drop @scanner.keep_start @last_line = original_line - emit_row(row, &block) + emit_row(row, quoted_fields, &block) end @scanner.keep_drop end def parse_quotable_robust(&block) row = [] + quoted_fields = [] skip_needless_lines start_row while true @@ -1024,20 +1035,24 @@ class CSV end if parse_column_end row << value + quoted_fields << @quoted_column_value elsif parse_row_end if row.empty? and value.nil? - emit_row([], &block) unless @skip_blanks + emit_row([], [], &block) unless @skip_blanks else row << value - emit_row(row, &block) + quoted_fields << @quoted_column_value + emit_row(row, quoted_fields, &block) row = [] + quoted_fields = [] end skip_needless_lines start_row elsif @scanner.eos? break if row.empty? and value.nil? row << value - emit_row(row, &block) + quoted_fields << @quoted_column_value + emit_row(row, quoted_fields, &block) break else if @quoted_column_value @@ -1141,7 +1156,7 @@ class CSV if (n_quotes % 2).zero? quotes[0, (n_quotes - 2) / 2] else - value = quotes[0, (n_quotes - 1) / 2] + value = quotes[0, n_quotes / 2] while true quoted_value = @scanner.scan_all(@quoted_value) value << quoted_value if quoted_value @@ -1165,11 +1180,9 @@ class CSV n_quotes = quotes.size if n_quotes == 1 break - elsif (n_quotes % 2) == 1 - value << quotes[0, (n_quotes - 1) / 2] - break else value << quotes[0, n_quotes / 2] + break if (n_quotes % 2) == 1 end end value @@ -1205,18 +1218,15 @@ class CSV def strip_value(value) return value unless @strip - return nil if value.nil? + return value if value.nil? case @strip when String - size = value.size - while value.start_with?(@strip) - size -= 1 - value = value[1, size] + while value.delete_prefix!(@strip) + # do nothing end - while value.end_with?(@strip) - size -= 1 - value = value[0, size] + while value.delete_suffix!(@strip) + # do nothing end else value.strip! @@ -1239,22 +1249,22 @@ class CSV @scanner.keep_start end - def emit_row(row, &block) + def emit_row(row, quoted_fields, &block) @lineno += 1 raw_row = row if @use_headers if @headers.nil? - @headers = adjust_headers(row) + @headers = adjust_headers(row, quoted_fields) return unless @return_headers row = Row.new(@headers, row, true) else row = Row.new(@headers, - @fields_converter.convert(raw_row, @headers, @lineno)) + @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields)) end else # convert fields, if needed... - row = @fields_converter.convert(raw_row, nil, @lineno) + row = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields) end # inject unconverted fields and accessor, if requested... diff --git a/lib/csv/row.rb b/lib/csv/row.rb index 62e429fc6e..500adb1882 100644 --- a/lib/csv/row.rb +++ b/lib/csv/row.rb @@ -703,7 +703,7 @@ class CSV # by +index_or_header+ and +specifiers+. # # The nested objects may be instances of various classes. - # See {Dig Methods}[https://docs.ruby-lang.org/en/master/doc/dig_methods_rdoc.html]. + # See {Dig Methods}[https://docs.ruby-lang.org/en/master/dig_methods_rdoc.html]. # # Examples: # source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n" diff --git a/lib/csv/version.rb b/lib/csv/version.rb index edab1c31e0..eaddde9a23 100644 --- a/lib/csv/version.rb +++ b/lib/csv/version.rb @@ -2,5 +2,5 @@ class CSV # The version of the installed library. - VERSION = "3.2.3" + VERSION = "3.2.4" end diff --git a/lib/csv/writer.rb b/lib/csv/writer.rb index 4a9a35c5af..030a295bc9 100644 --- a/lib/csv/writer.rb +++ b/lib/csv/writer.rb @@ -1,11 +1,8 @@ # frozen_string_literal: true require_relative "input_record_separator" -require_relative "match_p" require_relative "row" -using CSV::MatchP if CSV.const_defined?(:MatchP) - class CSV # Note: Don't use this class directly. This is an internal class. class Writer @@ -42,7 +39,10 @@ class CSV @headers ||= row if @use_headers @lineno += 1 - row = @fields_converter.convert(row, nil, lineno) if @fields_converter + if @fields_converter + quoted_fields = [false] * row.size + row = @fields_converter.convert(row, nil, lineno, quoted_fields) + end i = -1 converted_row = row.collect do |field| @@ -97,7 +97,7 @@ class CSV return unless @headers converter = @options[:header_fields_converter] - @headers = converter.convert(@headers, nil, 0) + @headers = converter.convert(@headers, nil, 0, []) @headers.each do |header| header.freeze if header.is_a?(String) end diff --git a/test/csv/parse/test_convert.rb b/test/csv/parse/test_convert.rb index 21d9f20b28..2ac255695e 100644 --- a/test/csv/parse/test_convert.rb +++ b/test/csv/parse/test_convert.rb @@ -107,4 +107,63 @@ class TestCSVParseConvert < Test::Unit::TestCase assert_equal([nil, "empty", "a"], CSV.parse_line(',"",a', empty_value: "empty")) end + + sub_test_case("#quoted?") do + def setup + @preserving_converter = lambda do |field, info| + f = field.encode(CSV::ConverterEncoding) + return f if info.quoted? + begin + Integer(f, 10) + rescue + f + end + end + + @quoted_header_converter = lambda do |field, info| + f = field.encode(CSV::ConverterEncoding) + return f if info.quoted? + f.to_sym + end + end + + def test_parse_line + row = CSV.parse_line('1,"2",3', converters: @preserving_converter) + assert_equal([1, "2", 3], row) + end + + def test_parse + expected = [["quoted", "unquoted"], ["109", 1], ["10A", 2]] + rows = CSV.parse(<<~CSV, converters: @preserving_converter) + "quoted",unquoted + "109",1 + "10A",2 + CSV + assert_equal(expected, rows) + end + + def test_alternating_quote + row = CSV.parse_line('"1",2,"3"', converters: @preserving_converter) + assert_equal(['1', 2, '3'], row) + end + + def test_parse_headers + expected = [["quoted", :unquoted], ["109", "1"], ["10A", "2"]] + table = CSV.parse(<<~CSV, headers: true, header_converters: @quoted_header_converter) + "quoted",unquoted + "109",1 + "10A",2 + CSV + assert_equal(expected, table.to_a) + end + + def test_parse_with_string_headers + expected = [["quoted", :unquoted], %w[109 1], %w[10A 2]] + table = CSV.parse(<<~CSV, headers: '"quoted",unquoted', header_converters: @quoted_header_converter) + "109",1 + "10A",2 + CSV + assert_equal(expected, table.to_a) + end + end end diff --git a/test/csv/test_data_converters.rb b/test/csv/test_data_converters.rb index 1620e077be..c20a5d1f4b 100644 --- a/test/csv/test_data_converters.rb +++ b/test/csv/test_data_converters.rb @@ -103,4 +103,88 @@ class TestCSVDataConverters < Test::Unit::TestCase assert_equal(datetime, CSV::Converters[:date_time][iso8601_string]) end + + def test_builtin_date_time_converter_rfc3339_minute + rfc3339_string = "2018-01-14 22:25" + datetime = DateTime.new(2018, 1, 14, 22, 25) + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_second + rfc3339_string = "2018-01-14 22:25:19" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19) + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_under_second + rfc3339_string = "2018-01-14 22:25:19.1" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19.1) + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_under_second_offset + rfc3339_string = "2018-01-14 22:25:19.1+09:00" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19.1, "+9") + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_offset + rfc3339_string = "2018-01-14 22:25:19+09:00" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19, "+9") + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_utc + rfc3339_string = "2018-01-14 22:25:19Z" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19) + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_tab_minute + rfc3339_string = "2018-01-14\t22:25" + datetime = DateTime.new(2018, 1, 14, 22, 25) + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_tab_second + rfc3339_string = "2018-01-14\t22:25:19" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19) + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_tab_under_second + rfc3339_string = "2018-01-14\t22:25:19.1" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19.1) + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_tab_under_second_offset + rfc3339_string = "2018-01-14\t22:25:19.1+09:00" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19.1, "+9") + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_tab_offset + rfc3339_string = "2018-01-14\t22:25:19+09:00" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19, "+9") + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end + + def test_builtin_date_time_converter_rfc3339_tab_utc + rfc3339_string = "2018-01-14\t22:25:19Z" + datetime = DateTime.new(2018, 1, 14, 22, 25, 19) + assert_equal(datetime, + CSV::Converters[:date_time][rfc3339_string]) + end end diff --git a/test/csv/test_encodings.rb b/test/csv/test_encodings.rb index 8d228c05f3..f08d551f69 100644 --- a/test/csv/test_encodings.rb +++ b/test/csv/test_encodings.rb @@ -288,6 +288,37 @@ class TestCSVEncodings < Test::Unit::TestCase error.message) end + def test_string_input_transcode + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + # U+3046 HIRAGANA LETTER U + value = "\u3042\u3044\u3046" + csv = CSV.new(value, encoding: "UTF-8:EUC-JP") + assert_equal([[value.encode("EUC-JP")]], + csv.read) + end + + def test_string_input_set_encoding_string + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + # U+3046 HIRAGANA LETTER U + value = "\u3042\u3044\u3046".encode("EUC-JP") + csv = CSV.new(value.dup.force_encoding("UTF-8"), encoding: "EUC-JP") + assert_equal([[value.encode("EUC-JP")]], + csv.read) + end + + def test_string_input_set_encoding_encoding + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + # U+3046 HIRAGANA LETTER U + value = "\u3042\u3044\u3046".encode("EUC-JP") + csv = CSV.new(value.dup.force_encoding("UTF-8"), + encoding: Encoding.find("EUC-JP")) + assert_equal([[value.encode("EUC-JP")]], + csv.read) + end + private def assert_parses(fields, encoding, **options)