Merge csv-3.2.4

This commit is contained in:
Hiroshi SHIBATA 2022-08-26 14:53:21 +09:00 committed by nagachika
parent c69fffe67d
commit cd0c2a67c4
9 changed files with 244 additions and 51 deletions

View file

@ -95,14 +95,11 @@ require "stringio"
require_relative "csv/fields_converter" require_relative "csv/fields_converter"
require_relative "csv/input_record_separator" require_relative "csv/input_record_separator"
require_relative "csv/match_p"
require_relative "csv/parser" require_relative "csv/parser"
require_relative "csv/row" require_relative "csv/row"
require_relative "csv/table" require_relative "csv/table"
require_relative "csv/writer" require_relative "csv/writer"
using CSV::MatchP if CSV.const_defined?(:MatchP)
# == \CSV # == \CSV
# #
# === In a Hurry? # === In a Hurry?
@ -866,8 +863,9 @@ class CSV
# <b><tt>index</tt></b>:: The zero-based index of the field in its row. # <b><tt>index</tt></b>:: The zero-based index of the field in its row.
# <b><tt>line</tt></b>:: The line of the data source this row is from. # <b><tt>line</tt></b>:: The line of the data source this row is from.
# <b><tt>header</tt></b>:: The header for the column, when available. # <b><tt>header</tt></b>:: The header for the column, when available.
# <b><tt>quoted?</tt></b>:: True or false, whether the original value is quoted or not.
# #
FieldInfo = Struct.new(:index, :line, :header) FieldInfo = Struct.new(:index, :line, :header, :quoted?)
# A Regexp used to find and convert some common Date formats. # A Regexp used to find and convert some common Date formats.
DateMatcher = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2},?\s+\d{2,4} | DateMatcher = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2},?\s+\d{2,4} |
@ -875,10 +873,9 @@ class CSV
# A Regexp used to find and convert some common DateTime formats. # A Regexp used to find and convert some common DateTime formats.
DateTimeMatcher = DateTimeMatcher =
/ \A(?: (\w+,?\s+)?\w+\s+\d{1,2}\s+\d{1,2}:\d{1,2}:\d{1,2},?\s+\d{2,4} | / \A(?: (\w+,?\s+)?\w+\s+\d{1,2}\s+\d{1,2}:\d{1,2}:\d{1,2},?\s+\d{2,4} |
\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2} | # ISO-8601 and RFC-3339 (space instead of T) recognized by DateTime.parse
# ISO-8601
\d{4}-\d{2}-\d{2} \d{4}-\d{2}-\d{2}
(?:T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?(?:[+-]\d{2}(?::\d{2})|Z)?)?)? (?:[T\s]\d{2}:\d{2}(?::\d{2}(?:\.\d+)?(?:[+-]\d{2}(?::\d{2})|Z)?)?)?
)\z /x )\z /x
# The encoding used by all converters. # The encoding used by all converters.
@ -1893,8 +1890,19 @@ class CSV
raise ArgumentError.new("Cannot parse nil as CSV") if data.nil? raise ArgumentError.new("Cannot parse nil as CSV") if data.nil?
if data.is_a?(String) if data.is_a?(String)
if encoding
if encoding.is_a?(String)
data_external_encoding, data_internal_encoding = encoding.split(":", 2)
if data_internal_encoding
data = data.encode(data_internal_encoding, data_external_encoding)
else
data = data.dup.force_encoding(data_external_encoding)
end
else
data = data.dup.force_encoding(encoding)
end
end
@io = StringIO.new(data) @io = StringIO.new(data)
@io.set_encoding(encoding || data.encoding)
else else
@io = data @io = data
end end

View file

@ -44,7 +44,7 @@ class CSV
@converters.empty? @converters.empty?
end end
def convert(fields, headers, lineno) def convert(fields, headers, lineno, quoted_fields)
return fields unless need_convert? return fields unless need_convert?
fields.collect.with_index do |field, index| fields.collect.with_index do |field, index|
@ -63,7 +63,8 @@ class CSV
else else
header = nil header = nil
end end
field = converter[field, FieldInfo.new(index, lineno, header)] quoted = quoted_fields[index]
field = converter[field, FieldInfo.new(index, lineno, header, quoted)]
end end
break unless field.is_a?(String) # short-circuit pipeline for speed break unless field.is_a?(String) # short-circuit pipeline for speed
end end

View file

@ -2,15 +2,10 @@
require "strscan" require "strscan"
require_relative "delete_suffix"
require_relative "input_record_separator" require_relative "input_record_separator"
require_relative "match_p"
require_relative "row" require_relative "row"
require_relative "table" require_relative "table"
using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
using CSV::MatchP if CSV.const_defined?(:MatchP)
class CSV class CSV
# Note: Don't use this class directly. This is an internal class. # Note: Don't use this class directly. This is an internal class.
class Parser class Parser
@ -763,9 +758,10 @@ class CSV
case headers case headers
when Array when Array
@raw_headers = headers @raw_headers = headers
quoted_fields = [false] * @raw_headers.size
@use_headers = true @use_headers = true
when String when String
@raw_headers = parse_headers(headers) @raw_headers, quoted_fields = parse_headers(headers)
@use_headers = true @use_headers = true
when nil, false when nil, false
@raw_headers = nil @raw_headers = nil
@ -775,21 +771,28 @@ class CSV
@use_headers = true @use_headers = true
end end
if @raw_headers if @raw_headers
@headers = adjust_headers(@raw_headers) @headers = adjust_headers(@raw_headers, quoted_fields)
else else
@headers = nil @headers = nil
end end
end end
def parse_headers(row) def parse_headers(row)
CSV.parse_line(row, quoted_fields = []
converter = lambda do |field, info|
quoted_fields << info.quoted?
field
end
headers = CSV.parse_line(row,
col_sep: @column_separator, col_sep: @column_separator,
row_sep: @row_separator, row_sep: @row_separator,
quote_char: @quote_character) quote_char: @quote_character,
converters: [converter])
[headers, quoted_fields]
end end
def adjust_headers(headers) def adjust_headers(headers, quoted_fields)
adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno) adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno, quoted_fields)
adjusted_headers.each {|h| h.freeze if h.is_a? String} adjusted_headers.each {|h| h.freeze if h.is_a? String}
adjusted_headers adjusted_headers
end end
@ -933,9 +936,11 @@ class CSV
if line.empty? if line.empty?
next if @skip_blanks next if @skip_blanks
row = [] row = []
quoted_fields = []
else else
line = strip_value(line) line = strip_value(line)
row = line.split(@split_column_separator, -1) row = line.split(@split_column_separator, -1)
quoted_fields = [false] * row.size
if @max_field_size if @max_field_size
row.each do |column| row.each do |column|
validate_field_size(column) validate_field_size(column)
@ -949,7 +954,7 @@ class CSV
end end
end end
@last_line = original_line @last_line = original_line
emit_row(row, &block) emit_row(row, quoted_fields, &block)
end end
end end
@ -971,25 +976,30 @@ class CSV
next next
end end
row = [] row = []
quoted_fields = []
elsif line.include?(@cr) or line.include?(@lf) elsif line.include?(@cr) or line.include?(@lf)
@scanner.keep_back @scanner.keep_back
@need_robust_parsing = true @need_robust_parsing = true
return parse_quotable_robust(&block) return parse_quotable_robust(&block)
else else
row = line.split(@split_column_separator, -1) row = line.split(@split_column_separator, -1)
quoted_fields = []
n_columns = row.size n_columns = row.size
i = 0 i = 0
while i < n_columns while i < n_columns
column = row[i] column = row[i]
if column.empty? if column.empty?
quoted_fields << false
row[i] = nil row[i] = nil
else else
n_quotes = column.count(@quote_character) n_quotes = column.count(@quote_character)
if n_quotes.zero? if n_quotes.zero?
quoted_fields << false
# no quote # no quote
elsif n_quotes == 2 and elsif n_quotes == 2 and
column.start_with?(@quote_character) and column.start_with?(@quote_character) and
column.end_with?(@quote_character) column.end_with?(@quote_character)
quoted_fields << true
row[i] = column[1..-2] row[i] = column[1..-2]
else else
@scanner.keep_back @scanner.keep_back
@ -1004,13 +1014,14 @@ class CSV
@scanner.keep_drop @scanner.keep_drop
@scanner.keep_start @scanner.keep_start
@last_line = original_line @last_line = original_line
emit_row(row, &block) emit_row(row, quoted_fields, &block)
end end
@scanner.keep_drop @scanner.keep_drop
end end
def parse_quotable_robust(&block) def parse_quotable_robust(&block)
row = [] row = []
quoted_fields = []
skip_needless_lines skip_needless_lines
start_row start_row
while true while true
@ -1024,20 +1035,24 @@ class CSV
end end
if parse_column_end if parse_column_end
row << value row << value
quoted_fields << @quoted_column_value
elsif parse_row_end elsif parse_row_end
if row.empty? and value.nil? if row.empty? and value.nil?
emit_row([], &block) unless @skip_blanks emit_row([], [], &block) unless @skip_blanks
else else
row << value row << value
emit_row(row, &block) quoted_fields << @quoted_column_value
emit_row(row, quoted_fields, &block)
row = [] row = []
quoted_fields = []
end end
skip_needless_lines skip_needless_lines
start_row start_row
elsif @scanner.eos? elsif @scanner.eos?
break if row.empty? and value.nil? break if row.empty? and value.nil?
row << value row << value
emit_row(row, &block) quoted_fields << @quoted_column_value
emit_row(row, quoted_fields, &block)
break break
else else
if @quoted_column_value if @quoted_column_value
@ -1141,7 +1156,7 @@ class CSV
if (n_quotes % 2).zero? if (n_quotes % 2).zero?
quotes[0, (n_quotes - 2) / 2] quotes[0, (n_quotes - 2) / 2]
else else
value = quotes[0, (n_quotes - 1) / 2] value = quotes[0, n_quotes / 2]
while true while true
quoted_value = @scanner.scan_all(@quoted_value) quoted_value = @scanner.scan_all(@quoted_value)
value << quoted_value if quoted_value value << quoted_value if quoted_value
@ -1165,11 +1180,9 @@ class CSV
n_quotes = quotes.size n_quotes = quotes.size
if n_quotes == 1 if n_quotes == 1
break break
elsif (n_quotes % 2) == 1
value << quotes[0, (n_quotes - 1) / 2]
break
else else
value << quotes[0, n_quotes / 2] value << quotes[0, n_quotes / 2]
break if (n_quotes % 2) == 1
end end
end end
value value
@ -1205,18 +1218,15 @@ class CSV
def strip_value(value) def strip_value(value)
return value unless @strip return value unless @strip
return nil if value.nil? return value if value.nil?
case @strip case @strip
when String when String
size = value.size while value.delete_prefix!(@strip)
while value.start_with?(@strip) # do nothing
size -= 1
value = value[1, size]
end end
while value.end_with?(@strip) while value.delete_suffix!(@strip)
size -= 1 # do nothing
value = value[0, size]
end end
else else
value.strip! value.strip!
@ -1239,22 +1249,22 @@ class CSV
@scanner.keep_start @scanner.keep_start
end end
def emit_row(row, &block) def emit_row(row, quoted_fields, &block)
@lineno += 1 @lineno += 1
raw_row = row raw_row = row
if @use_headers if @use_headers
if @headers.nil? if @headers.nil?
@headers = adjust_headers(row) @headers = adjust_headers(row, quoted_fields)
return unless @return_headers return unless @return_headers
row = Row.new(@headers, row, true) row = Row.new(@headers, row, true)
else else
row = Row.new(@headers, row = Row.new(@headers,
@fields_converter.convert(raw_row, @headers, @lineno)) @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields))
end end
else else
# convert fields, if needed... # convert fields, if needed...
row = @fields_converter.convert(raw_row, nil, @lineno) row = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields)
end end
# inject unconverted fields and accessor, if requested... # inject unconverted fields and accessor, if requested...

View file

@ -703,7 +703,7 @@ class CSV
# by +index_or_header+ and +specifiers+. # by +index_or_header+ and +specifiers+.
# #
# The nested objects may be instances of various classes. # The nested objects may be instances of various classes.
# See {Dig Methods}[https://docs.ruby-lang.org/en/master/doc/dig_methods_rdoc.html]. # See {Dig Methods}[https://docs.ruby-lang.org/en/master/dig_methods_rdoc.html].
# #
# Examples: # Examples:
# source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n" # source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"

View file

@ -2,5 +2,5 @@
class CSV class CSV
# The version of the installed library. # The version of the installed library.
VERSION = "3.2.3" VERSION = "3.2.4"
end end

View file

@ -1,11 +1,8 @@
# frozen_string_literal: true # frozen_string_literal: true
require_relative "input_record_separator" require_relative "input_record_separator"
require_relative "match_p"
require_relative "row" require_relative "row"
using CSV::MatchP if CSV.const_defined?(:MatchP)
class CSV class CSV
# Note: Don't use this class directly. This is an internal class. # Note: Don't use this class directly. This is an internal class.
class Writer class Writer
@ -42,7 +39,10 @@ class CSV
@headers ||= row if @use_headers @headers ||= row if @use_headers
@lineno += 1 @lineno += 1
row = @fields_converter.convert(row, nil, lineno) if @fields_converter if @fields_converter
quoted_fields = [false] * row.size
row = @fields_converter.convert(row, nil, lineno, quoted_fields)
end
i = -1 i = -1
converted_row = row.collect do |field| converted_row = row.collect do |field|
@ -97,7 +97,7 @@ class CSV
return unless @headers return unless @headers
converter = @options[:header_fields_converter] converter = @options[:header_fields_converter]
@headers = converter.convert(@headers, nil, 0) @headers = converter.convert(@headers, nil, 0, [])
@headers.each do |header| @headers.each do |header|
header.freeze if header.is_a?(String) header.freeze if header.is_a?(String)
end end

View file

@ -107,4 +107,63 @@ class TestCSVParseConvert < Test::Unit::TestCase
assert_equal([nil, "empty", "a"], assert_equal([nil, "empty", "a"],
CSV.parse_line(',"",a', empty_value: "empty")) CSV.parse_line(',"",a', empty_value: "empty"))
end end
sub_test_case("#quoted?") do
  # Builds the two converters shared by the tests below. Both return the
  # re-encoded field untouched when the field info reports it as quoted.
  def setup
    # Field converter: an unquoted field is passed through Integer(..., 10);
    # if that raises, the re-encoded string is kept.
    @preserving_converter = lambda do |field, info|
      text = field.encode(CSV::ConverterEncoding)
      if info.quoted?
        text
      else
        begin
          Integer(text, 10)
        rescue
          text
        end
      end
    end

    # Header converter: an unquoted header is turned into a Symbol.
    @quoted_header_converter = lambda do |field, info|
      text = field.encode(CSV::ConverterEncoding)
      info.quoted? ? text : text.to_sym
    end
  end

  # A single line: only the quoted middle field stays a String.
  def test_parse_line
    assert_equal([1, "2", 3],
                 CSV.parse_line('1,"2",3', converters: @preserving_converter))
  end

  # Several lines: the quoted first column stays textual on every row.
  def test_parse
    rows = CSV.parse(<<~CSV, converters: @preserving_converter)
      "quoted",unquoted
      "109",1
      "10A",2
    CSV
    assert_equal([["quoted", "unquoted"], ["109", 1], ["10A", 2]], rows)
  end

  # Quoted and unquoted fields alternating within one line.
  def test_alternating_quote
    assert_equal(['1', 2, '3'],
                 CSV.parse_line('"1",2,"3"', converters: @preserving_converter))
  end

  # Headers taken from the first row of the data.
  def test_parse_headers
    table = CSV.parse(<<~CSV, headers: true, header_converters: @quoted_header_converter)
      "quoted",unquoted
      "109",1
      "10A",2
    CSV
    assert_equal([["quoted", :unquoted], ["109", "1"], ["10A", "2"]],
                 table.to_a)
  end

  # Headers supplied as a CSV-formatted String option.
  def test_parse_with_string_headers
    table = CSV.parse(<<~CSV, headers: '"quoted",unquoted', header_converters: @quoted_header_converter)
      "109",1
      "10A",2
    CSV
    assert_equal([["quoted", :unquoted], %w[109 1], %w[10A 2]],
                 table.to_a)
  end
end
end end

View file

@ -103,4 +103,88 @@ class TestCSVDataConverters < Test::Unit::TestCase
assert_equal(datetime, assert_equal(datetime,
CSV::Converters[:date_time][iso8601_string]) CSV::Converters[:date_time][iso8601_string])
end end
# Space-separated date and time with minute precision.
def test_builtin_date_time_converter_rfc3339_minute
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25)
  assert_equal(expected, to_date_time.call("2018-01-14 22:25"))
end
# Space-separated date and time with second precision.
def test_builtin_date_time_converter_rfc3339_second
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19)
  assert_equal(expected, to_date_time.call("2018-01-14 22:25:19"))
end
# Space-separated date and time with a fractional second.
def test_builtin_date_time_converter_rfc3339_under_second
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19.1)
  assert_equal(expected, to_date_time.call("2018-01-14 22:25:19.1"))
end
# Space-separated date and time with a fractional second and a UTC offset.
def test_builtin_date_time_converter_rfc3339_under_second_offset
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19.1, "+9")
  assert_equal(expected, to_date_time.call("2018-01-14 22:25:19.1+09:00"))
end
# Space-separated date and time with a UTC offset.
def test_builtin_date_time_converter_rfc3339_offset
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19, "+9")
  assert_equal(expected, to_date_time.call("2018-01-14 22:25:19+09:00"))
end
# Space-separated date and time with the "Z" designator.
def test_builtin_date_time_converter_rfc3339_utc
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19)
  assert_equal(expected, to_date_time.call("2018-01-14 22:25:19Z"))
end
# Tab-separated date and time with minute precision.
def test_builtin_date_time_converter_rfc3339_tab_minute
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25)
  assert_equal(expected, to_date_time.call("2018-01-14\t22:25"))
end
# Tab-separated date and time with second precision.
def test_builtin_date_time_converter_rfc3339_tab_second
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19)
  assert_equal(expected, to_date_time.call("2018-01-14\t22:25:19"))
end
# Tab-separated date and time with a fractional second.
def test_builtin_date_time_converter_rfc3339_tab_under_second
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19.1)
  assert_equal(expected, to_date_time.call("2018-01-14\t22:25:19.1"))
end
# Tab-separated date and time with a fractional second and a UTC offset.
def test_builtin_date_time_converter_rfc3339_tab_under_second_offset
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19.1, "+9")
  assert_equal(expected, to_date_time.call("2018-01-14\t22:25:19.1+09:00"))
end
# Tab-separated date and time with a UTC offset.
def test_builtin_date_time_converter_rfc3339_tab_offset
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19, "+9")
  assert_equal(expected, to_date_time.call("2018-01-14\t22:25:19+09:00"))
end
# Tab-separated date and time with the "Z" designator.
def test_builtin_date_time_converter_rfc3339_tab_utc
  to_date_time = CSV::Converters.fetch(:date_time)
  expected = DateTime.new(2018, 1, 14, 22, 25, 19)
  assert_equal(expected, to_date_time.call("2018-01-14\t22:25:19Z"))
end
end end

View file

@ -288,6 +288,37 @@ class TestCSVEncodings < Test::Unit::TestCase
error.message) error.message)
end end
# A String input with encoding: "UTF-8:EUC-JP" is read back as EUC-JP.
def test_string_input_transcode
  # HIRAGANA LETTER A (U+3042), I (U+3044), U (U+3046)
  hiragana = "\u3042\u3044\u3046"
  rows = CSV.new(hiragana, encoding: "UTF-8:EUC-JP").read
  assert_equal([[hiragana.encode("EUC-JP")]], rows)
end
# EUC-JP bytes tagged as UTF-8 are read as EUC-JP when the encoding
# option is given as a String.
def test_string_input_set_encoding_string
  # HIRAGANA LETTER A (U+3042), I (U+3044), U (U+3046)
  euc_jp_text = "\u3042\u3044\u3046".encode("EUC-JP")
  mislabeled = euc_jp_text.dup.force_encoding("UTF-8")
  rows = CSV.new(mislabeled, encoding: "EUC-JP").read
  assert_equal([[euc_jp_text.encode("EUC-JP")]], rows)
end
# EUC-JP bytes tagged as UTF-8 are read as EUC-JP when the encoding
# option is given as an Encoding object.
def test_string_input_set_encoding_encoding
  # HIRAGANA LETTER A (U+3042), I (U+3044), U (U+3046)
  euc_jp_text = "\u3042\u3044\u3046".encode("EUC-JP")
  mislabeled = euc_jp_text.dup.force_encoding("UTF-8")
  rows = CSV.new(mislabeled, encoding: Encoding.find("EUC-JP")).read
  assert_equal([[euc_jp_text.encode("EUC-JP")]], rows)
end
private private
def assert_parses(fields, encoding, **options) def assert_parses(fields, encoding, **options)