Merge csv-3.2.3

This commit is contained in:
Hiroshi SHIBATA 2022-08-25 10:49:13 +09:00 committed by nagachika
parent a9bf13a4df
commit c69fffe67d
13 changed files with 1431 additions and 364 deletions

View file

@ -27,6 +27,10 @@ class CSV
class InvalidEncoding < StandardError
end
# Raised when unexpected case is happen.
class UnexpectedError < StandardError
end
#
# CSV::Scanner receives a CSV output, scans it and return the content.
# It also controls the life cycle of the object with its methods +keep_start+,
@ -78,10 +82,10 @@ class CSV
# +keep_end+, +keep_back+, +keep_drop+.
#
# CSV::InputsScanner.scan() tries to match with pattern at the current position.
# If there's a match, the scanner advances the “scan pointer” and returns the matched string.
# If there's a match, the scanner advances the "scan pointer" and returns the matched string.
# Otherwise, the scanner returns nil.
#
# CSV::InputsScanner.rest() returns the “rest” of the string (i.e. everything after the scan pointer).
# CSV::InputsScanner.rest() returns the "rest" of the string (i.e. everything after the scan pointer).
# If there is no more data (eos? = true), it returns "".
#
class InputsScanner
@ -96,11 +100,13 @@ class CSV
end
def each_line(row_separator)
return enum_for(__method__, row_separator) unless block_given?
buffer = nil
input = @scanner.rest
position = @scanner.pos
offset = 0
n_row_separator_chars = row_separator.size
# trace(__method__, :start, line, input)
while true
input.each_line(row_separator) do |line|
@scanner.pos += line.bytesize
@ -140,25 +146,28 @@ class CSV
end
def scan(pattern)
# trace(__method__, pattern, :start)
value = @scanner.scan(pattern)
# trace(__method__, pattern, :done, :last, value) if @last_scanner
return value if @last_scanner
if value
read_chunk if @scanner.eos?
return value
else
nil
end
read_chunk if value and @scanner.eos?
# trace(__method__, pattern, :done, value)
value
end
def scan_all(pattern)
# trace(__method__, pattern, :start)
value = @scanner.scan(pattern)
# trace(__method__, pattern, :done, :last, value) if @last_scanner
return value if @last_scanner
return nil if value.nil?
while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
# trace(__method__, pattern, :sub, sub_value)
value << sub_value
end
# trace(__method__, pattern, :done, value)
value
end
@ -167,68 +176,126 @@ class CSV
end
def keep_start
@keeps.push([@scanner.pos, nil])
# trace(__method__, :start)
adjust_last_keep
@keeps.push([@scanner, @scanner.pos, nil])
# trace(__method__, :done)
end
def keep_end
start, buffer = @keeps.pop
keep = @scanner.string.byteslice(start, @scanner.pos - start)
# trace(__method__, :start)
scanner, start, buffer = @keeps.pop
if scanner == @scanner
keep = @scanner.string.byteslice(start, @scanner.pos - start)
else
keep = @scanner.string.byteslice(0, @scanner.pos)
end
if buffer
buffer << keep
keep = buffer
end
# trace(__method__, :done, keep)
keep
end
def keep_back
start, buffer = @keeps.pop
# trace(__method__, :start)
scanner, start, buffer = @keeps.pop
if buffer
# trace(__method__, :rescan, start, buffer)
string = @scanner.string
keep = string.byteslice(start, string.bytesize - start)
if scanner == @scanner
keep = string.byteslice(start, string.bytesize - start)
else
keep = string
end
if keep and not keep.empty?
@inputs.unshift(StringIO.new(keep))
@last_scanner = false
end
@scanner = StringScanner.new(buffer)
else
if @scanner != scanner
message = "scanners are different but no buffer: "
message += "#{@scanner.inspect}(#{@scanner.object_id}): "
message += "#{scanner.inspect}(#{scanner.object_id})"
raise UnexpectedError, message
end
# trace(__method__, :repos, start, buffer)
@scanner.pos = start
end
read_chunk if @scanner.eos?
end
def keep_drop
@keeps.pop
_, _, buffer = @keeps.pop
# trace(__method__, :done, :empty) unless buffer
return unless buffer
last_keep = @keeps.last
# trace(__method__, :done, :no_last_keep) unless last_keep
return unless last_keep
if last_keep[2]
last_keep[2] << buffer
else
last_keep[2] = buffer
end
# trace(__method__, :done)
end
def rest
@scanner.rest
end
def check(pattern)
@scanner.check(pattern)
end
private
def trace(*args)
pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps])
end
def adjust_last_keep
# trace(__method__, :start)
keep = @keeps.last
# trace(__method__, :done, :empty) if keep.nil?
return if keep.nil?
scanner, start, buffer = keep
string = @scanner.string
if @scanner != scanner
start = 0
end
if start == 0 and @scanner.eos?
keep_data = string
else
keep_data = string.byteslice(start, @scanner.pos - start)
end
if keep_data
if buffer
buffer << keep_data
else
keep[2] = keep_data.dup
end
end
# trace(__method__, :done)
end
def read_chunk
return false if @last_scanner
unless @keeps.empty?
keep = @keeps.last
keep_start = keep[0]
string = @scanner.string
keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
if keep_data
keep_buffer = keep[1]
if keep_buffer
keep_buffer << keep_data
else
keep[1] = keep_data.dup
end
end
keep[0] = 0
end
adjust_last_keep
input = @inputs.first
case input
when StringIO
string = input.read
raise InvalidEncoding unless string.valid_encoding?
# trace(__method__, :stringio, string)
@scanner = StringScanner.new(string)
@inputs.shift
@last_scanner = @inputs.empty?
@ -237,6 +304,7 @@ class CSV
chunk = input.gets(@row_separator, @chunk_size)
if chunk
raise InvalidEncoding unless chunk.valid_encoding?
# trace(__method__, :chunk, chunk)
@scanner = StringScanner.new(chunk)
if input.respond_to?(:eof?) and input.eof?
@inputs.shift
@ -244,6 +312,7 @@ class CSV
end
true
else
# trace(__method__, :no_chunk)
@scanner = StringScanner.new("".encode(@encoding))
@inputs.shift
@last_scanner = @inputs.empty?
@ -278,7 +347,11 @@ class CSV
end
def field_size_limit
@field_size_limit
@max_field_size&.succ
end
def max_field_size
@max_field_size
end
def skip_lines
@ -346,6 +419,16 @@ class CSV
end
message = "Invalid byte sequence in #{@encoding}"
raise MalformedCSVError.new(message, lineno)
rescue UnexpectedError => error
if @scanner
ignore_broken_line
lineno = @lineno
else
lineno = @lineno + 1
end
message = "This should not be happen: #{error.message}: "
message += "Please report this to https://github.com/ruby/csv/issues"
raise MalformedCSVError.new(message, lineno)
end
end
@ -390,7 +473,7 @@ class CSV
@backslash_quote = false
end
@unconverted_fields = @options[:unconverted_fields]
@field_size_limit = @options[:field_size_limit]
@max_field_size = @options[:max_field_size]
@skip_blanks = @options[:skip_blanks]
@fields_converter = @options[:fields_converter]
@header_fields_converter = @options[:header_fields_converter]
@ -729,28 +812,28 @@ class CSV
sample[0, 128].index(@quote_character)
end
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
class UnoptimizedStringIO
def initialize(string)
@io = StringIO.new(string, "rb:#{string.encoding}")
end
def gets(*args)
@io.gets(*args)
end
def each_line(*args, &block)
@io.each_line(*args, &block)
end
def eof?
@io.eof?
end
class UnoptimizedStringIO # :nodoc:
def initialize(string)
@io = StringIO.new(string, "rb:#{string.encoding}")
end
SCANNER_TEST_CHUNK_SIZE =
Integer((ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"), 10)
def gets(*args)
@io.gets(*args)
end
def each_line(*args, &block)
@io.each_line(*args, &block)
end
def eof?
@io.eof?
end
end
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"
SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
def build_scanner
inputs = @samples.collect do |sample|
UnoptimizedStringIO.new(sample)
@ -760,10 +843,17 @@ class CSV
else
inputs << @input
end
begin
chunk_size_value = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
rescue # Ractor::IsolationError
# Ractor on Ruby 3.0 can't read ENV value.
chunk_size_value = SCANNER_TEST_CHUNK_SIZE_VALUE
end
chunk_size = Integer((chunk_size_value || "1"), 10)
InputsScanner.new(inputs,
@encoding,
@row_separator,
chunk_size: SCANNER_TEST_CHUNK_SIZE)
chunk_size: chunk_size)
end
else
def build_scanner
@ -826,6 +916,14 @@ class CSV
end
end
def validate_field_size(field)
return unless @max_field_size
return if field.size <= @max_field_size
ignore_broken_line
message = "Field size exceeded: #{field.size} > #{@max_field_size}"
raise MalformedCSVError.new(message, @lineno)
end
def parse_no_quote(&block)
@scanner.each_line(@row_separator) do |line|
next if @skip_lines and skip_line?(line)
@ -838,6 +936,11 @@ class CSV
else
line = strip_value(line)
row = line.split(@split_column_separator, -1)
if @max_field_size
row.each do |column|
validate_field_size(column)
end
end
n_columns = row.size
i = 0
while i < n_columns
@ -893,6 +996,7 @@ class CSV
@need_robust_parsing = true
return parse_quotable_robust(&block)
end
validate_field_size(row[i])
end
i += 1
end
@ -916,10 +1020,7 @@ class CSV
value = parse_column_value
if value
@scanner.scan_all(@strip_value) if @strip_value
if @field_size_limit and value.size >= @field_size_limit
ignore_broken_line
raise MalformedCSVError.new("Field size exceeded", @lineno)
end
validate_field_size(value)
end
if parse_column_end
row << value
@ -940,8 +1041,14 @@ class CSV
break
else
if @quoted_column_value
if liberal_parsing? and (new_line = @scanner.check(@line_end))
message =
"Illegal end-of-line sequence outside of a quoted field " +
"<#{new_line.inspect}>"
else
message = "Any value after quoted field isn't allowed"
end
ignore_broken_line
message = "Any value after quoted field isn't allowed"
raise MalformedCSVError.new(message, @lineno)
elsif @unquoted_column_value and
(new_line = @scanner.scan(@line_end))