ruby/lib/irb/ruby-lex.rb
knu 859a7a9277 Merge from irb 0.7.3.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@1341 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2001-04-30 17:54:55 +00:00

966 lines
17 KiB
Ruby
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#
# irb/ruby-lex.rb - ruby lexcal analizer
# $Release Version: 0.7.3$
# $Revision$
# $Date$
# by Keiju ISHITSUKA(keiju@ishitsuka.com)
#
# --
#
#
#
require "e2mmap"
require "irb/slex"
require "irb/ruby-token"
class RubyLex
@RCS_ID='-$Id$-'
extend Exception2MessageMapper
def_exception(:AlreadyDefinedToken, "Already defined token(%s)")
def_exception(:TkReading2TokenNoKey, "key nothing(key='%s')")
def_exception(:TkSymbol2TokenNoKey, "key nothing(key='%s')")
def_exception(:TkReading2TokenDuplicateError,
"key duplicate(token_n='%s', key='%s')")
def_exception(:SyntaxError, "%s")
include RubyToken
class << self
attr_accessor :debug_level
def debug?
@debug_level > 0
end
end
@debug_level = 0
def initialize
lex_init
set_input(STDIN)
@seek = 0
@exp_line_no = @line_no = 1
@base_char_no = 0
@char_no = 0
@rests = []
@readed = []
@here_readed = []
@indent = 0
@skip_space = false
@readed_auto_clean_up = false
@exception_on_syntax_error = true
end
attr_accessor :skip_space
attr_accessor :readed_auto_clean_up
attr_accessor :exception_on_syntax_error
attr_reader :seek
attr_reader :char_no
attr_reader :line_no
attr_reader :indent
# io functions
def set_input(io, p = nil)
@io = io
if p.kind_of?(Proc)
@input = p
elsif iterator?
@input = proc
else
@input = proc{@io.gets}
end
end
def get_readed
if idx = @readed.reverse.index("\n")
@base_char_no = idx
else
@base_char_no += @readed.size
end
readed = @readed.join("")
@readed = []
readed
end
def getc
while @rests.empty?
return nil unless buf_input
end
c = @rests.shift
if @here_header
@here_readed.push c
else
@readed.push c
end
@seek += 1
if c == "\n"
@line_no += 1
@char_no = 0
else
@char_no += 1
end
c
end
def gets
l = ""
while c = getc
l.concat c
break if c == "\n"
end
l
end
def eof?
@io.eof?
end
def getc_of_rests
if @rests.empty?
nil
else
getc
end
end
def ungetc(c = nil)
if @here_readed.empty?
c2 = @readed.pop
else
c2 = @here_readed.pop
end
c = c2 unless c
@rests.unshift c #c =
@seek -= 1
if c == "\n"
@line_no -= 1
if idx = @readed.reverse.index("\n")
@char_no = @readed.size - idx
else
@char_no = @base_char_no + @readed.size
end
else
@char_no -= 1
end
end
def peek_equal?(str)
chrs = str.split(//)
until @rests.size >= chrs.size
return false unless buf_input
end
@rests[0, chrs.size] == chrs
end
def peek_match?(regexp)
while @rests.empty?
return false unless buf_input
end
regexp =~ @rests.join("")
end
def peek(i = 0)
while @rests.size <= i
return nil unless buf_input
end
@rests[i]
end
def buf_input
prompt
line = @input.call
return nil unless line
@rests.concat line.split(//)
true
end
private :buf_input
def set_prompt(p = proc)
if p.kind_of?(Proc)
@prompt = p
else
@prompt = proc{print p}
end
end
def prompt
if @prompt
@prompt.call(@ltype, @indent, @continue, @line_no)
end
end
def initialize_input
@ltype = nil
@quoted = nil
@indent = 0
@lex_state = EXPR_BEG
@space_seen = false
@here_header = false
@continue = false
prompt
@line = ""
@exp_line_no = @line_no
end
def each_top_level_statement
initialize_input
loop do
@continue = false
prompt
unless l = lex
break if @line == ''
else
# p l
@line.concat l
if @ltype or @continue or @indent > 0
next
end
end
if @line != "\n"
yield @line, @exp_line_no
end
break unless l
@line = ''
@exp_line_no = @line_no
@indent = 0
prompt
end
end
def lex
until (((tk = token).kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) &&
!@continue or
tk.nil?)
#p tk
#p self
end
line = get_readed
# print self.inspect
if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
nil
else
line
end
end
def token
# require "tracer"
# Tracer.on
@prev_seek = @seek
@prev_line_no = @line_no
@prev_char_no = @char_no
begin
begin
tk = @OP.match(self)
@space_seen = tk.kind_of?(TkSPACE)
rescue SyntaxError
abort if @exception_on_syntax_error
tk = TkError.new(@seek, @line_no, @char_no)
end
end while @skip_space and tk.kind_of?(TkSPACE)
if @readed_auto_clean_up
get_readed
end
# Tracer.off
tk
end
ENINDENT_CLAUSE = [
"case", "class", "def", "do", "for", "if",
"module", "unless", "until", "while", "begin" #, "when"
]
DEINDENT_CLAUSE = ["end" #, "when"
]
PERCENT_LTYPE = {
"q" => "\'",
"Q" => "\"",
"x" => "\`",
"r" => "\/",
"w" => "]"
}
PERCENT_PAREN = {
"{" => "}",
"[" => "]",
"<" => ">",
"(" => ")"
}
Ltype2Token = {
"\'" => TkSTRING,
"\"" => TkSTRING,
"\`" => TkXSTRING,
"\/" => TkREGEXP,
"]" => TkDSTRING
}
DLtype2Token = {
"\"" => TkDSTRING,
"\`" => TkDXSTRING,
"\/" => TkDREGEXP,
}
def lex_init()
@OP = SLex.new
@OP.def_rules("\0", "\004", "\032") do
Token(TkEND_OF_SCRIPT)
end
@OP.def_rules(" ", "\t", "\f", "\r", "\13") do
@space_seen = true
while getc =~ /[ \t\f\r\13]/; end
ungetc
Token(TkSPACE)
end
@OP.def_rule("#") do
|op, io|
identify_comment
end
@OP.def_rule("=begin", proc{@prev_char_no == 0 && peek(0) =~ /\s/}) do
|op, io|
@ltype = "="
until getc == "\n"; end
until peek_equal?("=end") && peek(4) =~ /\s/
until getc == "\n"; end
end
gets
@ltype = nil
Token(TkRD_COMMENT)
end
@OP.def_rule("\n") do
print "\\n\n" if RubyLex.debug?
case @lex_state
when EXPR_BEG, EXPR_FNAME, EXPR_DOT
@continue = true
else
@continue = false
@lex_state = EXPR_BEG
end
@here_header = false
@here_readed = []
Token(TkNL)
end
@OP.def_rules("*", "**",
"!", "!=", "!~",
"=", "==", "===",
"=~", "<=>",
"<", "<=",
">", ">=", ">>") do
|op, io|
@lex_state = EXPR_BEG
Token(op)
end
@OP.def_rules("<<") do
|op, io|
if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
(@lex_state != EXPR_ARG || @space_seen)
c = peek(0)
if /\S/ =~ c && (/["'`]/ =~ c || /[\w_]/ =~ c || c == "-")
tk = identify_here_document
end
else
tk = Token(op)
end
tk
end
@OP.def_rules("'", '"') do
|op, io|
identify_string(op)
end
@OP.def_rules("`") do
|op, io|
if @lex_state == EXPR_FNAME
Token(op)
else
identify_string(op)
end
end
@OP.def_rules('?') do
|op, io|
if @lex_state == EXPR_END
@lex_state = EXPR_BEG
Token(TkQUESTION)
else
ch = getc
if @lex_state == EXPR_ARG && ch !~ /\s/
ungetc
@lex_state = EXPR_BEG;
Token(TkQUESTION)
else
if (ch == '\\')
read_escape
end
@lex_state = EXPR_END
Token(TkINTEGER)
end
end
end
@OP.def_rules("&", "&&", "|", "||") do
|op, io|
@lex_state = EXPR_BEG
Token(op)
end
@OP.def_rules("+=", "-=", "*=", "**=",
"&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
|op, io|
@lex_state = EXPR_BEG
op =~ /^(.*)=$/
Token(TkOPASGN, $1)
end
@OP.def_rule("+@", proc{@lex_state == EXPR_FNAME}) do
Token(TkUPLUS)
end
@OP.def_rule("-@", proc{@lex_state == EXPR_FNAME}) do
Token(TkUMINUS)
end
@OP.def_rules("+", "-") do
|op, io|
catch(:RET) do
if @lex_state == EXPR_ARG
if @space_seen and peek(0) =~ /[0-9]/
throw :RET, identify_number
else
@lex_state = EXPR_BEG
end
elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
throw :RET, identify_number
else
@lex_state = EXPR_BEG
end
Token(op)
end
end
@OP.def_rule(".") do
@lex_state = EXPR_BEG
if peek(0) =~ /[0-9]/
ungetc
identify_number
else
# for obj.if
# (JP: obj.if $B$J$I$NBP1~(B)
@lex_state = EXPR_DOT
Token(TkDOT)
end
end
@OP.def_rules("..", "...") do
|op, io|
@lex_state = EXPR_BEG
Token(op)
end
lex_int2
end
def lex_int2
@OP.def_rules("]", "}", ")") do
|op, io|
@lex_state = EXPR_END
@indent -= 1
Token(op)
end
@OP.def_rule(":") do
if @lex_state == EXPR_END || peek(0) =~ /\s/
@lex_state = EXPR_BEG
Token(TkCOLON)
else
@lex_state = EXPR_FNAME;
Token(TkSYMBEG)
end
end
@OP.def_rule("::") do
# p @lex_state.id2name, @space_seen
if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
@lex_state = EXPR_BEG
Token(TkCOLON3)
else
@lex_state = EXPR_DOT
Token(TkCOLON2)
end
end
@OP.def_rule("/") do
|op, io|
if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
identify_string(op)
elsif peek(0) == '='
getc
@lex_state = EXPR_BEG
Token(TkOPASGN, :/) #/)
elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
identify_string(op)
else
@lex_state = EXPR_BEG
Token("/") #/)
end
end
@OP.def_rules("^") do
@lex_state = EXPR_BEG
Token("^")
end
# @OP.def_rules("^=") do
# @lex_state = EXPR_BEG
# Token(OP_ASGN, :^)
# end
@OP.def_rules(",", ";") do
|op, io|
@lex_state = EXPR_BEG
Token(op)
end
@OP.def_rule("~") do
@lex_state = EXPR_BEG
Token("~")
end
@OP.def_rule("~@", proc{@lex_state = EXPR_FNAME}) do
@lex_state = EXPR_BEG
Token("~")
end
@OP.def_rule("(") do
@indent += 1
if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
@lex_state = EXPR_BEG
Token(TkfLPAREN)
else
@lex_state = EXPR_BEG
Token(TkLPAREN)
end
end
@OP.def_rule("[]", proc{@lex_state == EXPR_FNAME}) do
Token("[]")
end
@OP.def_rule("[]=", proc{@lex_state == EXPR_FNAME}) do
Token("[]=")
end
@OP.def_rule("[") do
@indent += 1
if @lex_state == EXPR_FNAME
Token(TkfLBRACK)
else
if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
t = Token(TkLBRACK)
elsif @lex_state == EXPR_ARG && @space_seen
t = Token(TkLBRACK)
else
t = Token(TkfLBRACK)
end
@lex_state = EXPR_BEG
t
end
end
@OP.def_rule("{") do
@indent += 1
if @lex_state != EXPR_END && @lex_state != EXPR_ARG
t = Token(TkLBRACE)
else
t = Token(TkfLBRACE)
end
@lex_state = EXPR_BEG
t
end
@OP.def_rule('\\') do
if getc == "\n"
@space_seen = true
@continue = true
Token(TkSPACE)
else
ungetc
Token("\\")
end
end
@OP.def_rule('%') do
|op, io|
if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
identify_quotation
elsif peek(0) == '='
getc
Token(TkOPASGN, :%)
elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
identify_quotation
else
@lex_state = EXPR_BEG
Token("%") #))
end
end
@OP.def_rule('$') do
identify_gvar
end
@OP.def_rule('@') do
if peek(0) =~ /[\w_]/
ungetc
identify_identifier
else
Token("@")
end
end
# @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
# |op, io|
# @indent += 1
# @lex_state = EXPR_FNAME
# # @lex_state = EXPR_END
# # until @rests[0] == "\n" or @rests[0] == ";"
# # rests.shift
# # end
# end
@OP.def_rule("") do
|op, io|
printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
if peek(0) =~ /[0-9]/
t = identify_number
elsif peek(0) =~ /[\w_]/
t = identify_identifier
end
printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
t
end
p @OP if RubyLex.debug?
end
def identify_gvar
@lex_state = EXPR_END
case ch = getc
when /[~_*$?!@\/\\;,=:<>".]/ #"
Token(TkGVAR, "$" + ch)
when "-"
Token(TkGVAR, "$-" + getc)
when "&", "`", "'", "+"
Token(TkBACK_REF, "$"+ch)
when /[1-9]/
while getc =~ /[0-9]/; end
ungetc
Token(TkNTH_REF)
when /\w/
ungetc
ungetc
identify_identifier
else
ungetc
Token("$")
end
end
def identify_identifier
token = ""
token.concat getc if peek(0) =~ /[$@]/
while (ch = getc) =~ /\w|_/
print ":", ch, ":" if RubyLex.debug?
token.concat ch
end
ungetc
if ch == "!" or ch == "?"
token.concat getc
end
# almost fix token
# (JP: $BBgBN(Bfix token)
case token
when /^\$/
return Token(TkGVAR, token)
when /^\@/
@lex_state = EXPR_END
return Token(TkIVAR, token)
end
if @lex_state != EXPR_DOT
print token, "\n" if RubyLex.debug?
token_c, *trans = TkReading2Token[token]
if token_c
# reserved word?
# (JP: $BM=Ls8l$+$I$&$+(B?)
if (@lex_state != EXPR_BEG &&
@lex_state != EXPR_FNAME &&
trans[1])
# modifiers
# (JP: $B=$>~;R(B)
token_c = TkSymbol2Token[trans[1]]
@lex_state = trans[0]
else
if @lex_state != EXPR_FNAME
if ENINDENT_CLAUSE.include?(token)
@indent += 1
elsif DEINDENT_CLAUSE.include?(token)
@indent -= 1
end
@lex_state = trans[0]
else
@lex_state = EXPR_END
end
end
return Token(token_c, token)
end
end
if @lex_state == EXPR_FNAME
@lex_state = EXPR_END
if peek(0) == '='
token.concat getc
end
elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
@lex_state = EXPR_ARG
else
@lex_state = EXPR_END
end
if token[0, 1] =~ /[A-Z]/
return Token(TkCONSTANT, token)
elsif token[token.size - 1, 1] =~ /[!?]/
return Token(TkFID, token)
else
return Token(TkIDENTIFIER, token)
end
end
def identify_here_document
ch = getc
# if lt = PERCENT_LTYPE[ch]
if ch == "-"
ch = getc
indent = true
end
if /['"`]/ =~ ch
lt = ch
quoted = ""
while (c = getc) && c != lt
quoted.concat c
end
else
lt = '"'
quoted = ch.dup
while (c = getc) && c =~ /\w/
quoted.concat c
end
ungetc
end
ltback, @ltype = @ltype, lt
reserve = []
while ch = getc
reserve.push ch
if ch == "\\"
reserve.push ch = getc
elsif ch == "\n"
break
end
end
@here_header = false
while (l = gets.chomp) && (indent ? l.strip : l) != quoted
end
@here_header = true
@here_readed.concat reserve
while ch = reserve.pop
ungetc ch
end
@ltype = ltback
@lex_state = EXPR_END
Token(Ltype2Token[lt])
end
def identify_quotation
ch = getc
if lt = PERCENT_LTYPE[ch]
ch = getc
elsif ch =~ /\W/
lt = "\""
else
RubyLex.fail SyntaxError, "unknown type of %string"
end
# if ch !~ /\W/
# ungetc
# next
# end
#@ltype = lt
@quoted = ch unless @quoted = PERCENT_PAREN[ch]
identify_string(lt, @quoted)
end
def identify_number
@lex_state = EXPR_END
if ch = getc
if peek(0) == "x"
ch = getc
match = /[0-9a-f_]/
else
match = /[0-7_]/
end
while ch = getc
if ch !~ match
ungetc
break
end
end
return Token(TkINTEGER)
end
type = TkINTEGER
allow_point = true
allow_e = true
while ch = getc
case ch
when /[0-9_]/
when allow_point && "."
type = TkFLOAT
if peek(0) !~ /[0-9]/
ungetc
break
end
allow_point = false
when allow_e && "e", allow_e && "E"
type = TkFLOAT
if peek(0) =~ /[+-]/
getc
end
allow_e = false
allow_point = false
else
ungetc
break
end
end
Token(type)
end
def identify_string(ltype, quoted = ltype)
@ltype = ltype
@quoted = quoted
subtype = nil
begin
while ch = getc
if @quoted == ch
break
elsif @ltype != "'" && @ltype != "]" and ch == "#"
subtype = true
elsif ch == '\\' #'
read_escape
end
end
if @ltype == "/"
if peek(0) =~ /i|o|n|e|s/
getc
end
end
if subtype
Token(DLtype2Token[ltype])
else
Token(Ltype2Token[ltype])
end
ensure
@ltype = nil
@quoted = nil
@lex_state = EXPR_END
end
end
def identify_comment
@ltype = "#"
while ch = getc
if ch == "\\" #"
read_escape
end
if ch == "\n"
@ltype = nil
ungetc
break
end
end
return Token(TkCOMMENT)
end
def read_escape
case ch = getc
when "\n", "\r", "\f"
when "\\", "n", "t", "r", "f", "v", "a", "e", "b" #"
when /[0-7]/
ungetc ch
3.times do
case ch = getc
when /[0-7]/
when nil
break
else
ungetc
break
end
end
when "x"
2.times do
case ch = getc
when /[0-9a-fA-F]/
when nil
break
else
ungetc
break
end
end
when "M"
if (ch = getc) != '-'
ungetc
else
if (ch = getc) == "\\" #"
read_escape(chrs)
end
end
when "C", "c", "^"
if ch == "C" and (ch = getc) != "-"
ungetc
elsif (ch = getc) == "\\" #"
read_escape(chrs)
end
else
# other characters
#(JP:$B$=$NB>$NJ8;z(B)
end
end
end