150 changes: 38 additions & 112 deletions lib/prism/lex_compat.rb
@@ -196,57 +196,6 @@ def deconstruct_keys(keys)
"__END__": :on___end__
}.freeze

# When we produce tokens, we produce the same arrays that Ripper does.
# However, we add a couple of convenience methods onto them to make them a
# little easier to work with. We delegate all other methods to the array.
class Token < BasicObject
# Create a new token object with the given ripper-compatible array.
def initialize(array)
@array = array
end

# The location of the token in the source.
def location
@array[0]
end

# The type of the token.
def event
@array[1]
end

# The slice of the source that this token represents.
def value
@array[2]
end

# The state of the lexer when this token was produced.
def state
@array[3]
end

# We want to pretend that this is just an Array.
def ==(other) # :nodoc:
@array == other
end

def respond_to_missing?(name, include_private = false) # :nodoc:
@array.respond_to?(name, include_private)
end

def method_missing(name, ...) # :nodoc:
@array.send(name, ...)
end
end

# Tokens where state should be ignored
# used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
class IgnoreStateToken < Token
def ==(other) # :nodoc:
self[0...-1] == other[0...-1]
end
end

# A heredoc in this case is a list of tokens that belong to the body of the
# heredoc that should be appended onto the list of tokens when the heredoc
# closes.
@@ -290,7 +239,7 @@ def to_a
embexpr_balance = 0

tokens.each_with_object([]) do |token, results| #$ Array[Token]
case token.event
case token[1]
when :on_embexpr_beg
embexpr_balance += 1
results << token
@@ -305,9 +254,9 @@
if split
# Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
# to keep the delimiter in the result.
token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
token[2].split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
column = 0 if index > 0
results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
results << [[lineno, column], :on_tstring_content, value, token[3]]
lineno += value.count("\n")
end
else
@@ -350,15 +299,15 @@ def initialize
# whitespace on plain string content tokens. This allows us to later
# remove that amount of whitespace from the beginning of each line.
def <<(token)
case token.event
case token[1]
when :on_embexpr_beg, :on_heredoc_beg
@embexpr_balance += 1
@dedent = 0 if @dedent_next && @ended_on_newline
when :on_embexpr_end, :on_heredoc_end
@embexpr_balance -= 1
when :on_tstring_content
if embexpr_balance == 0
line = token.value
line = token[2]

if dedent_next && !(line.strip.empty? && line.end_with?("\n"))
leading = line[/\A(\s*)\n?/, 1]
@@ -381,7 +330,7 @@ def <<(token)
end
end

@dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
@dedent_next = token[1] == :on_tstring_content && embexpr_balance == 0
@ended_on_newline = false
tokens << token
end
@@ -394,7 +343,7 @@ def to_a
embexpr_balance = 0

tokens.each do |token|
case token.event
case token[1]
when :on_embexpr_beg, :on_heredoc_beg
embexpr_balance += 1
results << token
@@ -406,9 +355,9 @@ def to_a
lineno = token[0][0]
column = token[0][1]

token.value.split(/(?<=\n)/).each_with_index do |value, index|
token[2].split(/(?<=\n)/).each_with_index do |value, index|
column = 0 if index > 0
results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
results << [[lineno, column], :on_tstring_content, value, token[3]]
lineno += 1
end
else
@@ -436,15 +385,15 @@ def to_a
results << token
index += 1

case token.event
case token[1]
when :on_embexpr_beg, :on_heredoc_beg
embexpr_balance += 1
when :on_embexpr_end, :on_heredoc_end
embexpr_balance -= 1
when :on_tstring_content
if embexpr_balance == 0
while index < max_index && tokens[index].event == :on_tstring_content && !token.value.match?(/\\\r?\n\z/)
token.value << tokens[index].value
while index < max_index && tokens[index][1] == :on_tstring_content && !token[2].match?(/\\\r?\n\z/)
token[2] << tokens[index][2]
index += 1
end
end
@@ -467,7 +416,7 @@ def to_a
# whitespace calculation we performed above. This is because
# checking if the subsequent token needs to be dedented is common to
# both the dedent calculation and the ignored_sp insertion.
case token.event
case token[1]
when :on_embexpr_beg
embexpr_balance += 1
results << token
@@ -479,7 +428,7 @@ def to_a
# Here we're going to split the string on newlines, but maintain
# the newlines in the resulting array. We'll do that with a look
# behind assertion.
splits = token.value.split(/(?<=\n)/)
splits = token[2].split(/(?<=\n)/)
index = 0

while index < splits.length
@@ -536,12 +485,12 @@ def to_a
ignored = deleted_chars.join
line.delete_prefix!(ignored)

results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
results << [[lineno, 0], :on_ignored_sp, ignored, token[3]]
column = ignored.length
end
end

results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
results << [[lineno, column], token[1], line, token[3]] unless line.empty?
index += 1
end
else
Expand All @@ -552,7 +501,7 @@ def to_a
end

dedent_next =
((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
((token[1] == :on_tstring_content) || (token[1] == :on_heredoc_end)) &&
embexpr_balance == 0
end

@@ -563,11 +512,11 @@ def to_a
# Here we will split between the two types of heredocs and return the
# object that will store their tokens.
def self.build(opening)
case opening.value[2]
case opening[2][2]
when "~"
DedentingHeredoc.new
when "-"
DashHeredoc.new(opening.value[3] != "'")
DashHeredoc.new(opening[2][3] != "'")
else
PlainHeredoc.new
end
@@ -647,24 +596,24 @@ def result
# Ripper doesn't include the rest of the token in the event, so we need to
# trim it down to just the content on the first line.
value = value[0..value.index("\n")]
Token.new([[lineno, column], event, value, lex_state])
[[lineno, column], event, value, lex_state]
when :on_comment
IgnoreStateToken.new([[lineno, column], event, value, lex_state])
[[lineno, column], event, value, lex_state]
when :on_heredoc_end
# Heredoc end tokens can be emitted in an odd order, so we don't
# want to bother comparing the state on them.
last_heredoc_end = token.location.end_offset
IgnoreStateToken.new([[lineno, column], event, value, lex_state])
[[lineno, column], event, value, lex_state]
when :on_embexpr_end
IgnoreStateToken.new([[lineno, column], event, value, lex_state])
[[lineno, column], event, value, lex_state]
when :on_words_sep
# Ripper emits one token per line.
value.each_line.with_index do |line, index|
if index > 0
lineno += 1
column = 0
end
tokens << Token.new([[lineno, column], event, line, lex_state])
tokens << [[lineno, column], event, line, lex_state]
end
tokens.pop
when :on_regexp_end
@@ -696,7 +645,7 @@ def result
previous_state
end

Token.new([[lineno, column], event, value, lex_state])
[[lineno, column], event, value, lex_state]
when :on_eof
eof_token = token
previous_token = result_value[index - 1][0]
@@ -721,13 +670,13 @@ def result
end_offset += 3
end

tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
tokens << [[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state]
end
end

Token.new([[lineno, column], event, value, lex_state])
[[lineno, column], event, value, lex_state]
else
Token.new([[lineno, column], event, value, lex_state])
[[lineno, column], event, value, lex_state]
end

previous_state = lex_state
@@ -813,9 +762,8 @@ def result
tokens = tokens[0...-1]

# We sort by location because Ripper.lex sorts.
# Manually implemented instead of `sort_by!(&:location)` for performance.
tokens.sort_by! do |token|
line, column = token.location
line, column = token[0]
source.byte_offset(line, column)
end

@@ -834,7 +782,7 @@ def insert_on_sp(tokens, source, data_loc, bom, eof_token)
prev_token_end = bom ? 3 : 0

tokens.each do |token|
line, column = token.location
line, column = token[0]
start_offset = source.byte_offset(line, column)

# Ripper reports columns on line 1 without counting the BOM, so we
@@ -858,50 +806,28 @@ def insert_on_sp(tokens, source, data_loc, bom, eof_token)
continuation = sp_value[continuation_index...next_whitespace_index]
second_whitespace = sp_value[next_whitespace_index..]

new_tokens << IgnoreStateToken.new([
[sp_line, sp_column],
:on_sp,
first_whitespace,
prev_token_state
]) unless first_whitespace.empty?

new_tokens << IgnoreStateToken.new([
[sp_line, sp_column + continuation_index],
:on_sp,
continuation,
prev_token_state
])

new_tokens << IgnoreStateToken.new([
[sp_line + 1, 0],
:on_sp,
second_whitespace,
prev_token_state
]) unless second_whitespace.empty?
new_tokens << [[sp_line, sp_column], :on_sp, first_whitespace, prev_token_state] unless first_whitespace.empty?
new_tokens << [[sp_line, sp_column + continuation_index], :on_sp, continuation, prev_token_state]
new_tokens << [[sp_line + 1, 0], :on_sp, second_whitespace, prev_token_state] unless second_whitespace.empty?
else
new_tokens << IgnoreStateToken.new([
[sp_line, sp_column],
:on_sp,
sp_value,
prev_token_state
])
new_tokens << [[sp_line, sp_column], :on_sp, sp_value, prev_token_state]
end
end

new_tokens << token
prev_token_state = token.state
prev_token_end = start_offset + token.value.bytesize
prev_token_state = token[3]
prev_token_end = start_offset + token[2].bytesize
end

unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
end_offset = eof_token.location.end_offset
if prev_token_end < end_offset
new_tokens << IgnoreStateToken.new([
new_tokens << [
[source.line(prev_token_end), source.column(prev_token_end)],
:on_sp,
source.slice(prev_token_end, end_offset - prev_token_end),
prev_token_state
])
]
end
end

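The net effect of the lib/prism/lex_compat.rb changes above: tokens are no longer wrapped in the Token delegator, and every consumer indexes the Ripper-compatible array directly. A minimal sketch of the field mapping, using plain Ripper.lex output (nothing here is specific to this PR):

require "ripper"

# Each Ripper-compatible token is a plain four-element array:
#   [[lineno, column], event, value, state]
token = Ripper.lex("foo = 1").first

token[0] # => [1, 0]      (was token.location on the removed Token wrapper)
token[1] # => :on_ident   (was token.event)
token[2] # => "foo"       (was token.value)
token[3] # => lexer state (was token.state)

The state-insensitive comparison that IgnoreStateToken used to provide moves out of the library entirely and into the compare_lex helper in rakelib/lex.rake below.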
2 changes: 1 addition & 1 deletion lib/prism/translation/ripper.rb
@@ -88,7 +88,7 @@ def self.lex(src, filename = "-", lineno = 1, raise_errors: false)
# # => ["def", " ", "m", "(", "a", ")", " ", "nil", " ", "end"]
#
def self.tokenize(...)
lex(...).map(&:value)
lex(...).map { |token| token[2] }
end

# This contains a table of all of the parser events and their
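With the wrapper gone, tokenize can no longer map over :value and instead picks the source slice out of each token array by index. The behavior is unchanged; the example from the doc comment above still holds:

require "prism"

# token[2] is the slice of source each token represents.
Prism::Translation::Ripper.tokenize("def m(a) nil end")
# => ["def", " ", "m", "(", "a", ")", " ", "nil", " ", "end"]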
19 changes: 18 additions & 1 deletion rakelib/lex.rake
@@ -30,7 +30,7 @@ module Prism
end

result = Prism.lex_compat(source)
if result.errors.empty? && Ripper.lex(source) == result.value
if result.errors.empty? && compare_lex(Ripper.lex(source), result.value)
@passing_file_count += 1
true
else
@@ -54,6 +54,23 @@
PERCENT=#{(passing_file_count.to_f / (passing_file_count + failing_file_count) * 100).round(2)}%
RESULTS
end

private

def compare_lex(ripper, prism)
[ripper.length, prism.length].max.times do |index|
ripper_token = ripper[index]
prism_token = prism[index]

# There are some tokens that have slightly different state that does not
# affect the parse tree, so they may not match.
if ripper_token && prism_token && ripper_token[1] == prism_token[1] && %i[on_comment on_heredoc_end on_embexpr_end on_sp].include?(ripper_token[1])
ripper_token[3] = prism_token[3] = nil
end
end

ripper == prism
end
end

class << self
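compare_lex keeps the old exact-equality check except for the four event types whose lexer state legitimately diverges between Ripper and prism. A hypothetical pair of comment tokens (the symbol states are stand-ins for real Ripper::Lexer::State objects) shows the normalization:

# Hypothetical tokens that differ only in lexer state:
ripper_token = [[1, 0], :on_comment, "# hi\n", :EXPR_BEG]
prism_token  = [[1, 0], :on_comment, "# hi\n", :EXPR_ARG]

# :on_comment is in the ignore list, so both states are blanked before the
# final array comparison:
ripper_token[3] = prism_token[3] = nil
ripper_token == prism_token # => true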