From bc8dc67b8924d841ea8d429a63dff5fa542f7261 Mon Sep 17 00:00:00 2001 From: Rob Durst Date: Wed, 27 Dec 2023 14:56:33 -0700 Subject: [PATCH] ugly first iteration of lexer with tests --- .gitignore | 1 + Gemfile.lock | 3 + spec/spec_helper.rb | 4 + spec/zodiac/character_helpers_spec.rb | 71 ++++++++ spec/zodiac/cli_spec.rb | 1 + spec/zodiac/lexer_spec.rb | 164 ++++++++++++++++++ spec/zodiac/parser_spec.rb | 38 ---- src/zodiac/character_helpers.rb | 24 ++- src/zodiac/lex_error.rb | 6 + src/zodiac/lexer.rb | 154 ++++++++++++++++ src/zodiac/parser.rb | 241 ++++++++++++++++++++------ 11 files changed, 616 insertions(+), 91 deletions(-) create mode 100644 spec/spec_helper.rb create mode 100644 spec/zodiac/lexer_spec.rb delete mode 100644 spec/zodiac/parser_spec.rb create mode 100644 src/zodiac/lex_error.rb create mode 100644 src/zodiac/lexer.rb diff --git a/.gitignore b/.gitignore index e3200e0..10dd6f5 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,4 @@ build-iPhoneSimulator/ # Used by RuboCop. Remote config files pulled in from inherit_from directive. # .rubocop-https?--* +coverage diff --git a/Gemfile.lock b/Gemfile.lock index 8ed5b1d..1cb0a36 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -69,5 +69,8 @@ DEPENDENCIES rubocop-rspec simplecov +RUBY VERSION + ruby 3.2.2p53 + BUNDLED WITH 2.4.19 diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..d8a8f62 --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,4 @@ +# frozen_string_literal: true + +require 'simplecov' +SimpleCov.start diff --git a/spec/zodiac/character_helpers_spec.rb b/spec/zodiac/character_helpers_spec.rb index 695ca01..ae02505 100644 --- a/spec/zodiac/character_helpers_spec.rb +++ b/spec/zodiac/character_helpers_spec.rb @@ -1,10 +1,53 @@ # frozen_string_literal: true +require './spec/spec_helper' require './src/zodiac/character_helpers' describe Zodiac::CharacterHelpers do include described_class + describe '.symbol?' 
do + context 'when symbol' do + it 'returns true' do + expect(symbol?('.')).to eq(true) + end + end + + context 'when not symbol' do + it 'returns false' do + expect(symbol?('a')).to eq(false) + end + end + end + + describe '.string_start?' do + context 'when string start' do + it 'returns true' do + expect(string_start?('"')).to eq(true) + end + end + + context 'when not string start' do + it 'returns false' do + expect(string_start?('a')).to eq(false) + end + end + end + + describe '.double_symbol?' do + context 'when double symbol' do + it 'returns true' do + expect(double_symbol?('*')).to eq(true) + end + end + + context 'when not double symbol' do + it 'returns false' do + expect(double_symbol?('-')).to eq(false) + end + end + end + describe '.alpha_num?' do context 'when letter' do it 'returns true' do @@ -25,6 +68,20 @@ end end + describe '.contains_equal_sign?' do + context 'when contains equal sign' do + it 'returns true' do + expect(contains_equal_sign?('a=b')).to eq(true) + end + end + + context 'when does not contain equal sign' do + it 'returns false' do + expect(contains_equal_sign?('a')).to eq(false) + end + end + end + describe '.letter?' do context 'when letter' do it 'returns true' do @@ -64,4 +121,18 @@ end end end + + describe '.underscore?' 
do + context 'when underscore' do + it 'returns true' do + expect(underscore?('_')).to eq(true) + end + end + + context 'when not underscore' do + it 'returns false' do + expect(underscore?('a')).to eq(false) + end + end + end end diff --git a/spec/zodiac/cli_spec.rb b/spec/zodiac/cli_spec.rb index 240fd1f..3abe803 100644 --- a/spec/zodiac/cli_spec.rb +++ b/spec/zodiac/cli_spec.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require './spec/spec_helper' require './src/zodiac/cli' describe Zodiac::CLI do diff --git a/spec/zodiac/lexer_spec.rb b/spec/zodiac/lexer_spec.rb new file mode 100644 index 0000000..abef002 --- /dev/null +++ b/spec/zodiac/lexer_spec.rb @@ -0,0 +1,164 @@ +# frozen_string_literal: true + +require './spec/spec_helper' +require './src/zodiac/lexer' + +describe Zodiac::Lexer do + describe '#lex' do + context 'when empty input' do + it 'returns an empty array of tokens' do + input = '' + lexer = described_class.new(input) + + expected_output = [] + + expect(lexer.lex).to eq(expected_output) + end + end + + context 'when invalid input' do + context 'when fails to lex a string' do + it 'raises an error' do + input = '"hello world' + lexer = described_class.new(input) + expect { lexer.lex }.to raise_error(Zodiac::LexError) + end + end + end + + context 'when happy path' do + it 'lexs symbols' do + input = ':[ ]{}@~$!?:=' + lexer = described_class.new(input) + + expected_output = [ + { kind: 'SYMBOL', value: ':' }, + { kind: 'SYMBOL', value: '[' }, + { kind: 'SYMBOL', value: ']' }, + { kind: 'SYMBOL', value: '{' }, + { kind: 'SYMBOL', value: '}' }, + { kind: 'SYMBOL', value: '@' }, + { kind: 'SYMBOL', value: '~' }, + { kind: 'SYMBOL', value: '$' }, + { kind: 'SYMBOL', value: '!' }, + { kind: 'SYMBOL', value: '?' 
}, + { kind: 'SYMBOL', value: ':' }, + { kind: 'SYMBOL', value: '=' } + ] + + expect(lexer.lex).to eq(expected_output) + end + + it 'lexs whole words' do + input = 'hello WorLD c4k3 _r3d_foo' + lexer = described_class.new(input) + + expected_output = [ + { kind: 'IDENTIFIER', value: 'hello' }, + { kind: 'IDENTIFIER', value: 'WorLD' }, + { kind: 'IDENTIFIER', value: 'c4k3' }, + { kind: 'IDENTIFIER', value: '_r3d_foo' } + ] + + expect(lexer.lex).to eq(expected_output) + end + + it 'splits words on symbols' do + input = 'hello:world' + lexer = described_class.new(input) + + expected_output = [ + { kind: 'IDENTIFIER', value: 'hello' }, + { kind: 'SYMBOL', value: ':' }, + { kind: 'IDENTIFIER', value: 'world' } + ] + + expect(lexer.lex).to eq(expected_output) + end + + it 'lexs operators' do + input = '+ - * / % ** & | ^ << >> && || @@::..== === =~ +@ -@ [] <=>' + lexer = described_class.new(input) + + expected_output = [ + { kind: 'SYMBOL', value: '+' }, + { kind: 'SYMBOL', value: '-' }, + { kind: 'SYMBOL', value: '*' }, + { kind: 'SYMBOL', value: '/' }, + { kind: 'SYMBOL', value: '%' }, + { kind: 'SYMBOL', value: '**' }, + { kind: 'SYMBOL', value: '&' }, + { kind: 'SYMBOL', value: '|' }, + { kind: 'SYMBOL', value: '^' }, + { kind: 'SYMBOL', value: '<<' }, + { kind: 'SYMBOL', value: '>>' }, + { kind: 'SYMBOL', value: '&&' }, + { kind: 'SYMBOL', value: '||' }, + { kind: 'SYMBOL', value: '@@' }, + { kind: 'SYMBOL', value: '::' }, + { kind: 'SYMBOL', value: '..' 
}, + { kind: 'SYMBOL', value: '==' }, + { kind: 'SYMBOL', value: '===' }, + { kind: 'SYMBOL', value: '=~' }, + { kind: 'SYMBOL', value: '+@' }, + { kind: 'SYMBOL', value: '-@' }, + { kind: 'SYMBOL', value: '[]' }, + { kind: 'SYMBOL', value: '<=>' } + ] + + expect(lexer.lex).to eq(expected_output) + end + + it 'lexs operators with assignment' do + input = '+= -= *= /= %= **= &= |= ^= <<= >>= &&= ||= []= >= <=' + lexer = described_class.new(input) + + expected_output = [ + { kind: 'OP_ASGN', value: '+=' }, + { kind: 'OP_ASGN', value: '-=' }, + { kind: 'OP_ASGN', value: '*=' }, + { kind: 'OP_ASGN', value: '/=' }, + { kind: 'OP_ASGN', value: '%=' }, + { kind: 'OP_ASGN', value: '**=' }, + { kind: 'OP_ASGN', value: '&=' }, + { kind: 'OP_ASGN', value: '|=' }, + { kind: 'OP_ASGN', value: '^=' }, + { kind: 'OP_ASGN', value: '<<=' }, + { kind: 'OP_ASGN', value: '>>=' }, + { kind: 'OP_ASGN', value: '&&=' }, + { kind: 'OP_ASGN', value: '||=' }, + { kind: 'OP_ASGN', value: '[]=' }, + { kind: 'OP_ASGN', value: '>=' }, + { kind: 'OP_ASGN', value: '<=' } + ] + + expect(lexer.lex).to eq(expected_output) + end + + it 'lexs strings' do + input = '"hello world" \'hello world\' `hello world`' + lexer = described_class.new(input) + + expected_output = [ + { kind: 'STRING', value: '"hello world"' }, + { kind: 'STRING', value: "'hello world'" }, + { kind: 'STRING', value: '`hello world`' } + ] + + expect(lexer.lex).to eq(expected_output) + end + + it 'lexs numbers' do + input = '123 123.456' + lexer = described_class.new(input) + + expected_output = [ + { kind: 'NUMBER', value: '123' }, + { kind: 'NUMBER', value: '123.456' } + ] + + expect(lexer.lex).to eq(expected_output) + end + end + end +end diff --git a/spec/zodiac/parser_spec.rb b/spec/zodiac/parser_spec.rb deleted file mode 100644 index a032b07..0000000 --- a/spec/zodiac/parser_spec.rb +++ /dev/null @@ -1,38 +0,0 @@ -# frozen_string_literal: true - -require './src/zodiac/parser' - -describe Zodiac::Parser do - describe '#parse' do 
- context 'when empty input' do - it 'returns an empty array of tokens' do - input = '' - parser = described_class.new(input) - - expected_output = [] - - expect(parser.parse).to eq(expected_output) - end - end - - context 'when happy path' do - it 'parses symbols' do - input = ":[]{}\"\"\'\'" - parser = described_class.new(input) - - expected_output = [':', '[', ']', '{', '}', '"', '"', "'", "'"] - - expect(parser.parse).to eq(expected_output) - end - - it 'parses whole word' do - input = 'hello WorLD c4k3' - parser = described_class.new(input) - - expected_output = %w[hello WorLD c4k3] - - expect(parser.parse).to eq(expected_output) - end - end - end -end diff --git a/src/zodiac/character_helpers.rb b/src/zodiac/character_helpers.rb index 9f3a6b7..c0a0288 100644 --- a/src/zodiac/character_helpers.rb +++ b/src/zodiac/character_helpers.rb @@ -3,12 +3,28 @@ module Zodiac # Character helper methods common to parsing within the Zodiac language compiler. module CharacterHelpers + def string_start?(value) + ['"', "'", '`'].include?(value) + end + def symbol?(value) - ".:[]{}\"'".include?(value) + '.:[]{}+-*/%&|^><@~$!?:'.include?(value) + end + + def op_assign_symbol?(value) + '+-*/%*|^><&|[]'.include?(value) + end + + def double_symbol?(value) + '*<>|&@:.'.include?(value) + end + + def contains_equal_sign?(value) + value.include?('=') end def alpha_num?(value) - letter?(value) || number?(value) + letter?(value) || number?(value) || underscore?(value) end def letter?(value) @@ -18,5 +34,9 @@ def letter?(value) def number?(value) value.match?(/[0-9]/) end + + def underscore?(value) + value.match?(/_/) + end end end diff --git a/src/zodiac/lex_error.rb b/src/zodiac/lex_error.rb new file mode 100644 index 0000000..0051596 --- /dev/null +++ b/src/zodiac/lex_error.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +module Zodiac + class LexError < StandardError + end +end diff --git a/src/zodiac/lexer.rb b/src/zodiac/lexer.rb new file mode 100644 index 0000000..106a6e0 --- 
/dev/null +++ b/src/zodiac/lexer.rb @@ -0,0 +1,154 @@ +# frozen_string_literal: true + +require './src/zodiac/character_helpers' +require './src/zodiac/lex_error' + +module Zodiac + # Base lexing class for the Zodiac language. + # + # Unsupported: + # * FANCIER STRINGS like: `%'(`Q'|`q'|`x')char any_char* char + # * HERE_DOC + # * REGEXP + class Lexer + include ::Zodiac::CharacterHelpers + + def initialize(raw_string) + @raw_string = raw_string + @cur_index = 0 + @tokens = [] + end + + def lex + lex_next while @cur_index < @raw_string.size + + @tokens + end + + private + + def lex_next + @cur = @raw_string[@cur_index] + + # TODO: fix this unclear logic + foo = @raw_string[@cur_index..].index(' ') + end_index = if foo.nil? + @raw_string.size + else + foo + @cur_index + end + + if @cur == '=' + word = '' + if !@raw_string[@cur_index + 1].nil? && @raw_string[@cur_index + 1] == '~' + @cur_index += 2 + @tokens << { kind: 'SYMBOL', value: '=~' } + else + while @cur == '=' + word += @cur + @cur_index += 1 + @cur = @raw_string[@cur_index] + end + @tokens << { kind: 'SYMBOL', value: word } + end + elsif symbol?(@cur) && !@raw_string[@cur_index + 2].nil? && @raw_string[@cur_index..@cur_index + 2] == '<=>' + @tokens << { kind: 'SYMBOL', value: '<=>' } + @cur_index += 3 + elsif contains_equal_sign?(@raw_string[@cur_index..end_index]) && op_assign_symbol?(@cur) && ((end_index - @cur_index) < 4) + lex_op_assign + elsif symbol?(@cur) + if !@raw_string[@cur_index + 1].nil? && @raw_string[@cur_index..@cur_index + 1] == '+@' + @tokens << { kind: 'SYMBOL', value: '+@' } + @cur_index += 2 + elsif !@raw_string[@cur_index + 1].nil? && @raw_string[@cur_index..@cur_index + 1] == '-@' + @tokens << { kind: 'SYMBOL', value: '-@' } + @cur_index += 2 + elsif !@raw_string[@cur_index + 1].nil? 
&& @raw_string[@cur_index..@cur_index + 1] == '[]' + @tokens << { kind: 'SYMBOL', value: '[]' } + @cur_index += 2 + else + lex_symbol + end + elsif letter?(@cur) || underscore?(@cur) + lex_identifier + elsif string_start?(@cur) + lex_string + elsif number?(@cur) + lex_number + else + @cur_index += 1 + end + end + + def lex_symbol + if @cur == @raw_string[@cur_index + 1] && double_symbol?(@raw_string[@cur_index + 1]) + @tokens << { kind: 'SYMBOL', value: @cur + @raw_string[@cur_index + 1] } + @cur_index += 2 + else + @tokens << { kind: 'SYMBOL', value: @cur } + @cur_index += 1 + end + end + + # OP_ASGN : `+=' | `-=' | `*=' | `/=' | `%=' | `**=' + # | `&=' | `|=' | `^=' | `<<=' | `>>=' + # | `&&=' | `||=' | '[]=' + def lex_op_assign + end_index = @raw_string[@cur_index..].index('=') + @cur_index + @tokens << { kind: 'OP_ASGN', value: @raw_string[@cur_index..end_index] } + @cur_index = end_index + 1 + end + + # STRING : `"' any_char* `"' + # | `'' any_char* `'' + # | ``' any_char* ``' + def lex_string + rest_of_string = @raw_string[@cur_index + 1..] + raise LexError, 'String not terminated' unless rest_of_string.include?(@cur) + + end_index = @raw_string[@cur_index + 1..].index(@cur) + @cur_index + 1 + @tokens << { kind: 'STRING', value: @raw_string[@cur_index..end_index] } + @cur_index = end_index + 1 + end + + # NUMBER : `0' | (`1'..'9') (`0'..'9')* + # | decimal_digit decimal_digit* (`.' decimal_digit decimal_digit*)? + def lex_number + word = lex_single_number + + if @cur == '.' + word += @cur + @cur_index += 1 + @cur = @raw_string[@cur_index] + word += lex_single_number + end + + @tokens << { kind: 'NUMBER', value: word } + end + + def lex_single_number + word = '' + + while (@cur_index < @raw_string.size) && number?(@cur) + word += @cur + @cur_index += 1 + @cur = @raw_string[@cur_index] + end + + word + end + + # IDENTIFIER is the sequence of characters in the pattern of /[a-zA-Z_][a-zA-Z0-9_]*/. 
+ def lex_identifier + word = '' + + while (@cur_index < @raw_string.size) && alpha_num?(@cur) + word += @cur + @cur_index += 1 + @cur = @raw_string[@cur_index] + end + + @tokens << { kind: 'IDENTIFIER', value: word } + end + end +end diff --git a/src/zodiac/parser.rb b/src/zodiac/parser.rb index e4fcee8..98b2f2a 100644 --- a/src/zodiac/parser.rb +++ b/src/zodiac/parser.rb @@ -1,53 +1,192 @@ # frozen_string_literal: true +# Here is the syntax of Ruby in pseudo BNF. For more detail, see parse.y in Ruby distribution. -require './src/zodiac/character_helpers' - -module Zodiac - # Base parsing class for the Zodiac language. - class Parser - include ::Zodiac::CharacterHelpers - - def initialize(raw_string) - @raw_string = raw_string - @cur_index = 0 - @tokens = [] - end - - def parse - parse_next while @cur_index < @raw_string.size - - @tokens - end - - private - - def parse_next - @cur = @raw_string[@cur_index] - - if symbol?(@cur) - parse_symbol - elsif letter?(@cur) - parse_word - else - @cur_index += 1 - end - end - - def parse_symbol - @tokens << @cur - @cur_index += 1 - end - - def parse_word - word = '' - - while (@cur_index < @raw_string.size) && alpha_num?(@cur) - word += @cur - @cur_index += 1 - @cur = @raw_string[@cur_index] - end - - @tokens << word - end - end -end +# PROGRAM : COMPSTMT + +# COMPSTMT : STMT (TERM EXPR)* [TERM] + +# STMT : CALL do [`|' [BLOCK_VAR] `|'] COMPSTMT end +# | undef FNAME +# | alias FNAME FNAME +# | STMT if EXPR +# | STMT while EXPR +# | STMT unless EXPR +# | STMT until EXPR +# | `BEGIN' `{' COMPSTMT `}' +# | `END' `{' COMPSTMT `}' +# | LHS `=' COMMAND [do [`|' [BLOCK_VAR] `|'] COMPSTMT end] +# | EXPR + +# EXPR : MLHS `=' MRHS +# | return CALL_ARGS +# | yield CALL_ARGS +# | EXPR and EXPR +# | EXPR or EXPR +# | not EXPR +# | COMMAND +# | `!' COMMAND +# | ARG + +# CALL : FUNCTION +# | COMMAND + +# COMMAND : OPERATION CALL_ARGS +# | PRIMARY `.' 
OPERATION CALL_ARGS +# | PRIMARY `::' OPERATION CALL_ARGS +# | super CALL_ARGS + +# FUNCTION : OPERATION [`(' [CALL_ARGS] `)'] +# | PRIMARY `.' OPERATION `(' [CALL_ARGS] `)' +# | PRIMARY `::' OPERATION `(' [CALL_ARGS] `)' +# | PRIMARY `.' OPERATION +# | PRIMARY `::' OPERATION +# | super `(' [CALL_ARGS] `)' +# | super + +# ARG : LHS `=' ARG +# | LHS OP_ASGN ARG +# | ARG `..' ARG +# | ARG `...' ARG +# | ARG `+' ARG +# | ARG `-' ARG +# | ARG `*' ARG +# | ARG `/' ARG +# | ARG `%' ARG +# | ARG `**' ARG +# | `+' ARG +# | `-' ARG +# | ARG `|' ARG +# | ARG `^' ARG +# | ARG `&' ARG +# | ARG `<=>' ARG +# | ARG `>' ARG +# | ARG `>=' ARG +# | ARG `<' ARG +# | ARG `<=' ARG +# | ARG `==' ARG +# | ARG `===' ARG +# | ARG `!=' ARG +# | ARG `=~' ARG +# | ARG `!~' ARG +# | `!' ARG +# | `~' ARG +# | ARG `<<' ARG +# | ARG `>>' ARG +# | ARG `&&' ARG +# | ARG `||' ARG +# | defined? ARG +# | PRIMARY + +# PRIMARY : `(' COMPSTMT `)' +# | LITERAL +# | VARIABLE +# | PRIMARY `::' IDENTIFIER +# | `::' IDENTIFIER +# | PRIMARY `[' [ARGS] `]' +# | `[' [ARGS [`,']] `]' +# | `{' [(ARGS|ASSOCS) [`,']] `}' +# | return [`(' [CALL_ARGS] `)'] +# | yield [`(' [CALL_ARGS] `)'] +# | defined? 
`(' ARG `)' +# | FUNCTION +# | FUNCTION `{' [`|' [BLOCK_VAR] `|'] COMPSTMT `}' +# | if EXPR THEN +# COMPSTMT +# (elsif EXPR THEN COMPSTMT)* +# [else COMPSTMT] +# end +# | unless EXPR THEN +# COMPSTMT +# [else COMPSTMT] +# end +# | while EXPR DO COMPSTMT end +# | until EXPR DO COMPSTMT end +# | case COMPSTMT +# (when WHEN_ARGS THEN COMPSTMT)+ +# [else COMPSTMT] +# end +# | for BLOCK_VAR in EXPR DO +# COMPSTMT +# end +# | begin +# COMPSTMT +# [rescue [ARGS] DO COMPSTMT]+ +# [else COMPSTMT] +# [ensure COMPSTMT] +# end +# | class IDENTIFIER [`<' IDENTIFIER] +# COMPSTMT +# end +# | module IDENTIFIER +# COMPSTMT +# end +# | def FNAME ARGDECL +# COMPSTMT +# end +# | def SINGLETON (`.'|`::') FNAME ARGDECL +# COMPSTMT +# end + +# WHEN_ARGS : ARGS [`,' `*' ARG] +# | `*' ARG + +# THEN : TERM +# | then +# | TERM then + +# DO : TERM +# | do +# | TERM do + +# BLOCK_VAR : LHS +# | MLHS + +# MLHS : MLHS_ITEM `,' [MLHS_ITEM (`,' MLHS_ITEM)*] [`*' [LHS]] +# | `*' LHS + +# MLHS_ITEM : LHS +# | '(' MLHS ')' + +# LHS : VARIABLE +# | PRIMARY `[' [ARGS] `]' +# | PRIMARY `.' IDENTIFIER + +# MRHS : ARGS [`,' `*' ARG] +# | `*' ARG + +# CALL_ARGS : ARGS +# | ARGS [`,' ASSOCS] [`,' `*' ARG] [`,' `&' ARG] +# | ASSOCS [`,' `*' ARG] [`,' `&' ARG] +# | `*' ARG [`,' `&' ARG] +# | `&' ARG +# | COMMAND + +# ARGS : ARG (`,' ARG)* + +# ARGDECL : `(' ARGLIST `)' +# | ARGLIST TERM + +# ARGLIST : IDENTIFIER(`,'IDENTIFIER)*[`,'`*'[IDENTIFIER]][`,'`&'IDENTIFIER] +# | `*'IDENTIFIER[`,'`&'IDENTIFIER] +# | [`&'IDENTIFIER] + +# SINGLETON : VARIABLE +# | `(' EXPR `)' + +# ASSOCS : ASSOC (`,' ASSOC)* + +# ASSOC : ARG `=>' ARG + +# VARIABLE : VARNAME +# | nil +# | self + +# LITERAL : numeric +# | SYMBOL +# | STRING +# | STRING2 +# | HERE_DOC +# | REGEXP + +# TERM : `;' +# | `\n'