From 378f19f78c8ad83dfb4eb78efc6fb46cbed3a04b Mon Sep 17 00:00:00 2001 From: Jianling Zhong Date: Sat, 30 Nov 2024 19:34:09 -0800 Subject: [PATCH] Support octal, hex, and arbitrary radix numbers --- compiler+runtime/bin/compile | 2 + compiler+runtime/bin/format | 10 + .../include/cpp/jank/read/lex.hpp | 9 +- compiler+runtime/src/cpp/jank/read/lex.cpp | 280 +++++++++++++++--- compiler+runtime/test/cpp/jank/read/lex.cpp | 145 +++++++-- 5 files changed, 384 insertions(+), 62 deletions(-) create mode 100755 compiler+runtime/bin/format diff --git a/compiler+runtime/bin/compile b/compiler+runtime/bin/compile index 3a1037e86..cd9934bda 100755 --- a/compiler+runtime/bin/compile +++ b/compiler+runtime/bin/compile @@ -2,4 +2,6 @@ set -euo pipefail +git_root=$(git rev-parse --show-toplevel) +"$git_root"/compiler+runtime/bin/format cmake --build build "$@" diff --git a/compiler+runtime/bin/format b/compiler+runtime/bin/format new file mode 100755 index 000000000..4171aa7e0 --- /dev/null +++ b/compiler+runtime/bin/format @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +set -euo pipefail + +git_root=$(git rev-parse --show-toplevel) + +for i in $(git status | grep -E "modified:.*[hc]pp" | sed 's/modified:\s*//'); do + "$git_root"/compiler+runtime/build/llvm-install/usr/local/bin/clang-format -i "$i" + echo "formatted" "$i" +done diff --git a/compiler+runtime/include/cpp/jank/read/lex.hpp b/compiler+runtime/include/cpp/jank/read/lex.hpp index 61b243b8b..ff90b1774 100644 --- a/compiler+runtime/include/cpp/jank/read/lex.hpp +++ b/compiler+runtime/include/cpp/jank/read/lex.hpp @@ -150,14 +150,19 @@ namespace jank::read::lex processor(native_persistent_string_view const &f); result next(); - option peek() const; + option peek(native_integer const ahead = 1) const; option check_whitespace(native_bool const found_space); + native_bool is_valid_num_char(char const c) const; iterator begin(); iterator end(); size_t pos{}; - /* Whether or not the previous token requires a space after it. */ + native_integer radix{ 10 }; + /* The 'r' used in arbitrary radix (prefixed with N and then r, where N is the radix (2 <= radix <= 36); */ + /* e.g. 2r10101 for binary, 16rebed00d for hex) */ + native_bool found_r{}; + /* Whether the previous token requires a space after it. */ native_bool require_space{}; /* True when seeing a '/' following a number. */ native_bool found_slash_after_number{}; diff --git a/compiler+runtime/src/cpp/jank/read/lex.cpp b/compiler+runtime/src/cpp/jank/read/lex.cpp index 557a820c6..74e20bd93 100644 --- a/compiler+runtime/src/cpp/jank/read/lex.cpp +++ b/compiler+runtime/src/cpp/jank/read/lex.cpp @@ -267,6 +267,50 @@ namespace jank::read || c == '<' || c == '>' || c == '#' || c == '%'; } + static native_bool is_lower_letter(char const c) + { + return c >= 'a' && c <= 'z'; + } + + static native_bool is_upper_letter(char const c) + { + return c >= 'A' && c <= 'Z'; + } + + static native_bool is_letter(char const c) + { + return is_lower_letter(c) || is_upper_letter(c); + } + + native_bool processor::is_valid_num_char(char const c) const + { + if(c == '-' || c == '+' || c == '.') + { + return radix == 10; + } + if(radix == 10 && (c == 'e' || c == 'E')) + { + return true; + } + if(radix <= 10) + { + return c >= '0' && c < '0' + radix; + } + if(radix == 16 && (c == 'x' || c == 'X')) + { + return true; + } + if(is_upper_letter(c)) + { + return c < 'A' + radix - 10; + } + if(is_lower_letter(c)) + { + return c < 'a' + radix - 10; + } + return c >= '0' && c <= '9'; + } + result processor::next() { /* Skip whitespace. */ @@ -278,8 +322,7 @@ namespace jank::read return ok(token{ pos, token_kind::eof }); } - auto const c(file[pos]); - if(std::isspace(c) == 0 && c != ',') + if(auto const c(file[pos]); std::isspace(c) == 0 && c != ',') { break; } @@ -288,8 +331,7 @@ namespace jank::read ++pos; } - auto const token_start(pos); - switch(file[token_start]) + switch(auto const token_start(pos); file[token_start]) { case '(': require_space = false; @@ -380,18 +422,72 @@ namespace jank::read } /* Numbers. */ case '-': + { + if(found_r) + { + ++pos; + return err(error{ token_start, pos, "invalid number: '-' after radix" }); + } + } case '0' ... '9': { - auto &&e(check_whitespace(found_space)); - if(e.is_some()) + if(auto &&e(check_whitespace(found_space)); e.is_some()) { return err(std::move(e.unwrap())); } + native_bool contains_leading_digit{ file[token_start] != '-' }; native_bool contains_dot{}; native_bool is_scientific{}; native_bool found_exponent_sign{}; native_bool expecting_exponent{}; + auto r_pos{ pos }; /* records the 'r' position if one is found */ + native_bool found_beginning_negative{}; + + if(file[token_start] == '-' && peek().unwrap_or(' ') == '0' + && !found_slash_after_number) + { + contains_leading_digit = true; + if(auto const f{ peek(2) }; f.is_some()) + { + if(f != 'x' && f != 'X') + { + radix = 8; + } + if(f == 'x' || f == 'X') + { + radix = 16; + } + if(radix == 8) + { + ++pos; + } + if(radix == 16) + { + pos += 2; + } + } + } + else if(file[token_start] == '0' && !found_slash_after_number) + { + contains_leading_digit = true; + if(auto const f{ peek() }; f.is_some()) + { + if(f != 'x' && f != 'X') + { + radix = 8; + } + if(f == 'x' || f == 'X') + { + radix = 16; + } + if(radix == 16) + { + ++pos; + } + } + } + while(true) { auto const oc(peek()); @@ -399,9 +495,7 @@ namespace jank::read { break; } - - auto const c(oc.unwrap()); - if(c == '.') + if(auto const c(oc.unwrap()); c == '.') { if(contains_dot || is_scientific || !contains_leading_digit) { @@ -409,43 +503,96 @@ namespace jank::read return err(error{ token_start, pos, "invalid number" }); } contains_dot = true; + if(radix != 10 && radix != 8) + { + ++pos; + continue; + } + radix = 10; /* numbers like 02.3 should be parsed as decimal numbers. */ } else if(c == 'e' || c == 'E') { - if(is_scientific || !contains_leading_digit) + if(found_r) { ++pos; - return err(error{ token_start, pos, "invalid number" }); + continue; + } + if(radix < 15) + { + /* numbers containing 'e' and radix < 15, then it must be a decimal number. */ + radix = 10; + if(is_scientific || !contains_leading_digit) + { + ++pos; + return err(error{ token_start, pos, "invalid number" }); + } + if(found_slash_after_number) + { + ++pos; + found_slash_after_number = false; + return err(error{ token_start, + pos, + "invalid ratio: ratio cannot have scientific notation" }); + } + is_scientific = true; + expecting_exponent = true; } - is_scientific = true; - expecting_exponent = true; } else if(c == '+' || c == '-') { - if(found_exponent_sign || !is_scientific || !expecting_exponent) + if(radix == 10) { - ++pos; - return err(error{ token_start, pos, "invalid number" }); + if(found_exponent_sign || !is_scientific || !expecting_exponent) + { + ++pos; + return err(error{ token_start, pos, "invalid number" }); + } } found_exponent_sign = true; } + else if(c == 'r' || c == 'R') + { + ++pos; + if(found_r) + { + continue; + } + found_r = true; + r_pos = pos; + if(found_slash_after_number || contains_dot || found_exponent_sign + || expecting_exponent) + { + return err(error{ token_start, + pos, + "invalid number: arbitrary radix number can only integer" }); + } + } else if(c == '/') { require_space = false; ++pos; if(found_exponent_sign || is_scientific || expecting_exponent || contains_dot - || found_slash_after_number) + || found_slash_after_number || (radix != 10 && radix != 8)) { return err(error{ token_start, pos, "invalid ratio" }); } found_slash_after_number = true; + radix = 10; // numbers like 02/3 should be parsed as decimal ratios. /* skip the '/' char and look for the denominator number. */ ++pos; auto const denominator(next()); - if(denominator.is_ok() && denominator.expect_ok().kind == token_kind::integer) + found_slash_after_number = false; + if(denominator.is_ok()) { + if(denominator.expect_ok().kind != token_kind::integer) + { + return err(error{ + token_start, + pos, + "invalid ratio: expecting an integer denominator", + }); + } auto const &denominator_token(denominator.expect_ok()); - found_slash_after_number = false; return ok( token(token_start, pos - token_start, @@ -453,8 +600,7 @@ namespace jank::read { .numerator = std::strtoll(file.data() + token_start, nullptr, 10), .denominator = boost::get(denominator_token.data) })); } - return err( - error{ token_start, pos, "invalid ratio: expecting an integer denominator" }); + return denominator.expect_err(); } else if(std::isdigit(c) == 0) { @@ -464,15 +610,16 @@ namespace jank::read return err( error{ token_start, pos, "unexpected end of real, expecting exponent" }); } - break; + if(!is_letter(c) || (pos == token_start && file[token_start] != '0')) + { + break; + } } else if(expecting_exponent) { expecting_exponent = false; } - contains_leading_digit = true; - ++pos; } @@ -483,26 +630,79 @@ namespace jank::read } /* Tokens beginning with - are ambiguous; it's only a negative number if it has numbers - * to follow. - * TODO: handle numbers starting with `+` */ + * to follow. + * TODO: handle numbers starting with `+` */ if(file[token_start] != '-' || (pos - token_start) >= 1) { require_space = true; ++pos; - if(contains_dot || is_scientific) + auto number_start{ token_start }; + if(file[token_start] == '-' || file[token_start] == '+') { - return ok(token{ token_start, - pos - token_start, - token_kind::real, - std::strtold(file.data() + token_start, nullptr) }); + number_start = token_start + 1; + found_beginning_negative = file[token_start] == '-'; } - else + if(found_r) + { + radix = std::strtoll(file.data() + token_start, nullptr, 10); + if(radix < 0) + { + radix = -radix; + found_beginning_negative = true; + } + if(radix < 2 || radix > 36) + { + return err( + error{ token_start, + pos, + fmt::format("invalid number: radix {} is out of range", radix) }); + } + number_start = r_pos + 1; + } + + /* check for invalid digits */ + native_vector invalid_digits{}; + for(auto i{ number_start }; i < pos; i++) + { + if(!is_valid_num_char(file[i])) + { + invalid_digits.emplace_back(file[i]); + } + } + if(invalid_digits.size() > 0) { + found_r = false; + return err( + error{ token_start, + pos, + fmt::format("invalid number: char {} are invalid for radix {}", + std::string(invalid_digits.begin(), invalid_digits.end()), + radix) }); + } + /* real numbers */ + if(contains_dot || is_scientific || found_exponent_sign) + { + if(radix != 10 || found_r) + { + return err( + error{ token_start, + pos, + fmt::format("invalid number: radix {} number cannot use scientific " + "notation, have '.', or have '+-' inside the number", + radix) }); + } return ok(token{ token_start, pos - token_start, - token_kind::integer, - std::strtoll(file.data() + token_start, nullptr, 10) }); + token_kind::real, + std::strtold(file.data() + token_start, nullptr) }); } + + /* integers */ + auto const parsed_int{ std::strtoll(file.data() + number_start, nullptr, radix) + * (found_beginning_negative ? -1 : 1) }; + radix = 10; + found_r = false; + return ok(token{ token_start, pos - token_start, token_kind::integer, parsed_int }); } /* XXX: Fall through to symbol starting with - */ } @@ -668,8 +868,8 @@ namespace jank::read pos++; /* Unescaped strings can be read right from memory, but escaped strings require - * some processing first, to turn the escape sequences into the necessary characters. - * We use distinct token types for these so we can optimize for the typical case. */ + * some processing first, to turn the escape sequences into the necessary characters. + * We use distinct token types for these so we can optimize for the typical case. */ auto const kind(contains_escape ? token_kind::escaped_string : token_kind::string); return ok(token{ token_start, pos - token_start, @@ -789,14 +989,14 @@ namespace jank::read } } - option processor::peek() const + option processor::peek(native_integer const ahead) const { - auto const next_pos(pos + 1); - if(next_pos >= file.size()) + auto const peek_pos(pos + ahead); + if(peek_pos >= file.size()) { return none; } - return some(file[next_pos]); + return some(file[peek_pos]); } } } diff --git a/compiler+runtime/test/cpp/jank/read/lex.cpp b/compiler+runtime/test/cpp/jank/read/lex.cpp index 31ef2a4a3..022dfda23 100644 --- a/compiler+runtime/test/cpp/jank/read/lex.cpp +++ b/compiler+runtime/test/cpp/jank/read/lex.cpp @@ -407,25 +407,23 @@ namespace jank::read::lex { processor p{ "4//5" }; native_vector> tokens(p.begin(), p.end()); - CHECK( - tokens - == make_results({ { error(0, 4, "invalid ratio: expecting an integer denominator") } })); + CHECK(tokens == make_results({ { error(2, 2, "invalid symbol") } })); } SUBCASE("Failures - x/x/x") { processor p{ "4/5/4" }; native_vector> tokens(p.begin(), p.end()); - CHECK(tokens - == make_results({ { error(0, 3, "invalid ratio: expecting an integer denominator") }, - { error(3, 3, "invalid symbol") } })); + CHECK( + tokens + == make_results({ { error(2, 3, "invalid ratio") }, { error(3, 3, "invalid symbol") } })); } SUBCASE("Failures - x/x/x/x") { processor p{ "4/5/4/5/6/7/7" }; native_vector> tokens(p.begin(), p.end()); - CHECK(tokens - == make_results({ { error(0, 3, "invalid ratio: expecting an integer denominator") }, - { error(3, 3, "invalid symbol") } })); + CHECK( + tokens + == make_results({ { error(2, 3, "invalid ratio") }, { error(3, 3, "invalid symbol") } })); } SUBCASE("Failures - x.x/x") { @@ -443,6 +441,26 @@ namespace jank::read::lex tokens == make_results({ { error(0, 5, "invalid ratio: expecting an integer denominator") } })); } + SUBCASE("Failures - xex/x") + { + processor p{ "4e1/5" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_results({ + error{ 0, 3, "invalid ratio" }, + error{ 3, 3, "invalid symbol" } + })); + } + SUBCASE("Failures - x/xex") + { + processor p{ "4/5e9" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_results({ + error{ 2, 3, "invalid ratio: ratio cannot have scientific notation" }, + token{ 3, 2, token_kind::symbol, "e9"sv } + })); + } } TEST_CASE("Integer") { @@ -466,6 +484,100 @@ namespace jank::read::lex })); } + SUBCASE("Positive multiple numbers") + { + processor p{ "0 1234" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_tokens({ + { 0, 1, token_kind::integer, 0ll }, + { 2, 4, token_kind::integer, 1234ll }, + })); + } + + SUBCASE("Octal number") + { + processor p{ "034 -034 08.9 07e1" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_tokens({ + { 0, 3, token_kind::integer, 28ll }, + { 4, 4, token_kind::integer, -28ll }, + { 9, 4, token_kind::real, 8.9l }, + { 14, 4, token_kind::real, 70.0l }, + })); + } + + SUBCASE("Invalid octal number") + { + processor p{ "08 0a -08" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_results({ + error{ 0, 2, "invalid number: char 8 are invalid for radix 8" }, + error{ 3, 5, "invalid number: char a are invalid for radix 8" }, + error{ 6, 9, "invalid number: char 8 are invalid for radix 8" }, + })); + } + + SUBCASE("Hex numbers") + { + processor p{ "0x34 0Xab 0x12ab 123 0Xef43 -0x1a" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_tokens({ + { 0, 4, token_kind::integer, 52ll }, + { 5, 4, token_kind::integer, 171ll }, + { 10, 6, token_kind::integer, 4779ll }, + { 17, 3, token_kind::integer, 123ll }, + { 21, 6, token_kind::integer, 61251ll }, + { 28, 5, token_kind::integer, -26ll } + })); + } + + SUBCASE("Invalid hex numbers") + { + processor p{ "0xg 0x-2 0x8.4 0x3e-5" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_results({ + error{ 0, 3, "invalid number: char g are invalid for radix 16" }, + error{ 4, 8, "invalid number: char - are invalid for radix 16" }, + error{ 9, 14, "invalid number: char . are invalid for radix 16" }, + error{ 15, 21, "invalid number: char - are invalid for radix 16" }, + })); + } + + SUBCASE("Valid arbitrary radix") + { + processor p{ "2r11 36rz 8R71 19rghi -4r32 16r3e 16r3e4 -32r3e4" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_results({ + token{ 0, 4, token_kind::integer, 3ll }, + token{ 5, 4, token_kind::integer, 35ll }, + token{ 10, 4, token_kind::integer, 57ll }, + token{ 15, 6, token_kind::integer, 6117ll }, + token{ 22, 5, token_kind::integer, -14ll }, + token{ 28, 5, token_kind::integer, 62ll }, + token{ 34, 6, token_kind::integer, 996ll }, + token{ 41, 7, token_kind::integer, -3524ll }, + })); + } + + SUBCASE("Invalid arbitrary radix") + { + processor p{ "2r3 35rz 8re71 19r-ghi" }; + native_vector> tokens(p.begin(), p.end()); + CHECK(tokens + == make_results({ + error{ 0, 3, "invalid number: char 3 are invalid for radix 2" }, + error{ 4, 8, "invalid number: char z are invalid for radix 35" }, + error{ 9, 14, "invalid number: char e are invalid for radix 8" }, + error{ 15, 22, "invalid number: char - are invalid for radix 19" }, + })); + } + SUBCASE("Negative single-char") { processor p{ "-1" }; @@ -494,9 +606,7 @@ namespace jank::read::lex native_vector> tokens(p.begin(), p.end()); CHECK(tokens == make_results({ - token{ 0, 4, token_kind::integer, 1234ll }, - error{ 4, "expected whitespace before next token" }, - token{ 4, 3, token_kind::symbol, "abc"sv }, + error{ 0, 7, "invalid number: char abc are invalid for radix 10" }, })); } @@ -600,7 +710,6 @@ namespace jank::read::lex error{ 3, "unexpected character: ." }, })); } - { processor p{ "0..0" }; native_vector> tokens(p.begin(), p.end()); @@ -631,10 +740,8 @@ namespace jank::read::lex native_vector> tokens(p.begin(), p.end()); CHECK(tokens == make_results({ - token{ 0, 5, token_kind::real, 12.34l }, - error{ 5, "expected whitespace before next token" }, - token{ 5, 3, token_kind::symbol, "abc"sv }, - })); + error(0, 8, "invalid number: char abc are invalid for radix 10"), + })); } SUBCASE("Not required") @@ -724,9 +831,7 @@ namespace jank::read::lex error{ 13, 16, "unexpected end of real, expecting exponent" }, error{ 16, "expected whitespace before next token" }, token{ 16, 3, token_kind::symbol, "Foo"sv }, - token{ 20, 3, token_kind::real, 300000.0l }, - error{ 23, "expected whitespace before next token" }, - token{ 23, 3, token_kind::symbol, "fOo"sv }, + error{ 20, 26, "invalid number: char fOo are invalid for radix 10" }, })); } }