Skip to content

Commit

Permalink
[cpp] integer and decimal literals
Browse files Browse the repository at this point in the history
  • Loading branch information
harrand committed May 4, 2024
1 parent 0afd52f commit cf635c2
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 19 deletions.
108 changes: 89 additions & 19 deletions cpp/src/lex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ namespace lex
unsigned int col = 1;
std::size_t cursor = 0;
std::size_t current_word_begin = npos;
std::size_t current_integer_literal_begin = npos;
std::size_t current_decimal_literal_begin = npos;

void advance(std::size_t amt = 1)
{
Expand All @@ -25,15 +27,50 @@ namespace lex
return this->current_word_begin != npos;
}

std::string pop_word()
bool in_integer_literal() const
{
diag::assert_that(this->in_word(), error_code::ice, "lexer has performed a bad call to pop_word");
std::string ret{source.substr(this->current_word_begin, cursor - this->current_word_begin)};
this->current_word_begin = npos;
return this->current_integer_literal_begin != npos;
}

// Reports whether a decimal literal is currently being accumulated.
// `npos` is the sentinel stored in `current_decimal_literal_begin` when no
// decimal literal is in progress.
bool in_decimal_literal() const
{
    const bool literal_started = (npos != this->current_decimal_literal_begin);
    return literal_started;
}

// Reports whether the lexer is part-way through any multi-character lexeme:
// a word, an integer literal or a decimal literal (checked in that order).
bool in_anything() const
{
    if(this->in_word())
    {
        return true;
    }
    if(this->in_integer_literal())
    {
        return true;
    }
    return this->in_decimal_literal();
}

// Finishes the lexeme currently being accumulated and returns its text.
//
// The lexeme spans from the recorded begin offset up to (but not including)
// `cursor`. The matching begin marker is reset to `npos`, and `t` is set to the
// kind of lexeme that was popped: integer literal, decimal literal, or
// identifier (checked in that order).
//
// @param t  Out-parameter receiving the token type of the popped lexeme. Set to
//           `type::_undefined` if nothing was in progress.
// @return   The lexeme text, or an empty string if nothing was in progress.
//
// Calling this while no lexeme is in progress is a lexer bug, so it is reported
// as an internal compiler error rather than a syntax error in the user's source.
std::string pop_word(type& t)
{
    std::string ret;
    if(this->in_integer_literal())
    {
        ret = std::string{source.substr(this->current_integer_literal_begin, cursor - this->current_integer_literal_begin)};
        this->current_integer_literal_begin = npos;
        t = type::integer_literal;
    }
    else if(this->in_decimal_literal())
    {
        ret = std::string{source.substr(this->current_decimal_literal_begin, cursor - this->current_decimal_literal_begin)};
        this->current_decimal_literal_begin = npos;
        t = type::decimal_literal;
    }
    else if(this->in_word())
    {
        ret = std::string{source.substr(this->current_word_begin, cursor - this->current_word_begin)};
        this->current_word_begin = npos;
        t = type::identifier;
    }
    else
    {
        // Never leave the out-parameter unassigned: the caller reads it
        // straight after this call.
        t = type::_undefined;
        // Take a bounded view of the remaining source. Building a string_view
        // from `data() + cursor` would scan for a NUL terminator and could read
        // past the end of the source; clamping also keeps substr from
        // throwing if the cursor has run past the end.
        this->error_generic(
            this->source.substr(std::min(this->cursor, this->source.size())),
            "lexer has performed a bad call to `pop_word` (not inside a word or numeric literal) near \"{}\" within: \"...{}...\"",
            error_code::ice);
        // NOTE(review): whether error_generic returns is not visible here; if
        // it does, the caller receives an empty lexeme of type `_undefined`.
    }
    return ret;
}

void error_generic(std::string_view dodgy_part, std::string msg)
void error_generic(std::string_view dodgy_part, std::string msg, error_code errcode = error_code::syntax)
{
const srcloc curloc
{
Expand All @@ -45,13 +82,18 @@ namespace lex
std::size_t snippet_begin = this->cursor > snippet_width ? (this->cursor - snippet_width) : 0;
std::size_t snippet_end = (this->cursor + snippet_width) >= this->source.size() ? this->source.size() : (this->cursor + snippet_width);
std::string_view snippet = this->source.substr(snippet_begin, snippet_end - snippet_begin);
diag::error(error_code::syntax, "at {}, {}", curloc.to_string(), std::vformat(msg, std::make_format_args(dodgy_part.substr(0, std::min(dodgy_part.size(), static_cast<std::size_t>(2u))), snippet)));
diag::error(errcode, "at {}, {}", curloc.to_string(), std::vformat(msg, std::make_format_args(dodgy_part.substr(0, std::min(dodgy_part.size(), static_cast<std::size_t>(2u))), snippet)));
}

// Reports a syntax error for source text that does not form any known token.
// The first `{}` in the message receives the leading characters of
// `dodgy_part`; the second receives a snippet of source around the cursor.
void unrecognised_tokens(std::string_view dodgy_part)
{
    const char* const message_format = "unrecognised token(s) \"{}\" within: \"...{}...\"";
    this->error_generic(dodgy_part, message_format);
}

// Reports a syntax error for a non-digit character found while a numeric
// literal is being lexed. The first `{}` in the message receives the leading
// characters of `dodgy_part`; the second receives a snippet of source around
// the cursor.
void invalid_numeric_literal(std::string_view dodgy_part)
{
    const char* const message_format = "unexpected non-digit character(s) within numeric literal: \"{}\" within: \"...{}...\"";
    this->error_generic(dodgy_part, message_format);
}
};

std::optional<token> tokenise_once(tokenise_state& state, std::string_view data);
Expand Down Expand Up @@ -83,12 +125,13 @@ namespace lex
.line = state.line,
.column = state.col
};
if(state.in_word() && breaks_word(data))
if(state.in_anything() && breaks_word(data))
{
std::string word = state.pop_word();
type t;
std::string word = state.pop_word(t);
tokens.push_back
({
.t = type::identifier,
.t = t,
.lexeme = word,
.meta_srcloc = srcloc
{
Expand Down Expand Up @@ -122,17 +165,19 @@ namespace lex
}
if(data.starts_with(";"))
{
return token
{
.t = type::semicolon
};
return token{.t = type::semicolon};
}
else if(data.starts_with(":"))
{
return token
{
.t = type::colon
};
return token{.t = type::colon};
}
else if(data.starts_with("..."))
{
return token{.t = type::ellipsis};
}
else if(data.starts_with("."))
{
return token{.t = type::dot};
}
else if(data.starts_with("->"))
{
Expand Down Expand Up @@ -213,6 +258,14 @@ namespace lex
{
return token{.t = type::operator_asterisk};
}
else if(data.starts_with("?"))
{
return token{.t = type::question_mark};
}
else if(data.starts_with("$"))
{
return token{.t = type::dollar_sign};
}
else if(
// substrings that aren't syntax errors but don't form any tokens.
data.starts_with(" ") ||
Expand All @@ -222,9 +275,26 @@ namespace lex
{}
else if(!breaks_word(data))
{
if(!state.in_word())
if(!state.in_anything())
{
// we're starting something new.
// if the new char is a digit, it's a numeric literal of some kind.
if(std::isdigit(data.front()))
{
state.current_integer_literal_begin = state.cursor;
}
else // probably the start of a word (identifier/keyword)
{
state.current_word_begin = state.cursor;
}
}
else
{
state.current_word_begin = state.cursor;
if((state.in_integer_literal() || state.in_decimal_literal()) && !std::isdigit(data.front()))
{
// you typed something that's not a digit
state.invalid_numeric_literal(data);
}
}
}
else
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/lex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,15 @@ namespace lex
identifier,
semicolon,
colon,
ellipsis,
dot,
arrow_forward,
arrow_backward,
line_comment,
integer_literal,
decimal_literal,
string_literal,
char_literal,
open_paren,
close_paren,
open_brace,
Expand All @@ -29,6 +35,8 @@ namespace lex
operator_minus,
operator_slash,
operator_asterisk,
question_mark,
dollar_sign,
_count,
_undefined,
};
Expand Down
3 changes: 3 additions & 0 deletions samples/scratchpad.psy
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ morb :: (xd : i64) -> u0
{
xd2 : i64 = -(xd + xd);
}
123;
45678;
123;

hello my name
is
Expand Down

0 comments on commit cf635c2

Please sign in to comment.