From 3fb3931142da8f4513f6b2d6be153827b4fb6f15 Mon Sep 17 00:00:00 2001 From: Amaan Qureshi Date: Thu, 9 May 2024 22:04:55 -0400 Subject: [PATCH] refactor(scanner): use new array header for stack --- .github/workflows/ci.yml | 3 - src/scanner.c | 184 +++++++++++++++++++++++++++++---------- src/stack.h | 90 ------------------- test/test-stack.c | 46 ---------- 4 files changed, 137 insertions(+), 186 deletions(-) delete mode 100644 src/stack.h delete mode 100644 test/test-stack.c diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2566d2c..d2555ad 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,9 +72,6 @@ jobs: with: node-version: 20 - - name: Test C stack code - run: gcc test/test-stack.c -o a.out && ./a.out - - name: Generate parser from scratch and test it if: ${{ runner.os == 'Linux' || needs.changedfiles.outputs.c }} shell: bash diff --git a/src/scanner.c b/src/scanner.c index fac9bd0..48e6ff9 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -1,9 +1,17 @@ -#include "stack.h" +#include "tree_sitter/alloc.h" +#include "tree_sitter/array.h" #include "tree_sitter/parser.h" -#include -#include + #include +// #define DEBUG + +#ifdef DEBUG +#define LOG(...) fprintf(stderr, __VA_ARGS__) +#else +#define LOG(...) +#endif + enum TokenType { AUTOMATIC_SEMICOLON, INDENT, @@ -22,26 +30,82 @@ enum TokenType { WITH, }; +typedef struct { + Array(int16_t) indents; + int16_t last_indentation_size; + int16_t last_newline_count; + int16_t last_column; +} Scanner; + void *tree_sitter_scala_external_scanner_create() { - return createStack(); + Scanner *scanner = ts_calloc(1, sizeof(Scanner)); + array_init(&scanner->indents); + scanner->last_indentation_size = -1; + scanner->last_column = -1; + return scanner; } void tree_sitter_scala_external_scanner_destroy(void *payload) { - free(payload); + Scanner *scanner = payload; + array_delete(&scanner->indents); + ts_free(scanner); } unsigned tree_sitter_scala_external_scanner_serialize(void *payload, char *buffer) { - return serialiseStack(payload, buffer); + Scanner *scanner = (Scanner*)payload; + + if ((scanner->indents.size + 3) * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + return 0; + } + + size_t size = 0; + *(int16_t *)&buffer[size] = scanner->last_indentation_size; + size += sizeof(int16_t); + *(int16_t *)&buffer[size] = scanner->last_newline_count; + size += sizeof(int16_t); + *(int16_t *)&buffer[size] = scanner->last_column; + size += sizeof(int16_t); + + for (unsigned i = 0; i < scanner->indents.size; i++) { + *(int16_t *)&buffer[size] = scanner->indents.contents[i]; + size += sizeof(int16_t); + } + + return size; } void tree_sitter_scala_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { - deserialiseStack(payload, buffer, length); + Scanner *scanner = (Scanner*)payload; + array_clear(&scanner->indents); + scanner->last_indentation_size = -1; + scanner->last_column = -1; + scanner->last_newline_count = 0; + + if (length == 0) { + return; + } + + size_t size = 0; + + scanner->last_indentation_size = *(int16_t *)&buffer[size]; + size += sizeof(int16_t); + scanner->last_newline_count = *(int16_t *)&buffer[size]; + size += sizeof(int16_t); + scanner->last_column = *(int16_t *)&buffer[size]; + size += sizeof(int16_t); + + while (size < length) { + array_push(&scanner->indents, *(int16_t *)&buffer[size]); + size += sizeof(int16_t); + } + + assert(size == length); } -static void advance(TSLexer *lexer) { lexer->advance(lexer, false); } +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } -static void skip(TSLexer *lexer) { lexer->advance(lexer, true); } +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } static bool scan_string_content(TSLexer *lexer, bool is_multiline, bool has_interpolation) { unsigned closing_quote_count = 0; @@ -102,7 +166,7 @@ static bool detect_comment_start(TSLexer *lexer) { } static bool scan_word(TSLexer *lexer, const char* const word) { - for (int i = 0; word[i] != '\0'; i++) { + for (uint8_t i = 0; word[i] != '\0'; i++) { if (lexer->lookahead != word[i]) { return false; } @@ -111,12 +175,20 @@ static bool scan_word(TSLexer *lexer, const char* const word) { return !iswalnum(lexer->lookahead); } +static inline void debug_indents(Scanner *scanner) { + LOG(" indents(%d): ", scanner->indents.size); + for (unsigned i = 0; i < scanner->indents.size; i++) { + LOG("%d ", scanner->indents.contents[i]); + } + LOG("\n"); +} + bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { - ScannerStack *stack = (ScannerStack *)payload; - int prev = peekStack(stack); - int newline_count = 0; - int indentation_size = 0; + Scanner *scanner = (Scanner *)payload; + int16_t prev = scanner->indents.size > 0 ? *array_back(&scanner->indents) : -1; + int16_t newline_count = 0; + int16_t indentation_size = 0; while (iswspace(lexer->lookahead)) { if (lexer->lookahead == '\n') { @@ -130,35 +202,47 @@ bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer, } // Before advancing the lexer, check if we can double outdent - if (valid_symbols[OUTDENT] && - (lexer->lookahead == 0 || + if ( + valid_symbols[OUTDENT] && ( - (prev != -1) && - lexer->lookahead == ')' || - lexer->lookahead == ']' || - lexer->lookahead == '}' - ) || ( - stack->last_indentation_size != -1 && - prev != -1 && - stack->last_indentation_size < prev))) { - popStack(stack); + lexer->lookahead == 0 || + ( + prev != -1 && + ( + lexer->lookahead == ')' || + lexer->lookahead == ']' || + lexer->lookahead == '}' + ) + ) || + ( + scanner->last_indentation_size != -1 && + prev != -1 && + scanner->last_indentation_size < prev + ) + ) + ) { + if (scanner->indents.size > 0) { + array_pop(&scanner->indents); + } LOG(" pop\n"); LOG(" OUTDENT\n"); lexer->result_symbol = OUTDENT; return true; } - stack->last_indentation_size = -1; - - printStack(stack, " before"); + scanner->last_indentation_size = -1; - if (valid_symbols[INDENT] && + if ( + valid_symbols[INDENT] && newline_count > 0 && - (isEmptyStack(stack) || - indentation_size > peekStack(stack))) { + ( + scanner->indents.size == 0 || + indentation_size > *array_back(&scanner->indents) + ) + ) { if (detect_comment_start(lexer)) { return false; } - pushStack(stack, indentation_size); + array_push(&scanner->indents, indentation_size); lexer->result_symbol = INDENT; LOG(" INDENT\n"); return true; @@ -167,11 +251,17 @@ bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer, // This saves the indentation_size and newline_count so it can be used // in subsequent calls for multiple outdent or autosemicolon. if (valid_symbols[OUTDENT] && - (lexer->lookahead == 0 || ( + (lexer->lookahead == 0 || + ( newline_count > 0 && prev != -1 && - indentation_size < prev))) { - popStack(stack); + indentation_size < prev + ) + ) + ) { + if (scanner->indents.size > 0) { + array_pop(&scanner->indents); + } LOG(" pop\n"); LOG(" OUTDENT\n"); lexer->result_symbol = OUTDENT; @@ -179,26 +269,26 @@ bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer, if (detect_comment_start(lexer)) { return false; } - stack->last_indentation_size = indentation_size; - stack->last_newline_count = newline_count; + scanner->last_indentation_size = indentation_size; + scanner->last_newline_count = newline_count; if (lexer->eof(lexer)) { - stack->last_column = -1; + scanner->last_column = -1; } else { - stack->last_column = (int)lexer->get_column(lexer); + scanner->last_column = (int16_t)lexer->get_column(lexer); } return true; } // Recover newline_count from the outdent reset bool is_eof = lexer->eof(lexer); - if (stack->last_newline_count > 0 && - ((is_eof && stack->last_column == -1) || - (!is_eof && lexer->get_column(lexer) == stack->last_column))) { - newline_count += stack->last_newline_count; + if ( + scanner->last_newline_count > 0 && + (is_eof && scanner->last_column == -1) || + (!is_eof && lexer->get_column(lexer) == (uint32_t)scanner->last_column) + ) { + newline_count += scanner->last_newline_count; } - stack->last_newline_count = 0; - - printStack(stack, " after"); + scanner->last_newline_count = 0; if (valid_symbols[AUTOMATIC_SEMICOLON] && newline_count > 0) { // AUTOMATIC_SEMICOLON should not be issued in the middle of expressions @@ -240,7 +330,7 @@ bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer, } skip(lexer); } - // If some code is present at the same line after comment end, + // If some code is present at the same line after comment end, // we should still produce AUTOMATIC_SEMICOLON, e.g. in // val a = 1 // /* comment */ val b = 2 diff --git a/src/stack.h b/src/stack.h deleted file mode 100644 index edd4c36..0000000 --- a/src/stack.h +++ /dev/null @@ -1,90 +0,0 @@ -#include -#include -#include -#include - -#ifdef DEBUG -#define LOG(...) fprintf(stderr, __VA_ARGS__) -#else -#define LOG(...) -#endif - -// Total payload size is 1024 bytes max -#define STACK_SIZE 100 - -typedef struct ScannerStack { - int stack[STACK_SIZE]; - int top; - int last_indentation_size; - int last_newline_count; - int last_column; -} ScannerStack; - -static ScannerStack* createStack() { - ScannerStack* ptr = (ScannerStack*) malloc(sizeof(ScannerStack)); - - ptr -> top = 0; - ptr -> last_indentation_size = -1; - ptr -> last_newline_count = 0; - ptr -> last_column = -1; - memset(ptr -> stack, STACK_SIZE, (0)); - - return ptr; -} - -static bool isEmptyStack(ScannerStack *stack) { return stack->top == 0; } - -static int peekStack(ScannerStack *stack) { - return isEmptyStack(stack) ? -1 : stack->stack[stack->top - 1]; -} - -static void pushStack(ScannerStack *stack, unsigned int value) { - stack->top++; - stack->stack[stack->top - 1] = (int)value; -} - -static int popStack(ScannerStack *stack) { - if (isEmptyStack(stack)) { - return -1; - } - int result = peekStack(stack); - stack->top--; - - return result; -} - -static void printStack(ScannerStack *stack, char *msg) { - LOG("%s Stack[top = %d; ", msg, stack->top); - for (int i = 0; i < stack->top; i++) { - LOG("%d | ", stack->stack[i]); - } - LOG("]\n"); -} - -static unsigned serialiseStack(ScannerStack *stack, char *buf) { - int elements = isEmptyStack(stack) ? 0 : stack->top; - if (elements < 0) { - elements = 0; - } - unsigned result_length = (elements + 3) * sizeof(int); - int *placement = (int *)buf; - memcpy(placement, stack->stack, elements * sizeof(int)); - placement[elements] = stack->last_indentation_size; - placement[elements + 1] = stack->last_newline_count; - placement[elements + 2] = stack->last_column; - - return result_length; -} - -static void deserialiseStack(ScannerStack* stack, const char* buf, unsigned length) { - if (length != 0) { - int *intBuf = (int *)buf; - - unsigned elements = length / sizeof(int) - 3; - stack->top = (int)elements; - memcpy(stack->stack, intBuf, elements * sizeof(int)); - stack->last_indentation_size = intBuf[elements]; - stack->last_newline_count = intBuf[elements + 1]; - stack->last_column = intBuf[elements + 2]; - } -} diff --git a/test/test-stack.c b/test/test-stack.c deleted file mode 100644 index 60c1e41..0000000 --- a/test/test-stack.c +++ /dev/null @@ -1,46 +0,0 @@ -#define DEBUG -#include "../src/stack.h" -#include -#include -#include - -int main() { - ScannerStack *stack = createStack(); - - printStack(stack, "hello"); - - assert(isEmptyStack(stack)); - - pushStack(stack, 27); - assert(!isEmptyStack(stack)); - assert(peekStack(stack) == 27); - - pushStack(stack, 42); - assert(!isEmptyStack(stack)); - assert(peekStack(stack) == 42); - - assert(popStack(stack) == 42); - assert(peekStack(stack) == 27); - - assert(popStack(stack) == 27); - assert(peekStack(stack) == -1); - assert(isEmptyStack(stack)); - - char *buf = malloc(1024); - - for (int i = 0; i < 100; i++) { - pushStack(stack, i); - } - - assert(serialiseStack(stack, buf) == sizeof(int) * 103); - - ScannerStack *newStack = createStack(); - - deserialiseStack(newStack, buf, sizeof(int) * 103); - assert(newStack -> top == 100); - assert(popStack(newStack) == 99); - - printStack(stack, "hello"); - printStack(newStack, "hello"); - return 0; -}