-
Notifications
You must be signed in to change notification settings - Fork 7
/
tokenize.rkt
72 lines (64 loc) · 2.74 KB
/
tokenize.rkt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
;; The tokenizer pass
#lang racket/base
(provide tokenize
(struct-out token))
(require racket/match
racket/list
racket/string
syntax-color/module-lexer)
(struct token (srcloc text type) #:transparent)
;; tokenize :: string? natural-number/c any/c -> (listof token?)
(define (tokenize program-source max-blank-lines source)
(define max-newlines (add1 max-blank-lines))
(define p (open-input-string program-source source))
(port-count-lines! p)
(let loop ([mode #f])
(define start-srcloc (call-with-values (λ () (port-next-location p)) list))
(match-define-values (text type paren-type start-pos end-pos _ new-mode)
#;(module-lexer* p 0 mode)
(module-lexer p 0 mode))
(cond
[(eof-object? text) '()]
[else
(define srcloc
(list (first start-srcloc) (second start-srcloc) (third start-srcloc) (- end-pos start-pos)))
(define current
(cond
[(eq? type 'parenthesis) (token srcloc text `(parenthesis ,paren-type))]
[(eq? type 'white-space)
(define num-newlines (sub1 (length (string-split text "\n" #:trim? #f))))
(token srcloc
""
`(white-space ,(cond
[(> num-newlines max-newlines) max-newlines]
[else num-newlines])))]
[(eq? type 'sexp-comment)
;; we need to re-read because sexp-comment's text is always #;
;; but when it appears before #lang, we want to read its content
;; and treat it as a block comment.
(define re-read (substring program-source (sub1 start-pos) (sub1 end-pos)))
(cond
[(equal? text re-read) (token srcloc text 'sexp-comment)]
;; this is sexp comment before #lang, treat it as a block comment
[else (token srcloc re-read 'block-comment)])]
;; non-comment
[(not (eq? type 'comment)) (token srcloc text type)]
;; non-empty regular line comment
[(non-empty-string? text)
;; we need to re-read due to #31
(define re-read (substring program-source (sub1 start-pos) (sub1 end-pos)))
(token srcloc re-read 'line-comment)]
;; empty regular line comment
[(= end-pos (add1 start-pos)) (token srcloc ";" 'line-comment)]
;; block comment
[else
;; we need to re-read because block comment's content is always empty
(token srcloc
(substring program-source (sub1 start-pos) (sub1 end-pos))
'block-comment)]))
(cons current (loop new-mode))])))
(module+ main
(tokenize "#lang racket/base
a
#;#;(abc) def
qwe" 1 #f))