-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathprims.ml
410 lines (380 loc) · 17.5 KB
/
prims.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
(*
This module contains code to generate the low-level implementation of
parts of the standard library procedures.
The module works by defining a hierarchy of templates, which call each other
to form complete routines. See the inline comments below for more information
on the templates and the individual routines.
Note that the implementation below contain no error handling or correctness-checking
of any kind. This is because we will not test your compilers on invalid input.
However, adding correctness-checking and error handling *as general templates* would be
rather simple.
*)
module type PRIMS = sig
val procs : string;;
end
module Prims : PRIMS = struct
(* This is the most basic routine template. It takes a body and label name, and
creates the standard x86-64 bit routine form.
All other templates and routine-generation functions depend on this template. *)
let make_routine label body =
label ^ ":
push rbp
mov rbp, rsp
" ^ body ^ "
pop rbp
ret";;
(* Many of the low-level stdlib procedures are predicate procedures, which perform
some kind of comparison, and then return one of the constants sob_true or sob_false.
Since this pattern repeats so often, we have a template that takes a body, and a type
of condition to test for jump, and generates an assembly snippet that evaluated the body,
and return true or false, depending on the type of condition. *)
let return_boolean jcc body =
body ^ "
" ^ jcc ^ " .true
mov rax, SOB_FALSE_ADDRESS
jmp .return
.true:
mov rax, SOB_TRUE_ADDRESS
.return:";;
(*
Many of the predicates just test some kind of equality (or, equivalently, if the
zero flag is set), so this is an auxiliary function dedicated to equality-testing predicates.
Note how we make use of currying here.
*)
let return_boolean_eq = return_boolean "je";;
(*
Almost all of the stdlib function take 1 or more arguments. Since all of the variadic procedures
are implemented in the high-level scheme library code (found in stdlib.scm), we only have to deal
with 1,2 or 3 arguments.
These helper functions inject instructions to get parameter values off the stack and into registers
to work with.
The argument register assignment follows the x86 64bit Unix ABI, because there needs to be *some*
kind of consistency, so why not just use the standard ABI.
See page 22 in https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/x86-64-psABI-1.0.pdf
*)
let make_unary label body = make_routine label ("mov rsi, PVAR(0)\n\t" ^ body);;
let make_binary label body = make_unary label ("mov rdi, PVAR(1)\n\t" ^ body);;
let make_tertiary label body = make_binary label ("mov rdx, PVAR(2)\n\t" ^ body);;
(* All of the type queries in scheme (e.g., null?, pair?, char?, etc.) are equality predicates
that are implemented by comparing the first byte pointed to by PVAR(0) to the relevant type tag.
so the only unique bits of each of these predicates are the name of the routine (i.e., the label),
and the type tag we expect to find.
The implementation of the type-queries generator is slightly more complex, since a template and a label
name aren't enough: we need to generate a routine for every (label * type_tag) pair (e.g., the routine
`is_boolean` should test for the T_BOOL type tag).
We have a list of pairs, associating each predicate label with the correct type tag, and map the templating
function over this list. Note that the query template function makes use of some of the other templating
functions defined above: `make_unary` (predicates take only one argument) and `return_boolean_eq` (since
these are equality-testing predicates).
*)
let type_queries =
let queries_to_types = [
"boolean", "T_BOOL"; "flonum", "T_FLOAT"; "rational", "T_RATIONAL"; "pair", "T_PAIR";
"null", "T_NIL"; "char", "T_CHAR"; "string", "T_STRING"; "symbol", "T_SYMBOL";
"procedure", "T_CLOSURE"
] in
let single_query name type_tag =
make_unary (name ^ "?")
(return_boolean_eq ("mov sil, byte [rsi]\n\tcmp sil, " ^ type_tag)) in
String.concat "\n\n" (List.map (fun (a, b) -> single_query a b) queries_to_types);;
(* The arithmetic operation implementation is multi-tiered:
- The low-level implementations of all operations are binary, e.g. (+ 1 2 3) and (+ 1) are not
supported in the low-level implementation.
- The low-level implementations only support same-type operations, e.g. (+ 1 2.5) is not supported
in the low-level implementation. This means each operation has two low-level implementations, one
for floating-point operands, and one for fractional operands.
- Each pair of low-level operation implementations is wrapped by a dispatcher which decides which
of the two implementations to call (by probing the types of the operands).
- The high-level implementations (see stdlib.scm) make use of a high-level dispatcher, that is in charge
of performing type conversions as necessary to satisfy the pre-conditions of the low-level implementations.
Operations on floating-point operands:
-------------------------------------
The implementations of binary floating point arithmetic operations contain almost identical code. The
differences are the name (label) of the routines, and the arithmetic instruction applied to
the two arguments. Other than that, they are all the same: binary routines which load the values
pointed at by PVAR(0) and PVAR(1) into SSE2 registers, compute the operation, create a new sob_float
on the heap with the result, and store the address of the sob_float in rax as the return value.
This allows us to easily abstract this code into a template that requires a label name and its matching
arithmetic instruction (which are paired up in the op_map).
Operations on fractional operands:
----------------------------------
The addition and multiplication operations on rational numbers are similar to each other: both load 2 arguments,
both deconstruct the arguments into numerator and denominator, both allocate a sob_rational to store the result
on the heap, and both move the address of this sob_rational into rax as the return value. The only differences
are the routine name (label), and the implementation of the arithmetic operation itself.
This allows us to easily abstract this code into a template that requires a label name and its matching
arithmetic instruction (which are paired up in the op_map).
Unlike in the case of floating point arithmetic, rational division is treated differently, and is implemented by
using the identity (a/b) / (c/d) == (a/b) * (d/c).
This is done by inverting the second arg (in PVAR(1)) and tail-calling fraction multiplication (`jmp mul`).
Comparators:
------------
While the implementation of the Comparators is slightly more complex, since they make use of `return_boolean`,
the idea is the same as the arithmetic operators.
A couple of things to note:
- `eq.flt` can collapse to a bitwise comparison (like in the case of integers in C), while `eq.rat` involves
comparing the numerator and denominator separately, due to our fraction representation using 128 bits
and not 64 bits.
- `lt.flt` does not handle NaN, +inf and -inf correctly. This allows us to use `return_boolean jl` for both the
floating-point and the fraction cases. For a fully correct implementation, `lt.flt` should make use of
`return_boolean jb` instead (see https://www.felixcloutier.com/x86/ucomisd for more information).
*)
let numeric_ops =
let numeric_op name flt_body rat_body body_wrapper =
make_binary name
(body_wrapper
("mov dl, byte [rsi]
cmp dl, T_FLOAT
jne ." ^ name ^ "_rat
" ^ flt_body ^ "
jmp .op_return
." ^ name ^ "_rat:
" ^ rat_body ^ "
.op_return:")) in
let arith_map = [
"MAKE_RATIONAL(rax, rdx, rdi)
mov PVAR(1), rax
pop rbp
jmp mul", "divsd", "div";
"imul rsi, rdi
imul rcx, rdx", "mulsd", "mul";
"imul rsi, rdx
imul rdi, rcx
add rsi, rdi
imul rcx, rdx", "addsd", "add";
] in
let arith name flt_op rat_op =
numeric_op name
("FLOAT_VAL rsi, rsi
movq xmm0, rsi
FLOAT_VAL rdi, rdi
movq xmm1, rdi
" ^ flt_op ^ " xmm0, xmm1
movq rsi, xmm0
MAKE_FLOAT(rax, rsi)")
("DENOMINATOR rcx, rsi
DENOMINATOR rdx, rdi
NUMERATOR rsi, rsi
NUMERATOR rdi, rdi
" ^ rat_op ^ "
MAKE_RATIONAL(rax, rsi, rcx)") in
let comp_map = [
(* = *)
return_boolean_eq,
"NUMERATOR rcx, rsi
NUMERATOR rdx, rdi
cmp rcx, rdx
jne .false
DENOMINATOR rcx, rsi
DENOMINATOR rdx, rdi
cmp rcx, rdx
.false:",
"FLOAT_VAL rsi, rsi
FLOAT_VAL rdi, rdi
cmp rsi, rdi", "eq";
(* < *)
return_boolean "jl",
"DENOMINATOR rcx, rsi
DENOMINATOR rdx, rdi
NUMERATOR rsi, rsi
NUMERATOR rdi, rdi
imul rsi, rdx
imul rdi, rcx
cmp rsi, rdi",
"FLOAT_VAL rsi, rsi
movq xmm0, rsi
FLOAT_VAL rdi, rdi
movq xmm1, rdi
ucomisd xmm0, xmm1", "lt";
] in
let comparator comp_wrapper name flt_body rat_body = numeric_op name flt_body rat_body comp_wrapper in
(String.concat "\n\n" (List.map (fun (a, b, c) -> arith c b a (fun x -> x)) arith_map)) ^
"\n\n" ^
(String.concat "\n\n" (List.map (fun (a, b, c, d) -> comparator a d c b) comp_map));;
(* The following set of operations contain fewer similarities, to the degree that it doesn't seem that
creating more fine-grained templates for them is beneficial. However, since they all make use of
some of the other templates, it is beneficial to organize them in a structure that enables
a uniform mapping operation to join them all into the final string.*)
let misc_ops =
let misc_parts = [
(* string ops *)
"STRING_LENGTH rsi, rsi
MAKE_RATIONAL(rax, rsi, 1)", make_unary, "string_length";
"STRING_ELEMENTS rsi, rsi
NUMERATOR rdi, rdi
add rsi, rdi
mov sil, byte [rsi]
MAKE_CHAR(rax, sil)", make_binary, "string_ref";
"STRING_ELEMENTS rsi, rsi
NUMERATOR rdi, rdi
add rsi, rdi
CHAR_VAL rax, rdx
mov byte [rsi], al
mov rax, SOB_VOID_ADDRESS", make_tertiary, "string_set";
"NUMERATOR rsi, rsi
CHAR_VAL rdi, rdi
and rdi, 255
MAKE_STRING rax, rsi, dil", make_binary, "make_string";
"SYMBOL_VAL rsi, rsi
STRING_LENGTH rcx, rsi
STRING_ELEMENTS rdi, rsi
push rcx
push rdi
mov dil, byte [rdi]
MAKE_CHAR(rax, dil)
push rax
MAKE_RATIONAL(rax, rcx, 1)
push rax
push 2
push SOB_NIL_ADDRESS
call make_string
add rsp, 4*8
STRING_ELEMENTS rsi, rax
pop rdi
pop rcx
cmp rcx, 0
je .end
.loop:
lea r8, [rdi+rcx]
lea r9, [rsi+rcx]
mov bl, byte [r8]
mov byte [r9], bl
loop .loop
.end:", make_unary, "symbol_to_string";
(* the identity predicate (i.e., address equality) *)
(return_boolean_eq "cmp rsi, rdi"), make_binary, "eq?";
(* type conversions *)
"CHAR_VAL rsi, rsi
and rsi, 255
MAKE_RATIONAL(rax, rsi, 1)", make_unary, "char_to_integer";
"NUMERATOR rsi, rsi
and rsi, 255
MAKE_CHAR(rax, sil)", make_unary, "integer_to_char";
"DENOMINATOR rdi, rsi
NUMERATOR rsi, rsi
cvtsi2sd xmm0, rsi
cvtsi2sd xmm1, rdi
divsd xmm0, xmm1
movq rsi, xmm0
MAKE_FLOAT(rax, rsi)", make_unary, "exact_to_inexact";
"NUMERATOR rsi, rsi
mov rdi, 1
MAKE_RATIONAL(rax, rsi, rdi)", make_unary, "numerator";
"DENOMINATOR rsi, rsi
mov rdi, 1
MAKE_RATIONAL(rax, rsi, rdi)", make_unary, "denominator";
(* GCD *)
"xor rdx, rdx
NUMERATOR rax, rsi
NUMERATOR rdi, rdi
.loop:
and rdi, rdi
jz .end_loop
xor rdx, rdx
div rdi
mov rax, rdi
mov rdi, rdx
jmp .loop
.end_loop:
mov rdx, rax
MAKE_RATIONAL(rax, rdx, 1)", make_binary, "gcd";
"CAR rax, rsi", make_unary, "car";
"CDR rax, rsi", make_unary, "cdr";
"mov qword [rsi+TYPE_SIZE], rdi\n mov rax, SOB_VOID_ADDRESS", make_binary, "setcar";
"mov qword [rsi+TYPE_SIZE+WORD_SIZE], rdi\n mov rax, SOB_VOID_ADDRESS", make_binary, "setcdr";
"MAKE_PAIR(rax, rsi, rdi)", make_binary, "cons";
] in
String.concat "\n\n" (List.map (fun (a, b, c) -> (b c a)) misc_parts);;
(* in the frame that came in here there is:
arg n = list
arg n-1
...
arg2
arg1 = proc
n
env
ret add
old rbp*)
let apply_op = "apply:\n" ^
"push rbp\n" ^
"mov rbp, rsp\n" ^
"mov r8, 0 ; counter of arguments in list\n" ^
"mov r9, qword [rbp + 8*3] ; r9 have the number of args\n" ^
"dec r9 ; we want the position of the last arg (the list)\n" ^
"mov rbx, PVAR(r9)\n" ^
"; now we going to push the list args to the stack\n" ^
"apply_push_list:\n" ^
"cmp rbx, SOB_NIL_ADDRESS\n" ^
"je apply_end_of_push_list\n" ^
"CAR rdx, rbx ; rdx = car(lst)\n" ^
"CDR rbx, rbx ; rbx = cdr(lst)\n" ^
"push rdx\n" ^
"inc r8\n" ^
"jmp apply_push_list\n" ^
"apply_end_of_push_list:\n" ^
"; we need to swap so the last arg will be the uppest\n" ^
"mov r9, 0\n" ^
"mov r10, r8 ; the amount of args in list\n" ^
"dec r10 ; starting from n-1\n" ^
"; swap stack[r9] with stack[r10] using r11 and r12 (cause cannot mov mem to mem)\n" ^
"apply_swap_list_args:\n" ^
"cmp r9, r10\n" ^
"jge apply_end_of_swap_list_args\n" ^
"mov r11, qword [rsp + 8*r9]\n" ^
"mov r12, qword [rsp + 8*r10]\n" ^
"mov qword [rsp + 8*r9], r12\n" ^
"mov qword [rsp + 8*r10], r11\n" ^
"inc r9\n" ^
"dec r10\n" ^
"jmp apply_swap_list_args\n" ^
"apply_end_of_swap_list_args:\n" ^
"; now we going to push all of the rest args, from the end to the beginning\n" ^
"mov r9, qword [rbp + 8*3] ; r9 have the number of args, rsp still point the stack when calls\n" ^
"sub r9, 2 ; 1 for list and 1 for proc\n" ^
"mov r10, r9 ; save the number of args\n" ^
"apply_push_args:\n" ^
"cmp r9, 0\n" ^
"je apply_end_of_push_args\n" ^
"push PVAR(r9) ; starting from the end, dont want to push the proc in PVAR(0)\n" ^
"dec r9\n" ^
"jmp apply_push_args\n" ^
"apply_end_of_push_args:\n" ^
"; we should push the rest of frame (n, env, return address and old rbp)\n" ^
"; r10 have the number of arguments not in list, and r8 have the number in list\n" ^
"mov r9, r10 ; r9 will hold the new n\n" ^
"add r9, r8\n" ^
"mov rbx, r9 ; save the new n\n" ^
"push r9 ; push new n\n" ^
"; to push the env we need to take it out from closure\n" ^
"mov rax, qword [rbp + 8*4] ; rax = closure of proc\n" ^
"CLOSURE_ENV r9, rax\n" ^
"push r9 ; push new env\n" ^
"push qword [rbp + 8*1] ; push return address\n" ^
"push qword [rbp] ; push old rsp\n" ^
"mov rdx, qword [rbp] ; save old rbp\n" ^
"add r10, 6 ; r10 holds the number of args in old frame, adding 2 for proc and list and 4 for the rest frame\n" ^
"; r10 = old frame size\n" ^
"mov r9, rbx ; r9 is the new n\n" ^
"add r9, 4 ; r9 = size of new frame\n" ^
"mov r11, r10 ; save r10\n" ^
"; shift the stack upward\n" ^
(* "SHIFT_FRAME r10\n" ^ *)
"mov rcx, r9 ; rcx is the counter\n" ^
"apply_shift_stack:\n" ^
"; doing the shift using r8\n" ^
"; rcx is the pointer to the new frame, and r10 is the pointer to new position\n" ^
"dec r10\n" ^
"dec rcx\n" ^
"mov r8, qword [rsp + 8*rcx]\n" ^
"mov qword [rbp + 8*r10], r8\n" ^
"cmp rcx, 0 ; r9 times times\n" ^
"jne apply_shift_stack\n" ^
"shl r11, 3 ; r11 is the real size of shift\n" ^
"add rsp, r11 ; fix rsp\n" ^
"pop rbp ; restore old rbp\n" ^
"jmp qword [rax + 1 + 8]; jmp to code in tail call, 1 for type and 8 for env"
(* This is the interface of the module. It constructs a large x86 64-bit string using the routines
defined above. The main compiler pipline code (in compiler.ml) calls into this module to get the
string of primitive procedures. *)
let procs = String.concat "\n\n" [type_queries ; numeric_ops; misc_ops; apply_op];;
end;;