-
Notifications
You must be signed in to change notification settings - Fork 6
/
mmix-pipe.w
6840 lines (6093 loc) · 263 KB
/
mmix-pipe.w
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% This file is part of the MMIXware package (c) Donald E Knuth 1999
@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES!
\def\title{MMIX-PIPE}
\def\MMIX{\.{MMIX}}
\def\NNIX{\hbox{\mc NNIX}}
\def\Hex#1{\hbox{$^{\scriptscriptstyle\#}$\tt#1}} % experimental hex constant
@s and normal @q unreserve a C++ keyword @>
@s or normal @q unreserve a C++ keyword @>
@s bool normal @q unreserve a C++ keyword @>
@s xor normal @q unreserve a C++ keyword @>
@* Introduction. This program is the heart of the meta-simulator for the
ultra-configurable \MMIX\ pipeline: It defines the |MMIX_run| routine, which
does most of the
work. Another routine, |MMIX_init|, is also defined here, and so is a header
file called \.{mmix-pipe.h}. The header file is used by the main routine and
by other routines like |MMIX_config|, which are compiled separately.
Readers of this program should be familiar with the explanation of \MMIX\
architecture as presented in the main program module for {\mc MMMIX}.
A lot of subtle things can happen when instructions are executed in parallel.
Therefore this simulator ranks among the most interesting and instructive
programs in the author's experience. The author has tried his best to make
everything correct \dots\ but the chances for error are great. Anyone who
discovers a bug is therefore urged to report it as soon as possible;
please see \.{http:/\kern-.1em/mmix.cs.hm.edu/bugs/} for instructions.
It sort of boggles the mind when one realizes that the present program might
someday be translated by a \CEE/~compiler for \MMIX\ and used to simulate
{\it itself}.
@ This high-performance prototype of \MMIX\ achieves its efficiency by
means of ``pipelining,'' a technique of overlapping that is explained
for the related \.{DLX} computer in Chapter~3 of Hennessy \char`\&\ Patterson's
@^Hennessy, John LeRoy@>
@^Patterson, David Andrew@>
book {\sl Computer Architecture\/} (second edition). Other techniques
such as ``dynamic scheduling'' and ``multiple issue,'' explained in
Chapter~4 of that book, are used too.
One good way to visualize the procedure is to imagine that somebody has
organized a high-tech car repair shop according to similar principles.
There are eight independent functional units, which we can think of as
eight groups of auto mechanics, each specializing in a particular task;
each group has its own workspace with room to deal with one car at a time.
Group~F (the ``fetch'' group) is in charge of rounding up customers and
getting them to enter the assembly-line garage in an orderly fashion.
Group~D (the ``decode and dispatch'' group) does the initial vehicle
inspection and
writes up an order that explains what kind of servicing is required.
The vehicles go next to one of the four ``execution'' groups:
Group~X handles routine maintenance, while groups XF, XM, and XD are
specialists in more complex tasks that tend to take longer. (The XF
people are good at floating the points, while the XM and XD groups are
experts in multilink suspensions and differentials.) When the relevant X~group
has finished its work, cars drive to M~station, where they send or receive
messages and possibly pay money to members of the ``memory'' group. Finally
all necessary parts are installed by members of group~W, the ``write''
group, and the car leaves the shop. Everything is tightly organized so
that in most cases the cars move in synchronized fashion from station
to station, at regular 100-nanocentury intervals. % about 5.3 minutes
In a similar way, most \MMIX\ instructions can be handled in a five-stage
pipeline, F--D--X--M--W, with X replaced by XF for floating-point
addition or conversion, or by XM for multiplication, or by XD for
division or square root. Each stage ideally takes one clock cycle,
although XF, XM, and (especially) XD are slower. If the instructions enter
in a suitable pattern, we might see one instruction being fetched,
another being decoded, and up to four being executed, while another is accessing
memory, and yet another is finishing up by writing new information into
registers; all this is going on simultaneously during one clock cycle. Pipelining
with eight separate stages might therefore make the machine run
up to 8 times as fast as it could if each instruction were being dealt with
individually and without overlap. (Well, perfect speedup turns out to
be impossible, because of the shared M and~W stages; the theory of
knapsack programming, to be discussed in Section~7.7 of {\sl The Art
of Computer Programming}, tells us that the maximal achievable speedup is
at most $8-1/p-1/q-1/r$ when XF, XM, and~XD have delays bounded by $p$,
$q$, and~$r$ cycles. But we can achieve a factor of more than~7
if we are very lucky.)
Consider, for example, the \.{ADD} instruction. This instruction enters
the computer's processing unit in F stage, taking only one clock cycle if
it is in the cache of instructions recently seen. Then the D~stage
recognizes the command as an \.{ADD} and acquires the current values
of \$Y and \$Z; meanwhile, of course, another instruction is being fetched
by~F.
On the next clock cycle, the X stage adds the values together.
This prepares the way for the M stage to watch for overflow and to
get ready for any exceptional action that might be needed with respect
to the settings of special register~rA\null.
Finally, on the fifth clock cycle, the sum is either written into~\$X
or the trip handler for integer overflow is invoked.
Although this process has taken five clock
cycles (that is, $5\upsilon$),
the net increase in running time has been only~$1\upsilon$.
Of course congestion can occur, inside a computer as in a repair shop.
For example, auto parts might not be readily available; or a car might
have to sit in D station while waiting to move to XM, thereby blocking
somebody else from moving from F to~D. Sometimes there won't
necessarily be a steady stream of customers. In such cases the
employees in some parts of the shop will occasionally be idle. But we
assume that they always do their jobs as fast as possible, given the
sequence of customers that they encounter. With a clever person
setting up appointments---translation: with a clever
programmer and/or compiler arranging \MMIX\ instructions---the
organization can often be expected to run at nearly peak capacity.
In fact, this program is designed for experiments with many kinds of
pipelines, potentially using additional functional units (such as
several independent X~groups), and potentially fetching, dispatching, and
executing several nonconflicting instructions simultaneously.
Such complications
make this program more difficult than a simple pipeline simulator
would be, but they also make it a lot more instructive because we
can get a better understanding of the issues involved if we are
required to treat them in greater generality.
@ Here's the overall structure of the present program module.
@c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "abstime.h"
@h@#
@<Header definitions@>@;
@<Type definitions@>@;
@<Global variables@>@;
@<External variables@>@;
@<Internal prototypes@>@;
@<External prototypes@>@;
@<Subroutines@>@;
@<External routines@>@;
@ The identifier \&{Extern} is used in {\mc MMIX-PIPE} to
declare variables that are accessed in other modules. Actually
all appearances of `\&{Extern}' are defined to be blank here, but
`\&{Extern}' will become `\&{extern}' in the header file.
@d Extern /* blank for us, \&{extern} for them */
@f Extern extern
@<External variables@>=
Extern int verbose; /* controls the level of diagnostic output */
@ The header file repeats the basic definitions and declarations.
@(mmix-pipe.h@>=
#define Extern extern
@<Header definitions@>@;
@<Type definitions@>@;
@<External variables@>@;
@<External prototypes@>@;
@ Subroutines of this program are declared first with a prototype,
as in {\mc ANSI C}, then with an old-style \CEE/ function definition.
The following preprocessor commands make this work correctly with both
new-style and old-style compilers.
@^prototypes for functions@>
@<Header def...@>=
#ifdef __STDC__
#define ARGS(list) list
#else
#define ARGS(list) ()
#endif
@ Some of the names that are natural for this program are in
conflict with library names on at least
one of the host computers in the author's tests. So we
bypass the library names here.
@<Header def...@>=
#define random my_random
#define fsqrt my_fsqrt
#define div my_div
@ The amount of verbosity depends on the following bit codes.
@<Header def...@>=
#define issue_bit (1<<0)
/* show control blocks when issued, deissued, committed */
#define pipe_bit (1<<1)
/* show the pipeline and locks on every cycle */
#define coroutine_bit (1<<2)
/* show the coroutines when started on every cycle */
#define schedule_bit (1<<3)
/* show the coroutines when scheduled */
#define uninit_mem_bit (1<<4)
/* complain when reading from an uninitialized chunk of memory */
#define interactive_read_bit (1<<5)
/* prompt user when reading from I/O location */
#define show_spec_bit (1<<6)
/* display special read/write transactions as they happen */
#define show_pred_bit (1<<7)
/* display branch prediction details */
#define show_wholecache_bit (1<<8)
/* display cache blocks even when their key tag is invalid */
@ The |MMIX_init()| routine should be called exactly once, after
|MMIX_config()| has done its work but before the simulator starts to execute
any programs. Then |MMIX_run()| can be called as often as the user likes.
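The |MMIX_silent()| routine is a noninteractive variant of |MMIX_run()|:
It takes no cycle count and no breakpoint, and it prints nothing itself;
it simply performs machine cycles until the simulated program halts
by executing a \.{TRAP} \.{0,Halt,0} instruction, and then it returns the
low tetrabyte of the value in |g[255]|.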
@s octa int
@<External proto...@>=
Extern void MMIX_init @,@,@[ARGS((void))@];
Extern void MMIX_run @,@,@[ARGS((int cycs, octa breakpoint))@];
Extern int MMIX_silent @,@,@[ARGS((void))@];
@ @<External routines@>=
void MMIX_init() /* one-time setup; the body is the combined initialization code */
{
register int i,j; /* not referenced in this text itself; presumably scratch indices for the initialization code */
@<Initialize everything@>;
}
@#
int MMIX_silent() /* run without reports until halted; return the low tetrabyte of |g[255]| */
{
octa breakpoint; /* NOTE(review): never assigned in this routine; if the machine-cycle code reads it, its value is indeterminate---confirm */
@<Local variables@>;
while (true) { /* no cycle limit: the only exit visible here is the |return| below */
@<Perform one machine cycle@>;
if (halted) return specval(&g[255]).o.l; /* |breakpoint_hit| is deliberately not tested here */
}
}
@#
void MMIX_run(cycs,breakpoint) /* simulate until |cycs| cycles have passed, or a breakpoint or halt occurs */
int cycs; /* the number of cycles to perform; the loop stops when it has counted down to zero */
octa breakpoint; /* not referenced in the text of this routine; presumably examined by the machine-cycle code */
{
@<Local variables@>;
while (cycs) {
if (verbose&(issue_bit|pipe_bit|coroutine_bit|schedule_bit)) /* any per-cycle diagnostics wanted? */
printf("*** Cycle %d\n", ticks.l); /* then head them with the current clock */
@<Perform one machine cycle@>;
if (verbose&pipe_bit) {
print_pipe();@+ print_locks();
}
if (breakpoint_hit||halted) { /* both reasons are reported if both apply */
if (breakpoint_hit)
printf("Breakpoint instruction fetched at time %d\n",ticks.l-1); /* one less than |ticks.l|; presumably the clock has already advanced---confirm */
if (halted) printf("Halted at time %d\n", ticks.l-1);
break; /* stop early, leaving |cycs| nonzero */
}
cycs--;
}
cease:; /* no |goto| to this label appears in the text shown here; presumably it is a target inside the machine-cycle code */
}
@ @<Type...@>=
typedef enum {@!false, @!true, @!wow}@+bool; /* slightly extended booleans */
@ @<Local var...@>=
register int i,j,m;
bool breakpoint_hit=false;
bool halted=false;
@ Error messages that abort this program are called panic messages.
The macro called |confusion| will never be needed unless this program is
internally inconsistent.
@d errprint0(f) fprintf(stderr,f) /* message without arguments; |f| itself serves as the format string */
@d errprint1(f,a) fprintf(stderr,f,a) /* message with one argument */
@d errprint2(f,a,b) fprintf(stderr,f,a,b) /* message with two arguments */
@d panic(x)@+ {@+errprint0("Panic: ");@+x;@+errprint0("!\n");@+expire();@+} /* perform the statements |x| between a prefix and a suffix, then call |expire|; the expansion is a braced compound statement */
@d confusion(m) errprint1("This can't happen: %s",m) /* this only prints; it leads to termination when used as the argument of |panic| */
@.This can't happen@>
@<Internal proto...@>=
static void expire @,@,@[ARGS((void))@];
@ @<Sub...@>=
static void expire() /* report the clock on |stderr|, then terminate with status $-2$ */
{
  if (ticks.h==0) { /* the high half is zero, so the low half says it all */
    errprint1("(Clock time is %d.)\n",ticks.l);
  } else { /* otherwise show both halves */
    errprint2("(Clock time is %dH+%d.)\n",ticks.h,ticks.l);
  }
@.Clock time is...@>
  exit(-2);
}
@ The data structures of this program are not precisely equivalent to
logical gates that could be implemented directly in silicon;
we will use data structures and
algorithms appropriate to the \CEE/ programming language. For example,
we'll use pointers and arrays, instead of buses and ports and latches. However,
the net effect of our data structures and algorithms is intended to
be equivalent to the net effect of a silicon implementation. The methods
used below are essentially equivalent to those used in real machines today,
except that diagnostic facilities are added so that we can readily
watch what is happening.
Each functional unit in the \MMIX\ pipeline is programmed here as a coroutine
in~\CEE/. At every clock cycle, we will call on each active coroutine to do one
phase of its operation; in terms of the repair-station analogy
described in the main program,
this corresponds to getting each group of
auto mechanics to do one unit of operation on a car.
The coroutines are performed sequentially, although
a real pipeline would have them act in parallel.
We will not ``cheat'' by letting one coroutine access a value early in its
cycle that another one computes late in its cycle, unless computer hardware
could ``cheat'' in an equivalent way.
@* Low-level routines. Where should we begin? It is tempting to start with a
global view of the simulator and then to break it down into component parts.
But that task is too daunting, because there are so many unknowns about what
basic ingredients ought to be combined when we construct the larger
components. So let us look first at the primitive operations on which
the superstructure will be built. Once we have created some infrastructure,
we'll be able to proceed with confidence to the larger tasks ahead.
@ This program for the 64-bit \MMIX\ architecture is based on 32-bit integer
arithmetic, because nearly every computer available to the author at the time
of writing (1998--1999) was limited in that way.
Details of the basic arithmetic appear in a separate program module
called {\mc MMIX-ARITH}, because the same routines are needed also
for the assembler and for the non-pipelined simulator. The
definition of type \&{tetra} should be changed, if necessary, to conform with
the definitions found there.
@^system dependencies@>
@<Type...@>=
typedef unsigned int tetra;
/* for systems conforming to the LP-64 data model */
typedef struct { tetra h,l;} octa; /* two tetrabytes make one octabyte */
@ @<Internal proto...@>=
static void print_octa @,@,@[ARGS((octa))@];
@ @<Sub...@>=
static void print_octa(o) /* print an octabyte in hexadecimal on standard output */
  octa o;
{
  if (o.h==0) printf("%x",o.l); /* suppress the high half when it is zero */
  else printf("%x%08x",o.h,o.l); /* otherwise pad the low half to eight digits */
}
@ @<Glob...@>=
extern octa zero_octa; /* |zero_octa.h=zero_octa.l=0| */
extern octa neg_one; /* |neg_one.h=neg_one.l=-1| */
extern octa aux; /* auxiliary output of a subroutine */
extern bool overflow; /* set by certain subroutines for signed arithmetic */
extern int exceptions; /* bits set by floating point operations */
extern int cur_round; /* the current rounding mode */
@ Most of the subroutines in {\mc MMIX-ARITH} return an octabyte as
a function of two octabytes; for example, |oplus(y,z)| returns the
sum of octabytes |y| and~|z|. Multiplication returns the high
half of a product in the global variable~|aux|; division returns
the remainder in~|aux|.
@<Sub...@>=
extern octa oplus @,@,@[ARGS((octa y,octa z))@];
/* unsigned $y+z$ */
extern octa ominus @,@,@[ARGS((octa y,octa z))@];
/* unsigned $y-z$ */
extern octa incr @,@,@[ARGS((octa y,int delta))@];
/* unsigned $y+\delta$ ($\delta$ is signed) */
extern octa oand @,@,@[ARGS((octa y,octa z))@];
/* $y\land z$ */
extern octa oandn @,@,@[ARGS((octa y,octa z))@];
/* $y\land \bar z$ */
extern octa shift_left @,@,@[ARGS((octa y,int s))@];
/* $y\LL s$, $0\le s\le64$ */
extern octa shift_right @,@,@[ARGS((octa y,int s,int u))@];
/* $y\GG s$, signed if |!u| */
extern octa omult @,@,@[ARGS((octa y,octa z))@];
/* unsigned $(|aux|,x)=y\times z$ */
extern octa signed_omult @,@,@[ARGS((octa y,octa z))@];
/* signed $x=y\times z$, setting |overflow| */
extern octa odiv @,@,@[ARGS((octa x,octa y,octa z))@];
/* unsigned $(x,y)/z$; $|aux|=(x,y)\bmod z$ */
extern octa signed_odiv @,@,@[ARGS((octa y,octa z))@];
/* signed $y/z$, when $z\ne0$; $|aux|=y\bmod z$ */
extern int count_bits @,@,@[ARGS((tetra z))@];
/* $x=\nu(z)$ */
extern tetra byte_diff @,@,@[ARGS((tetra y,tetra z))@];
/* half of \.{BDIF} */
extern tetra wyde_diff @,@,@[ARGS((tetra y,tetra z))@];
/* half of \.{WDIF} */
extern octa bool_mult @,@,@[ARGS((octa y,octa z,bool xor))@];
/* \.{MOR} or \.{MXOR} */
extern octa load_sf @,@,@[ARGS((tetra z))@];
/* load short float */
extern tetra store_sf @,@,@[ARGS((octa x))@];
/* store short float */
extern octa fplus @,@,@[ARGS((octa y,octa z))@];
/* floating point $x=y\oplus z$ */
extern octa fmult @,@,@[ARGS((octa y ,octa z))@];
/* floating point $x=y\otimes z$ */
extern octa fdivide @,@,@[ARGS((octa y,octa z))@];
/* floating point $x=y\oslash z$ */
extern octa froot @,@,@[ARGS((octa,int))@];
/* floating point $x=\sqrt z$ */
extern octa fremstep @,@,@[ARGS((octa y,octa z,int delta))@];
/* floating point $x\,{\rm rem}\,z=y\,{\rm rem}\,z$ */
extern octa fintegerize @,@,@[ARGS((octa z,int mode))@];
/* floating point $x={\rm round}(z)$ */
extern int fcomp @,@,@[ARGS((octa y,octa z))@];
/* $-1$, 0, 1, or 2 if $y<z$, $y=z$, $y>z$, $y\parallel z$ */
extern int fepscomp @,@,@[ARGS((octa y,octa z,octa eps,int sim))@];
/* $x=|sim|?\ [y\sim z\ (\epsilon)]:\ [y\approx z\ (\epsilon)]$ */
extern octa floatit @,@,@[ARGS((octa z,int mode,int unsgnd,int shrt))@];
/* fix to float */
extern octa fixit @,@,@[ARGS((octa z,int mode))@];
/* float to fix */
@ We had better check that our 32-bit assumption holds.
@<Initialize e...@>=
if (shift_left(neg_one,1).h!=0xffffffff)
panic(errprint0("Incorrect implementation of type tetra"));
@.Incorrect implementation...@>
@* Coroutines. As stated earlier, this program can be regarded as a system of
interacting coroutines. Coroutines---sometimes called threads---are more or
less independent processes that share and pass data and control back and
forth. They correspond to the individual workers in an organization.
We don't need the full power of recursive coroutines, in which new threads are
spawned dynamically and have independent stacks for computation; we are, after
all, simulating a fixed piece of hardware. The total number of coroutines we
deal with is established once and for all by the |MMIX_config| routine, and
each coroutine has a fixed amount of local data.
The simulation operates one clock tick at a time, by executing all
coroutines scheduled for time~$t$ before advancing to time~$t+1$. The
coroutines at time~$t$ may decide to become dormant or they may reschedule
themselves and/or other coroutines for future times.
Each coroutine has a symbolic |name| for diagnostic purposes (e.g.,
\.{ALU1}); a nonnegative |stage| number (e.g., 2~for the second stage
of a pipeline); a pointer to the next coroutine scheduled at the same time (or
|NULL| if the coroutine is unscheduled); a pointer to a lock variable
(or |NULL| if no lock is currently relevant);
and a reference to a control block containing the data to be processed.
@s control_struct int
@<Type...@>=
typedef struct coroutine_struct {
char *name; /* symbolic identification of a coroutine, for diagnostics */
int stage; /* its rank; higher stages have priority when scheduled for the same time */
struct coroutine_struct *next; /* its successor in a queue; |NULL| if unscheduled, the coroutine itself if sleeping */
struct coroutine_struct **lockloc; /* what it might be locking; |NULL| if no lock is currently relevant */
struct control_struct *ctl; /* its data */
} coroutine;
@ @<Internal proto...@>=
static void print_coroutine_id @,@,@[ARGS((coroutine*))@];
static void errprint_coroutine_id @,@,@[ARGS((coroutine*))@];
@ @<Sub...@>=
static void print_coroutine_id(c) /* print name and stage on standard output */
  coroutine *c;
{
  if (c==NULL) { /* nobody to identify */
    printf("??");
    return;
  }
  printf("%s:%d",c->name,c->stage); /* name, colon, stage number */
}
@#
static void errprint_coroutine_id(c) /* like |print_coroutine_id|, but via the error-message macros */
  coroutine *c;
{
  if (c==NULL) { /* nobody to identify */
    errprint0("??");
@.??@>
    return;
  }
  errprint2("%s:%d",c->name,c->stage); /* name, colon, stage number */
}
@ Coroutine control is masterminded by a ring of queues, one each for
times $t$, $t+1$, \dots, $t+|ring_size|-1$, when $t$ is the current
clock time.
All scheduling is first-come-first-served, except that coroutines with higher
|stage| numbers have priority. We want to process the later stages of a
pipeline first, in this sequential implementation, for the same reason that a
car must drive from M~station into W~station before another car can enter
M~station.
Each queue is a circular list of \&{coroutine} nodes, linked together by their
|next| fields. A list head~$h$ with |stage=max_stage| comes at the end and the
beginning of the queue. (All |stage| numbers of legitimate coroutines
are less than~|max_stage|.) The queued items are |h->next|, |h->next->next|,
etc., from back to front, and we have |c->stage<=c->next->stage| unless |c=h|.
Initially all queues are empty.
@<Initialize e...@>=
{@+register int k; /* index of a queue in the ring */
  for (k=0;k<ring_size;k++)
    ring[k].next=&ring[k]; /* a list head that points to itself is an empty queue */
}
@ To schedule a coroutine |c| with positive delay |d<ring_size|, we call
|schedule(c,d,s)|. (The |s| parameter is used only if scheduling is
being logged; it does not affect the computation, but we will
generally set |s| to the state at which the scheduled coroutine will begin.)
@<Internal proto...@>=
static void schedule @,@,@[ARGS((coroutine*,int,int))@];
@ @<Sub...@>=
static void schedule(c,d,s) /* queue |c| to run |d| cycles from now */
  coroutine *c; /* the coroutine to be queued */
  int d,s; /* the delay, $0<d<|ring_size|$; and a state number that is used only for logging */
{
  register int tt; /* index of the ring queue for time |cur_time+d| */
  register coroutine *p; /* runs through that queue, starting at its list head */
  if (d<=0 || d>=ring_size) /* do a sanity check, before |d| or |ring_size| is used in any computation */
    panic(confusion("Scheduling ");errprint_coroutine_id(c);
          errprint1(" with delay %d",d));
  tt=(cur_time+d)%ring_size; /* safe now: |ring_size>d>0|, so there is no division by zero */
  p=&ring[tt]; /* start at the list head */
  while (p->next->stage<c->stage) p=p->next; /* pass over queued items of lower stage */
  c->next = p->next; /* insert |c| just after |p| */
  p->next = c;
  if (verbose&schedule_bit) {
    printf(" scheduling ");@+print_coroutine_id(c);
    printf(" at time %d, state %d\n",ticks.l+d,s);
  }
}
@ @<External var...@>=
Extern int ring_size; /* set by |MMIX_config|, must be sufficiently large */
Extern coroutine *ring;
Extern int cur_time;
@ The all-important |ctl| field of a coroutine, which contains the
data being manipulated, will be explained below. One of its key
components is the |state| field, which helps to specify the next
actions the coroutine will perform. When we schedule a coroutine for
a new task, we often want it to begin in state~0.
@<Internal proto...@>=
static void startup @,@,@[ARGS((coroutine*,int))@];
@ @<Sub...@>=
static void startup(c,d)
coroutine *c;
int d;
{
c->ctl->state=0;
schedule(c,d,0);
}
@ The following routine removes a coroutine from whatever queue it's in.
The case |c->next=c| is also permitted; such a self-loop can occur when a
coroutine goes to sleep and expects to be awakened (that is, scheduled)
by another coroutine. Sleeping coroutines have important data in their
|ctl| field; they are therefore quite different from unscheduled
or ``unemployed'' coroutines, which have |c->next=NULL|. An unemployed
coroutine is not assumed to have any valid data in its |ctl| field.
@<Internal proto...@>=
static void unschedule @,@,@[ARGS((coroutine*))@];
@ @<Sub...@>=
static void unschedule(c) /* detach |c| from its queue, if it is in one */
  coroutine *c;
{
  register coroutine *pred; /* will become the predecessor of |c| */
  if (c->next==NULL) return; /* already unemployed: nothing to do, nothing to log */
  pred=c;
  while (pred->next!=c) pred=pred->next; /* go once around the circular list */
  pred->next=c->next; /* bypass |c|; this is harmless when |c| is a self-loop */
  c->next=NULL; /* mark |c| as unscheduled */
  if (verbose&schedule_bit) {
    printf(" unscheduling ");@+print_coroutine_id(c);@+printf("\n");
  }
}
@ When it is time to process all coroutines that have queued up for a
particular time~|t|, we empty the queue called |ring[t]| and link its items in
the opposite order (from front to back). The following subroutine uses the
well known algorithm discussed in exercise 2.2.3--7 of {\sl The Art
of Computer Programming}.
@<Internal proto...@>=
static coroutine *queuelist @,@,@[ARGS((int))@];
@ @<Sub...@>=
static coroutine* queuelist(t)
int t;
{@+register coroutine *p, *q=&sentinel, *r;
for (p=ring[t].next;p!=&ring[t];p=r) {
r=p->next;
p->next=q;
q=p;
}
ring[t].next=&ring[t];
sentinel.next=q;
return q;
}
@ @<Glob...@>=
coroutine sentinel; /* dummy coroutine at origin of circular list */
@ Coroutines often start working on tasks that are {\it speculative}, in the
sense that we want certain results to be ready if they prove to be
useful; we understand that speculative computations might not actually
be needed. Therefore a coroutine might need to be aborted before it
has finished its work.
All coroutines must be written in such a way that important data structures
remain intact even when the coroutine is abruptly terminated. In particular,
we need to be sure that ``locks'' on shared resources are restored to
an unlocked state when a coroutine holding the lock is aborted.
A \&{lockvar} variable is |NULL| when it is unlocked; otherwise it
points to the coroutine responsible for unlocking~it.
@d set_lock(c,l) {@+l=c;@+(c)->lockloc=&(l);@+} /* make |l| point to |c|, and let |c| remember where |l| is */
@d release_lock(c,l) {@+l=NULL;@+ (c)->lockloc=NULL;@+} /* clear both |l| and the |lockloc| field of |c| */
@<Type...@>=
typedef coroutine *lockvar;
@ @<External proto...@>=
Extern void print_locks @,@,@[ARGS((void))@];
@ @<External r...@>=
void print_locks() /* report the cache locks, then each global lock that is currently held */
{
  print_cache_locks(ITcache); /* the five caches come first */
  print_cache_locks(DTcache);
  print_cache_locks(Icache);
  print_cache_locks(Dcache);
  print_cache_locks(Scache);
  if (mem_lock!=NULL) { /* a lock is reported only when it has a holder */
    printf("mem locked by %s:%d\n",mem_lock->name,mem_lock->stage);
  }
  if (dispatch_lock!=NULL) {
    printf("dispatch locked by %s:%d\n",
           dispatch_lock->name,dispatch_lock->stage);
  }
  if (wbuf_lock!=NULL) {
    printf("head of write buffer locked by %s:%d\n",
           wbuf_lock->name,wbuf_lock->stage);
  }
  if (clean_lock!=NULL) {
    printf("cleaner locked by %s:%d\n",
           clean_lock->name,clean_lock->stage);
  }
  if (speed_lock!=NULL) {
    printf("write buffer flush locked by %s:%d\n",
           speed_lock->name,speed_lock->stage);
  }
}
@ Many of the quantities we deal with are speculative values
that might not yet have been certified as part of the ``real''
calculation; in fact, they might not yet have been calculated.
A \&{spec} consists of a 64-bit quantity |o| and a pointer~|p| to
a \&{specnode}. The value~|o| is meaningful only if the
pointer~|p| is~|NULL|; otherwise |p| points to a source of further information.
A \&{specnode} is a 64-bit quantity |o| together with links to other
\&{specnode}s
that are above it or below it in a doubly linked list. An additional
|known| bit tells whether the |o|~field has been calculated. There also is
a 64-bit |addr| field, to identify the list and give further information.
A \&{specnode} list keeps track of speculative values related to a specific
register or to all of main memory; we will discuss such lists in detail~later.
@s specnode_struct int
@<Type...@>=
typedef struct {
octa o; /* the value; meaningful only if |p==NULL| */
struct specnode_struct *p; /* |NULL|, or a pointer to a source of further information */
} spec;
@#
typedef struct specnode_struct {
octa o; /* the speculative value */
bool known; /* has |o| been calculated? */
octa addr; /* identifies the list and gives further information */
struct specnode_struct *up,*down; /* the nodes above and below in a doubly linked list */
} specnode;
@ @<Glob...@>=
spec zero_spec; /* |zero_spec.o.h=zero_spec.o.l=0| and |zero_spec.p=NULL| */
@ @<Internal proto...@>=
static void print_spec @,@,@[ARGS((spec))@];
@ @<Sub...@>=
static void print_spec(s) /* show a value, or the source on which it waits */
  spec s;
{
  if (s.p!=NULL) { /* the value is still pending, so name its source instead */
    printf(">");@+ print_specnode_id(s.p->addr);
  } else print_octa(s.o); /* the value |o| is meaningful */
}
@#
static void print_specnode(s) /* show value, status mark, and identification of a node */
  specnode s;
{
  if (s.known || s.o.h!=0 || s.o.l!=0)
    print_octa(s.o); /* only an unknown value of zero is left out */
  if (s.known) printf("!"); /* the value has been calculated */
  else printf("?"); /* the value is not yet known */
  print_specnode_id(s.addr);
}
@ The analog of an automobile in our simulator is a block of data called
\&{control}, which represents all the relevant facts about an \MMIX\
instruction. We can think of it as the work order attached to a car's
windshield. Each group of employees updates the work order as the car moves
through the shop.
A \&{control} record contains the original location of an instruction,
and its four bytes OP~X~Y~Z. An instruction has up to four inputs, which are
\&{spec} records called |y|, |z|, |b| and~|ra|; it also has up to three
outputs, which are \&{specnode} records called |x|, |a|, and~|rl|.
(We usually don't mention the special input~|ra| or the special output~|rl|,
which refer to \.{MMIX}'s internal registers rA and~rL.) For example, the
main inputs to a \.{DIVU} command are \$Y, \$Z, and~rD; the outputs are the
quotient~\$X and the remainder~rR. The inputs to a
\.{STO} command are \$Y, \$Z, and~\$X; there is one ``output,'' and
the field~|x.addr| will be set to the physical address of the memory location
corresponding to virtual address $\rm \$Y+\$Z$.
Each \&{control} block also points to the coroutine that owns it, if any.
And it has various other fields that contain other tidbits of information;
for example, we have already mentioned
the |state|~field, which often governs a coroutine's actions. The |i|~field,
which contains an internal operation code number, is generally used together
with |state| to switch between alternative computational steps. If, for
example, the |op|~field is \.{SUB} or \.{SUBI} or \.{NEG} or \.{NEGI},
the internal opcode~|i| will be simply~|sub|.
We shall define all the fields of \&{control} records
now and discuss them later.
An actual hardware implementation of \MMIX\ wouldn't need all the information
we are putting into a \&{control} block. Some of that information would
typically be latched between stages of a pipeline; other portions would
probably appear in so-called ``rename registers.''
@^rename registers@>
We simulate rename registers only indirectly,
by counting how many registers of that
kind would be in use if we were mimicking low-level hardware details more
precisely. The |go| field is a \&{specnode} for convenience in programming,
although we use only its |known| and |o| subfields. It generally contains
the address of the subsequent instruction.
@s mmix_opcode int
@s internal_opcode int
@<Type...@>=
@<Declare \&{mmix\_opcode} and \&{internal\_opcode}@>@;
typedef struct control_struct {
octa loc; /* virtual address where an instruction originated */
mmix_opcode op;@+ unsigned char xx,yy,zz; /* the original instruction bytes */
spec y,z,b,ra; /* inputs */
specnode x,a,go,rl; /* outputs */
coroutine *owner; /* a coroutine whose |ctl| this is */
internal_opcode i; /* internal opcode */
int state; /* internal mindset */
bool usage; /* should rU be increased? */
bool need_b; /* should we stall until |b.p==NULL|? */
bool need_ra; /* should we stall until |ra.p==NULL|? */
bool ren_x; /* does |x| correspond to a rename register? */
bool mem_x; /* does |x| correspond to a memory write? */
bool ren_a; /* does |a| correspond to a rename register? */
bool set_l; /* does |rl| correspond to a new value of rL? */
bool interim; /* does this instruction need to be reissued on interrupt? */
bool stack_alert; /* is there potential for stack overflow? */
unsigned int arith_exc; /* arithmetic exceptions for event bits of rA */
unsigned int hist; /* history bits for use in branch prediction */
int denin,denout; /* execution time penalties for subnormal handling */
octa cur_O,cur_S; /* speculative rO and rS before this instruction */
unsigned int interrupt; /* does this instruction generate an interrupt? */
void *ptr_a, *ptr_b, *ptr_c; /* generic pointers for miscellaneous use */
} control;
@ @<Internal proto...@>=
static void print_control_block @,@,@[ARGS((control*))@]; /* displays the active fields of a \&{control} block */
@ @<Sub...@>=
static void print_control_block(c)
control *c;
{
octa default_go;
if (c->loc.h || c->loc.l || c->op || c->xx || c->yy || c->zz || c->owner) {
print_octa(c->loc);
printf(": %02x%02x%02x%02x(%s)",c->op,c->xx,c->yy,c->zz,
internal_op_name[c->i]);
}
if (c->usage) printf("*");
if (c->interim) printf("+");
if (c->y.o.h || c->y.o.l || c->y.p) {@+printf(" y=");@+print_spec(c->y);@+}
if (c->z.o.h || c->z.o.l || c->z.p) {@+printf(" z=");@+print_spec(c->z);@+}
if (c->b.o.h || c->b.o.l || c->b.p || c->need_b) {
printf(" b=");@+print_spec(c->b);
if (c->need_b) printf("*");
}
if (c->need_ra) {@+printf(" rA=");@+print_spec(c->ra);@+}
if (c->ren_x || c->mem_x) {@+printf(" x=");@+print_specnode(c->x);@+}
else if (c->x.o.h || c->x.o.l) {
printf(" x=");@+print_octa(c->x.o);@+printf("%c",c->x.known? '!': '?');
}
if (c->ren_a) {@+printf(" a=");@+print_specnode(c->a);@+}
if (c->set_l) {@+printf(" rL=");@+print_specnode(c->rl);@+}
if (c->interrupt) {@+printf(" int=");@+print_bits(c->interrupt);@+}
if (c->arith_exc) {@+printf(" exc=");@+print_bits(c->arith_exc<<8);@+}
default_go=incr(c->loc,4);
if (c->go.o.l!=default_go.l || c->go.o.h!=default_go.h) {
printf(" ->");@+print_octa(c->go.o);
}
if (verbose&show_pred_bit) printf(" hist=%x",c->hist);
if (c->i==pop) {
printf(" rS="); print_octa(c->cur_S);
printf(" rO="); print_octa(c->cur_O);
}
printf(" state=%d",c->state);
}
@* Lists. Here is a (boring) list of all the \MMIX\ opcodes, in order.
@<Declare \&{mmix\_opcode} and \&{internal\_opcode}@>=
typedef enum{@/
@!TRAP,@!FCMP,@!FUN,@!FEQL,@!FADD,@!FIX,@!FSUB,@!FIXU,@/
@!FLOT,@!FLOTI,@!FLOTU,@!FLOTUI,@!SFLOT,@!SFLOTI,@!SFLOTU,@!SFLOTUI,@/
@!FMUL,@!FCMPE,@!FUNE,@!FEQLE,@!FDIV,@!FSQRT,@!FREM,@!FINT,@/
@!MUL,@!MULI,@!MULU,@!MULUI,@!DIV,@!DIVI,@!DIVU,@!DIVUI,@/
@!ADD,@!ADDI,@!ADDU,@!ADDUI,@!SUB,@!SUBI,@!SUBU,@!SUBUI,@/
@!IIADDU,@!IIADDUI,@!IVADDU,@!IVADDUI,@!VIIIADDU,@!VIIIADDUI,@!XVIADDU,@!XVIADDUI,@/
@!CMP,@!CMPI,@!CMPU,@!CMPUI,@!NEG,@!NEGI,@!NEGU,@!NEGUI,@/
@!SL,@!SLI,@!SLU,@!SLUI,@!SR,@!SRI,@!SRU,@!SRUI,@/
@!BN,@!BNB,@!BZ,@!BZB,@!BP,@!BPB,@!BOD,@!BODB,@/
@!BNN,@!BNNB,@!BNZ,@!BNZB,@!BNP,@!BNPB,@!BEV,@!BEVB,@/
@!PBN,@!PBNB,@!PBZ,@!PBZB,@!PBP,@!PBPB,@!PBOD,@!PBODB,@/
@!PBNN,@!PBNNB,@!PBNZ,@!PBNZB,@!PBNP,@!PBNPB,@!PBEV,@!PBEVB,@/
@!CSN,@!CSNI,@!CSZ,@!CSZI,@!CSP,@!CSPI,@!CSOD,@!CSODI,@/
@!CSNN,@!CSNNI,@!CSNZ,@!CSNZI,@!CSNP,@!CSNPI,@!CSEV,@!CSEVI,@/
@!ZSN,@!ZSNI,@!ZSZ,@!ZSZI,@!ZSP,@!ZSPI,@!ZSOD,@!ZSODI,@/
@!ZSNN,@!ZSNNI,@!ZSNZ,@!ZSNZI,@!ZSNP,@!ZSNPI,@!ZSEV,@!ZSEVI,@/
@!LDB,@!LDBI,@!LDBU,@!LDBUI,@!LDW,@!LDWI,@!LDWU,@!LDWUI,@/
@!LDT,@!LDTI,@!LDTU,@!LDTUI,@!LDO,@!LDOI,@!LDOU,@!LDOUI,@/
@!LDSF,@!LDSFI,@!LDHT,@!LDHTI,@!CSWAP,@!CSWAPI,@!LDUNC,@!LDUNCI,@/
@!LDVTS,@!LDVTSI,@!PRELD,@!PRELDI,@!PREGO,@!PREGOI,@!GO,@!GOI,@/
@!STB,@!STBI,@!STBU,@!STBUI,@!STW,@!STWI,@!STWU,@!STWUI,@/
@!STT,@!STTI,@!STTU,@!STTUI,@!STO,@!STOI,@!STOU,@!STOUI,@/
@!STSF,@!STSFI,@!STHT,@!STHTI,@!STCO,@!STCOI,@!STUNC,@!STUNCI,@/
@!SYNCD,@!SYNCDI,@!PREST,@!PRESTI,@!SYNCID,@!SYNCIDI,@!PUSHGO,@!PUSHGOI,@/
@!OR,@!ORI,@!ORN,@!ORNI,@!NOR,@!NORI,@!XOR,@!XORI,@/
@!AND,@!ANDI,@!ANDN,@!ANDNI,@!NAND,@!NANDI,@!NXOR,@!NXORI,@/
@!BDIF,@!BDIFI,@!WDIF,@!WDIFI,@!TDIF,@!TDIFI,@!ODIF,@!ODIFI,@/
@!MUX,@!MUXI,@!SADD,@!SADDI,@!MOR,@!MORI,@!MXOR,@!MXORI,@/
@!SETH,@!SETMH,@!SETML,@!SETL,@!INCH,@!INCMH,@!INCML,@!INCL,@/
@!ORH,@!ORMH,@!ORML,@!ORL,@!ANDNH,@!ANDNMH,@!ANDNML,@!ANDNL,@/
@!JMP,@!JMPB,@!PUSHJ,@!PUSHJB,@!GETA,@!GETAB,@!PUT,@!PUTI,@/
@!POP,@!RESUME,@!SAVE,@!UNSAVE,@!SYNC,@!SWYM,@!GET,@!TRIP}@+@!mmix_opcode; /* 256 opcodes, numbered 0--255 in the order listed; the enumerators |IIADDU|, \dots, |XVIADDUI| stand for the mnemonics \.{2ADDU}, \dots, \.{16ADDUI}, which cannot be written as \CEE/ identifiers */
@ @<Glob...@>=
char *opcode_name[]={@|
"TRAP","FCMP","FUN","FEQL","FADD","FIX","FSUB","FIXU",@|
"FLOT","FLOTI","FLOTU","FLOTUI","SFLOT","SFLOTI","SFLOTU","SFLOTUI",@|
"FMUL","FCMPE","FUNE","FEQLE","FDIV","FSQRT","FREM","FINT",@|
"MUL","MULI","MULU","MULUI","DIV","DIVI","DIVU","DIVUI",@|
"ADD","ADDI","ADDU","ADDUI","SUB","SUBI","SUBU","SUBUI",@|
"2ADDU","2ADDUI","4ADDU","4ADDUI","8ADDU","8ADDUI","16ADDU","16ADDUI",@|
"CMP","CMPI","CMPU","CMPUI","NEG","NEGI","NEGU","NEGUI",@|
"SL","SLI","SLU","SLUI","SR","SRI","SRU","SRUI",@|
"BN","BNB","BZ","BZB","BP","BPB","BOD","BODB",@|
"BNN","BNNB","BNZ","BNZB","BNP","BNPB","BEV","BEVB",@|
"PBN","PBNB","PBZ","PBZB","PBP","PBPB","PBOD","PBODB",@|
"PBNN","PBNNB","PBNZ","PBNZB","PBNP","PBNPB","PBEV","PBEVB",@|
"CSN","CSNI","CSZ","CSZI","CSP","CSPI","CSOD","CSODI",@|
"CSNN","CSNNI","CSNZ","CSNZI","CSNP","CSNPI","CSEV","CSEVI",@|
"ZSN","ZSNI","ZSZ","ZSZI","ZSP","ZSPI","ZSOD","ZSODI",@|
"ZSNN","ZSNNI","ZSNZ","ZSNZI","ZSNP","ZSNPI","ZSEV","ZSEVI",@|
"LDB","LDBI","LDBU","LDBUI","LDW","LDWI","LDWU","LDWUI",@|
"LDT","LDTI","LDTU","LDTUI","LDO","LDOI","LDOU","LDOUI",@|
"LDSF","LDSFI","LDHT","LDHTI","CSWAP","CSWAPI","LDUNC","LDUNCI",@|
"LDVTS","LDVTSI","PRELD","PRELDI","PREGO","PREGOI","GO","GOI",@|
"STB","STBI","STBU","STBUI","STW","STWI","STWU","STWUI",@|
"STT","STTI","STTU","STTUI","STO","STOI","STOU","STOUI",@|
"STSF","STSFI","STHT","STHTI","STCO","STCOI","STUNC","STUNCI",@|
"SYNCD","SYNCDI","PREST","PRESTI","SYNCID","SYNCIDI","PUSHGO","PUSHGOI",@|
"OR","ORI","ORN","ORNI","NOR","NORI","XOR","XORI",@|
"AND","ANDI","ANDN","ANDNI","NAND","NANDI","NXOR","NXORI",@|
"BDIF","BDIFI","WDIF","WDIFI","TDIF","TDIFI","ODIF","ODIFI",@|
"MUX","MUXI","SADD","SADDI","MOR","MORI","MXOR","MXORI",@|
"SETH","SETMH","SETML","SETL","INCH","INCMH","INCML","INCL",@|
"ORH","ORMH","ORML","ORL","ANDNH","ANDNMH","ANDNML","ANDNL",@|
"JMP","JMPB","PUSHJ","PUSHJB","GETA","GETAB","PUT","PUTI",@|
"POP","RESUME","SAVE","UNSAVE","SYNC","SWYM","GET","TRIP"}; /* mnemonics listed in the same order as the |mmix_opcode| enumerators, so that |opcode_name[op]| names opcode |op| */
@ And here is a (likewise boring) list of all the internal opcodes.
The smallest numbers, less than or equal to |max_pipe_op|, correspond
to operations for which arbitrary pipeline delays can be configured
with |MMIX_config|. The largest numbers, greater than |max_real_command|,
correspond to internally
generated operations that have no official OP code; for example,
there are internal operations to shift the $\gamma$ pointer in the
register stack, and to compute page table entries.
@<Declare \&{mmix\_opcode} and \&{internal\_opcode}@>=
#define max_pipe_op feps
#define max_real_command trip
typedef enum{@/
@!mul0, /* multiplication by zero */
@!mul1, /* multiplication by 1--8 bits */
@!mul2, /* multiplication by 9--16 bits */
@!mul3, /* multiplication by 17--24 bits */
@!mul4, /* multiplication by 25--32 bits */
@!mul5, /* multiplication by 33--40 bits */
@!mul6, /* multiplication by 41--48 bits */
@!mul7, /* multiplication by 49--56 bits */
@!mul8, /* multiplication by 57--64 bits */
@!div, /* \.{DIV[U][I]} */
@!sh, /* \.{S[L,R][U][I]} */
@!mux, /* \.{MUX[I]} */
@!sadd, /* \.{SADD[I]} */
@!mor, /* \.{M[X]OR[I]} */
@!fadd, /* \.{FADD}, \.{FSUB} */
@!fmul, /* \.{FMUL} */
@!fdiv, /* \.{FDIV} */
@!fsqrt, /* \.{FSQRT} */
@!fint, /* \.{FINT} */
@!fix, /* \.{FIX[U]} */
@!flot, /* \.{[S]FLOT[U][I]} */
@!feps, /* \.{FCMPE}, \.{FUNE}, \.{FEQLE}; this one is |max_pipe_op|, the last opcode whose pipeline delays are configurable */
@!fcmp, /* \.{FCMP} */
@!funeq, /* \.{FUN}, \.{FEQL} */
@!fsub, /* \.{FSUB} */
@!frem, /* \.{FREM} */
@!mul, /* \.{MUL[I]} */
@!mulu, /* \.{MULU[I]} */
@!divu, /* \.{DIVU[I]} */
@!add, /* \.{ADD[I]} */
@!addu, /* \.{[2,4,8,16,]ADDU[I]}, \.{INC[M][H,L]} */
@!sub, /* \.{SUB[I]}, \.{NEG[I]} */
@!subu, /* \.{SUBU[I]}, \.{NEGU[I]} */
@!set, /* \.{SET[M][H,L]}, \.{GETA[B]} */
@!or, /* \.{OR[I]}, \.{OR[M][H,L]} */
@!orn, /* \.{ORN[I]} */
@!nor, /* \.{NOR[I]} */
@!and, /* \.{AND[I]} */
@!andn, /* \.{ANDN[I]}, \.{ANDN[M][H,L]} */
@!nand, /* \.{NAND[I]} */
@!xor, /* \.{XOR[I]} */
@!nxor, /* \.{NXOR[I]} */
@!shlu, /* \.{SLU[I]} */
@!shru, /* \.{SRU[I]} */
@!shl, /* \.{SL[I]} */
@!shr, /* \.{SR[I]} */
@!cmp, /* \.{CMP[I]} */
@!cmpu, /* \.{CMPU[I]} */
@!bdif, /* \.{BDIF[I]} */
@!wdif, /* \.{WDIF[I]} */
@!tdif, /* \.{TDIF[I]} */
@!odif, /* \.{ODIF[I]} */
@!zset, /* \.{ZS[N][N,Z,P][I]}, \.{ZSEV[I]}, \.{ZSOD[I]} */
@!cset, /* \.{CS[N][N,Z,P][I]}, \.{CSEV[I]}, \.{CSOD[I]} */
@!get, /* \.{GET} */
@!put, /* \.{PUT[I]} */
@!ld, /* \.{LD[B,W,T,O][U][I]}, \.{LDHT[I]}, \.{LDSF[I]} */
@!ldptp, /* load page table pointer */
@!ldpte, /* load page table entry */
@!ldunc, /* \.{LDUNC[I]} */
@!ldvts, /* \.{LDVTS[I]} */
@!preld, /* \.{PRELD[I]} */
@!prest, /* \.{PREST[I]} */
@!st, /* \.{STO[U][I]}, \.{STCO[I]}, \.{STUNC[I]} */
@!syncd, /* \.{SYNCD[I]} */
@!syncid, /* \.{SYNCID[I]} */
@!pst, /* \.{ST[B,W,T][U][I]}, \.{STHT[I]} */
@!stunc, /* \.{STUNC[I]}, in write buffer */
@!cswap, /* \.{CSWAP[I]} */
@!br, /* \.{B[N][N,Z,P][B]} */
@!pbr, /* \.{PB[N][N,Z,P][B]} */
@!pushj, /* \.{PUSHJ[B]} */
@!go, /* \.{GO[I]} */
@!prego, /* \.{PREGO[I]} */
@!pushgo, /* \.{PUSHGO[I]} */
@!pop, /* \.{POP} */
@!resume, /* \.{RESUME} */
@!save, /* \.{SAVE} */
@!unsave, /* \.{UNSAVE} */
@!sync, /* \.{SYNC} */
@!jmp, /* \.{JMP[B]} */
@!noop, /* \.{SWYM} */
@!trap, /* \.{TRAP} */
@!trip, /* \.{TRIP}; this one is |max_real_command|, and the opcodes after it are generated internally */
@!incgamma, /* increase $\gamma$ pointer */
@!decgamma, /* decrease $\gamma$ pointer */
@!incrl, /* increase rL and $\beta$ */
@!sav, /* intermediate stage of \.{SAVE} */
@!unsav, /* intermediate stage of \.{UNSAVE} */
@!resum /* intermediate stage of \.{RESUME} */
}@! internal_opcode;
@ @<Glob...@>=
char *internal_op_name[]={
"mul0",
"mul1",
"mul2",
"mul3",
"mul4",
"mul5",
"mul6",
"mul7",