Skip to content

Commit

Permalink
Separate program and data in byte code interpreter
Browse files Browse the repository at this point in the history
This gives a substantial performance improvement, taking the byte code interpreter from second slowest to fastest among the benchmarked implementations.
  • Loading branch information
noelwelsh committed Dec 10, 2023
1 parent b0795f1 commit 958529e
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 58 deletions.
19 changes: 11 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@ A series of implementations of stack machines to optimize evaluation of arithmet
Example results

```
[info] FibonnaciBenchmark.baseFibBenchmark thrpt 25 2748.000 ± 17.741 ops/s
[info] FibonnaciBenchmark.basicStackFibBenchmark thrpt 25 644.858 ± 32.125 ops/s
[info] FibonnaciBenchmark.byteCodeFibBenchmark thrpt 25 1675.357 ± 5.391 ops/s
[info] FibonnaciBenchmark.optimizedStack2FibBenchmark thrpt 25 3563.952 ± 71.645 ops/s
[info] FibonnaciBenchmark.optimizedStack3FibBenchmark thrpt 25 3540.126 ± 51.814 ops/s
[info] FibonnaciBenchmark.optimizedStackFibBenchmark thrpt 25 3630.538 ± 12.488 ops/s
[info] FibonnaciBenchmark.stackCachingFibBenchmark thrpt 25 3508.967 ± 234.853 ops/s
[info] FibonnaciBenchmark.superInstructionFibBenchmark thrpt 25 3839.907 ± 452.660 ops/s
[info] Benchmark Mode Cnt Score Error Units
[info] FibonnaciBenchmark.baseFibBenchmark thrpt 25 2754.433 ± 18.351 ops/s
[info] FibonnaciBenchmark.basicStackFibBenchmark thrpt 25 676.426 ± 18.061 ops/s
[info] FibonnaciBenchmark.byteCodeFibBenchmark thrpt 25 4057.114 ± 69.842 ops/s
[info] FibonnaciBenchmark.optimizedStack2FibBenchmark thrpt 25 3575.290 ± 45.085 ops/s
[info] FibonnaciBenchmark.optimizedStack3FibBenchmark thrpt 25 3567.815 ± 50.533 ops/s
[info] FibonnaciBenchmark.optimizedStackFibBenchmark thrpt 25 3631.189 ± 8.750 ops/s
[info] FibonnaciBenchmark.stackCachingFibBenchmark thrpt 25 3698.103 ± 108.849 ops/s
[info] FibonnaciBenchmark.superInstructionFibBenchmark thrpt 25 1239.706 ± 364.866 ops/s
```

Super instruction benchmarks are very volatile.
106 changes: 56 additions & 50 deletions core/src/main/scala/arithmetic/ByteCode.scala
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package arithmetic

import java.nio.ByteBuffer

object ByteCode {

enum Expression extends arithmetic.Expression[Expression] {
Expand All @@ -17,39 +15,56 @@ object ByteCode {
def /(that: Expression): Expression = Division(this, that)

def compile: Program = {
val p = Array.ofDim[Byte](16777216)
val buffer = ByteBuffer.wrap(p)
var limit = 0
// We store literals in the data array, and program bytecodes in the
// program array. As there is no control flow in a program, the order in
// which literals are written in the data array is the order in which the
// program reads them out.
//
// Separating program and data leads to substantial performance
// improvements. An earlier iteration, which used a ByteBuffer and stored
// program and data together, achieved less than half the throughput
// reported here.
val program = Array.ofDim[Byte](16777216)
val data = Array.ofDim[Double](16777216)

// Program pointer. First free element
var pp = 0
// Data pointer. First free element
var dp = 0
def loop(expr: Expression): Unit =
expr match {
case Literal(value) =>
buffer.put(Op.Lit.ordinal.toByte)
buffer.putDouble(value)
limit = limit + 8 + 1
program(pp) = Op.Lit
data(dp) = value
pp = pp + 1
dp = dp + 1
case Addition(left, right) =>
loop(left)
loop(right)
buffer.put(Op.Add.ordinal.toByte)
limit = limit + 1
program(pp) = Op.Add
pp = pp + 1
case Subtraction(left, right) =>
loop(left)
loop(right)
buffer.put(Op.Sub.ordinal.toByte)
limit = limit + 1
program(pp) = Op.Sub
pp = pp + 1
case Multiplication(left, right) =>
loop(left)
loop(right)
buffer.put(Op.Mul.ordinal.toByte)
limit = limit + 1
program(pp) = Op.Mul
pp = pp + 1
case Division(left, right) =>
loop(left)
loop(right)
buffer.put(Op.Div.ordinal.toByte)
limit = limit + 1
program(pp) = Op.Div
pp = pp + 1
}

loop(this)
Program(buffer, limit)
// Shrink to size
val p = IArray.unsafeFromArray(program.slice(0, pp))
val d = IArray.unsafeFromArray(data.slice(0, dp))
Program(p, d)
}

def eval: Double = compile.eval
Expand All @@ -58,66 +73,57 @@ object ByteCode {
def literal(value: Double): Expression = Literal(value)
}

enum Op {
case Lit
case Add
case Sub
case Mul
case Div
object Op {
val Lit: Byte = 0
val Add: Byte = 1
val Sub: Byte = 2
val Mul: Byte = 3
val Div: Byte = 4
}

final case class Program(program: ByteBuffer, limit: Int) {
val machine = new StackMachine(program, limit)
final case class Program(program: IArray[Byte], data: IArray[Double]) {
val machine = new StackMachine(program, data)

def eval: Double = machine.eval
}

final case class StackMachine(program: ByteBuffer, limit: Int) {
final case class StackMachine(program: IArray[Byte], data: IArray[Double]) {
// The data stack
private val stack: Array[Double] = Array.ofDim[Double](256)

object code {
val lit = Op.Lit.ordinal.toByte
val add = Op.Add.ordinal.toByte
val sub = Op.Sub.ordinal.toByte
val mul = Op.Mul.ordinal.toByte
val div = Op.Div.ordinal.toByte
}

final def eval: Double = {
val p = program.array()
// sp points to the first free element on the stack
// stack(sp - 1) is the top element of the stack
def loop(sp: Int, pc: Int): Double =
if (pc == limit) stack(sp - 1)
def loop(sp: Int, dp: Int, pc: Int): Double =
if (pc == program.size) stack(sp - 1)
else
p(pc) match {
case code.lit =>
stack(sp) = program.getDouble(pc + 1)
loop(sp + 1, pc + 1 + 8)
case code.add =>
program(pc) match {
case Op.Lit =>
stack(sp) = data(dp)
loop(sp + 1, dp + 1, pc + 1)
case Op.Add =>
val a = stack(sp - 1)
val b = stack(sp - 2)
stack(sp - 2) = (a + b)
loop(sp - 1, pc + 1)
case code.sub =>
loop(sp - 1, dp, pc + 1)
case Op.Sub =>
val a = stack(sp - 1)
val b = stack(sp - 2)
stack(sp - 2) = (a - b)
loop(sp - 1, pc + 1)
case code.mul =>
loop(sp - 1, dp, pc + 1)
case Op.Mul =>
val a = stack(sp - 1)
val b = stack(sp - 2)
stack(sp - 2) = (a * b)
loop(sp - 1, pc + 1)
case code.div =>
loop(sp - 1, dp, pc + 1)
case Op.Div =>
val a = stack(sp - 1)
val b = stack(sp - 2)
stack(sp - 2) = (a / b)
loop(sp - 1, pc + 1)
loop(sp - 1, dp, pc + 1)
}

loop(0, 0)
loop(0, 0, 0)
}
}
}

0 comments on commit 958529e

Please sign in to comment.