adamnew123456 · March 8, 2020 21:53 · Mar 8, 2020
diff --git a/rpnjit-1.md b/rpnjit-1.md
@@ -0,0 +1,780 @@
+# A Basic RPN Calculator
+
+We're going to start with a basic RPN interpreter and add function definitions
+to it. That will serve as a good starting point for adding on JIT compilation
+later. There are only going to be a few primitives to start with:
+
+- Integer constants
+- Arithmetic operators: `+`, `-`, `*`, `/`, `%`
+- Stack manipulation: `swap`, `dup`, `drop`
+- Display: `print`
+
+These are simple enough to implement if we don't mind ignoring a lot of error
+handling:
+
+```
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define LINE_SZ 4096
+
+typedef enum {
+    WORD_NONE,
+    WORD_INT,
+    WORD_COMMAND
+} wordtype_t;
+
+typedef struct {
+    int vals[LINE_SZ];
+    int idx;
+} stack_t;
+
+void eval_command(stack_t *stack, char *command) {
+    if (!strcmp(command, "+")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a + b;
+    } else if (!strcmp(command, "-")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a - b;
+    } else if (!strcmp(command, "*")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a * b;
+    } else if (!strcmp(command, "/")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a / b;
+    } else if (!strcmp(command, "%")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a % b;
+    } else if (!strcmp(command, "swap")) {
+        int b = stack->vals[stack->idx - 1];
+        int a = stack->vals[stack->idx - 2];
+        stack->vals[stack->idx - 1] = a;
+        stack->vals[stack->idx - 2] = b;
+    } else if (!strcmp(command, "dup")) {
+        int a = stack->vals[stack->idx - 1];
+        stack->vals[stack->idx++] = a;
+    } else if (!strcmp(command, "drop")) {
+        stack->idx--;
+    } else if (!strcmp(command, "print")) {
+        for (int i = 0; i < stack->idx; i++) {
+            printf("[%d] = %d\n", i, stack->vals[i]);
+        }
+    }
+}
+
+int main() {
+    char line[LINE_SZ];
+    char command[128];
+    int command_idx = 0;
+
+    int number = 0;
+    stack_t stack;
+    stack.idx = 0;
+
+    while (fgets(line, LINE_SZ, stdin) != NULL) {
+        bool line_done = false;
+        bool word_done = false;
+        wordtype_t word_type = WORD_NONE;
+
+        int i = 0;
+        while (!line_done) {
+            char ch = line[i];
+
+            if (!ch) {
+                line_done = true;
+                word_done = true;
+            } else if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
+                word_done = true;
+            } else if (ch >= '0' && ch <= '9') {
+                word_type = WORD_INT;
+                number = (number * 10) + (ch - '0');
+            } else {
+                word_type = WORD_COMMAND;
+                command[command_idx++] = ch;
+            }
+
+            if (word_done) {
+                switch (word_type) {
+                case WORD_INT:
+                    stack.vals[stack.idx++] = number;
+                    break;
+                case WORD_COMMAND:
+                    command[command_idx++] = 0;
+                    eval_command(&stack, command);
+                    break;
+                case WORD_NONE:
+                    break;
+                }
+
+                number = 0;
+                command_idx = 0;
+                word_type = WORD_NONE;
+                word_done = false;
+            }
+
+            i++;
+        }
+    }
+
+    return 0;
+}
+```
+
+After compiling this, we have a basic calculator that's ready to use. For
+example, we can compute the factorial of 5 by hand:
+
+```
+$ gcc rpn1.c -o rpn1
+$ ./rpn1
+1 2 3 4 5
+print
+[0] = 1
+[1] = 2
+[2] = 3
+[3] = 4
+[4] = 5
+* * * *
+print
+[0] = 120
+```
+
+# Allowing User Definitions
+
+There are already a few things here which will have to change in order to allow
+users to define their own commands:
+
+- Right now all commands are executed as they are entered. In order to define
+  commands, we'll have to support not only the *immediate* mode that we support
+  now, but also a *compile* mode where we buffer commands within a function
+  definition instead of running them.
+
+- Right now commands are only run within the fixed `eval_command` function,
+  which statically dispatches on command names into corresponding parts of code.
+  We'll have to have keep something like this since the primitives will still be
+  there, but there will also have to be a place to store function definitions.
+
+- Right now all commands are implemented implicitly using code defined within
+  the program itself. The work of implementing these commands on a lower level
+  is done by the C compiler; for example, when we implement `+` we're relying on
+  the compiler to generate the right code and run it within the branch that tests
+  that the command really is `+`.
+
+  User-defined commands can't be run this way because they won't be implemented in
+  C (even if the primitives they depend upon will be). We'll have to add an interpreter
+  to execute those user-defined commands, and how it works will depend upon how
+  user-defined functions are stored.
+
+To address each of these concerns, we're going to use a simple representation
+that lets us reuse a lot of the program as it exists now:
+
+- The *immediate* mode syntax will be nothing special, just the same commands
+  that we usually run without any extra adornment. We're going to activate
+  *compile* mode using a syntax similar to Forth, where `:` starts a definition
+  and it goes to the end of the line. For example, `: incr 1 +` defines a word
+  `incr` which can be used to increment the top value on the stack.
+
+- We can add a simple list that stores both command names and the code
+  associated with them (what that code actually looks like will be covered in
+  the next point). To find a definition, we just have to iterate this list and
+  check each command name. There are more efficient ways to do this but we're
+  shooting for brevity here.
+
+- We can reuse most of the existing command evaluator if we store user-defined
+  commands as text. We'll still have to rework the dispatching mechanism as
+  described in the last point, but the parsing will mostly stay the same.
+
+With all these changes applied, we get the second iteration of the RPN
+calculator. Again, this ignores lots of error handling as well as not covering
+cases which might be useful, like redefining commands within the same session.
+
+```  
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define LINE_SZ 4096
+#define COMMAND_SZ 128
+#define COMMAND_CNT 32
+
+typedef enum {
+    WORD_NONE,
+    WORD_INT,
+    WORD_COMMAND
+} wordtype_t;
+
+typedef struct {
+    int vals[LINE_SZ];
+    int idx;
+} stack_t;
+
+typedef struct {
+    bool used;
+    char name[COMMAND_SZ];
+    char defn[LINE_SZ];
+} userdef_t;
+
+void eval_primitive(stack_t *stack, char *command) {
+    if (!strcmp(command, "+")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a + b;
+    } else if (!strcmp(command, "-")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a - b;
+    } else if (!strcmp(command, "*")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a * b;
+    } else if (!strcmp(command, "/")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a / b;
+    } else if (!strcmp(command, "%")) {
+        int b = stack->vals[--stack->idx];
+        int a = stack->vals[--stack->idx];
+        stack->vals[stack->idx++] = a % b;
+    } else if (!strcmp(command, "swap")) {
+        int b = stack->vals[stack->idx - 1];
+        int a = stack->vals[stack->idx - 2];
+        stack->vals[stack->idx - 1] = a;
+        stack->vals[stack->idx - 2] = b;
+    } else if (!strcmp(command, "dup")) {
+        int a = stack->vals[stack->idx - 1];
+        stack->vals[stack->idx++] = a;
+    } else if (!strcmp(command, "drop")) {
+        stack->idx--;
+    } else if (!strcmp(command, "print")) {
+        for (int i = 0; i < stack->idx; i++) {
+            printf("[%d] = %d\n", i, stack->vals[i]);
+        }
+    }
+}
+
+void eval_line(stack_t *stack, userdef_t *defs, char *line) {
+    bool line_done = false;
+    bool word_done = false;
+    wordtype_t word_type = WORD_NONE;
+
+    char command[128];
+    int command_idx = 0;
+    int number = 0;
+
+    int i = 0;
+    while (!line_done) {
+        char ch = line[i];
+
+        if (!ch) {
+            line_done = true;
+            word_done = true;
+        } else if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
+            word_done = true;
+        } else if (ch >= '0' && ch <= '9') {
+            word_type = WORD_INT;
+            number = (number * 10) + (ch - '0');
+        } else {
+            word_type = WORD_COMMAND;
+            command[command_idx++] = ch;
+        }
+
+        if (word_done) {
+            switch (word_type) {
+            case WORD_INT:
+                stack->vals[stack->idx++] = number;
+                break;
+            case WORD_COMMAND: {
+                command[command_idx++] = 0;
+
+                bool found_cmd = false;
+                for (int i = 0; i < COMMAND_CNT; i++) {
+                  if (defs[i].used && !strcmp(command, defs[i].name)) {
+                    eval_line(stack, defs, defs[i].defn);
+                    found_cmd = true;
+                  }
+                }
+
+                if (!found_cmd) {
+                    eval_primitive(stack, command);
+                }
+                break;
+            }
+            case WORD_NONE:
+                break;
+            }
+
+            number = 0;
+            command_idx = 0;
+            word_type = WORD_NONE;
+            word_done = false;
+        }
+
+        i++;
+    }
+}
+
+void compile_line(userdef_t *defs, char *line) {
+  int def_target;
+  for (def_target = 0; def_target < COMMAND_CNT; def_target++) {
+    if (!defs[def_target].used) break;
+  }
+
+  defs[def_target].used = true;
+  int start_cmd;
+  int end_cmd;
+
+  for (int i = 1; i < LINE_SZ; i++) {
+    if (line[i] != ' ' && line[i] != '\n' && line[i] != '\r' && line[i] != '\t') {
+      start_cmd = i;
+      break;
+    }
+  }
+
+  for (int i = start_cmd; i < LINE_SZ; i++) {
+    if (line[i] == ' ' || line[i] == '\n' || line[i] == '\r' || line[i] == '\t') {
+      end_cmd = i;
+      break;
+    } else {
+      defs[def_target].name[i - start_cmd] = line[i];
+    }
+  }
+
+  memcpy(defs[def_target].defn, line + end_cmd, 4096 - end_cmd);
+}
+
+int main() {
+    char line[LINE_SZ];
+    stack_t stack = {0};
+    userdef_t userdefs[COMMAND_CNT] = {0};
+
+    while (fgets(line, LINE_SZ, stdin) != NULL) {
+        if (line[0] == ':') {
+            compile_line(userdefs, line);
+        } else {
+            eval_line(&stack, userdefs, line);
+        }
+    }
+
+    return 0;
+}
+```
+
+Once compiled, we can use this to define and run our own functions!
+
+```
+$ gcc rpn2.c -o rpn2
+$ ./rpn2
+: double dup +
+5 double print
+[0] = 10
+```
+
+# Compiling More Efficiently
+
+Right now the compiled representation is fairly inefficient. Every time you
+invoke a function its code has to be loaded and parsed, which includes wasting
+time on insignificant characters like whitespace.
+
+Instead of working with the raw command line, we can compile all functions into
+a series of basic operations ahead of time and store that. What kind of operations
+do we need?
+
+- Pushing an integer constant
+- Invoking a primitive
+- Invoking a user-defined function
+- Ending a function
+
+We can represent each of these instructions using a simple code, consisting of
+one prefix byte and some trailing data:
+
+```
+bytes
+
+Integer constant:
+
+0        1        2        3        4        5
++--------|--------|--------|--------|--------+
+|    0   |   32-bit integer constant         |
++--------|--------|--------|--------|--------+
+
+0        1        2        3        4        5
++--------|--------|--------+
+|    1   |   16-bit int    |
++--------|--------|--------+
+
+0        1        2
++--------|--------+
+|    2   | 8-bit  |
++--------|--------+
+
+Primitives:
+
+0        1
++--------+
+|    3   | +
++--------+
+|    4   | -
++--------+
+|    5   | *
++--------+
+|    6   | /
++--------+
+|    7   | %
++--------+
+|    8   | swap
++--------+
+|    9   | dup
++--------+
+|   10   | drop
++--------+
+|   11   | print
++--------+
+
+
+User-defined functions:
+
+0        1        2
++--------|--------+
+|   12   |  cmd   |
++--------|--------+
+
+Exit function:
+
+0        1
++--------+
+|   13   |
++--------+
+```
+
+This allows us to take something like the double function from before:
+
+```
+: double dup +
+```
+
+Which was executed by evaluating the 5-byte string `dup +` every time, and
+convert it to this compact 3-byte representation:
+
+```
+{11, 3, 13} 
+```
+
+One thing to note is that we could have defined a simpler encoding for integers
+which stores all values as 32-bit constants. While this would work, this has the
+effect of significantly bloating certain definitions like this alternative
+version of double:
+
+```
+: double 2 *
+```
+
+In this case, the fixed-width 32-bit encoding would produce something like this
+which is larger than the textual version by 5 bytes:
+
+```
+{0, 0, 0, 0, 2, 6, 13}
+```
+
+Instead, we can do much better if we use the more compact integer encoding. This
+is half the size of the original encoding and only one byte larger than the
+textual version:
+
+```
+{2, 2, 6, 13}
+```
+
+Also, it's important to notice that by choosing one byte to use for referencing
+user-defined functions, we're enshrining a limit of 127 functions at a deeper
+layer than we have up until this point. Before it was easy to change that limit
+by adjusting a #define, but now that limit is baked into our function
+representation.
+
+This is the next version of the RPN calculator utilizing a byte-code compiler:
+
+```
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define LINE_SZ 4096
+#define COMMAND_SZ 128
+#define COMMAND_CNT 32
+
+typedef enum { WORD_NONE, WORD_INT, WORD_COMMAND } wordtype_t;
+
+typedef enum {
+  CODE_I32,
+  CODE_I16,
+  CODE_I8,
+  CODE_ADD,
+  CODE_SUB,
+  CODE_MUL,
+  CODE_DIV,
+  CODE_MOD,
+  CODE_SWAP,
+  CODE_DUP,
+  CODE_DROP,
+  CODE_PRINT,
+  CODE_UDF,
+  CODE_EXIT
+} cmdtype_t;
+
+typedef struct {
+  int vals[LINE_SZ];
+  int idx;
+} stack_t;
+
+typedef struct {
+  bool used;
+  char name[COMMAND_SZ];
+  char defn[LINE_SZ];
+} userdef_t;
+
+void exec_code(stack_t *stack, userdef_t *defs, char *codes) {
+  int code_idx = 0;
+
+  while (true) {
+    cmdtype_t command = codes[code_idx++];
+    switch (command) {
+    case CODE_I8: {
+      int value = codes[code_idx++];
+      stack->vals[stack->idx++] = value;
+      break;
+    }
+    case CODE_I16: {
+      int value = 0;
+      value |= (codes[code_idx++] << 8);
+      value |= codes[code_idx++];
+      stack->vals[stack->idx++] = value;
+      break;
+    }
+    case CODE_I32: {
+      int value = 0;
+      value |= (codes[code_idx++] << 24);
+      value |= (codes[code_idx++] << 16);
+      value |= (codes[code_idx++] << 8);
+      value |= codes[code_idx++];
+      stack->vals[stack->idx++] = value;
+      break;
+    }
+    case CODE_ADD: {
+      int b = stack->vals[--stack->idx];
+      int a = stack->vals[--stack->idx];
+      stack->vals[stack->idx++] = a + b;
+      break;
+    }
+    case CODE_SUB: {
+      int b = stack->vals[--stack->idx];
+      int a = stack->vals[--stack->idx];
+      stack->vals[stack->idx++] = a - b;
+      break;
+    }
+    case CODE_MUL: {
+      int b = stack->vals[--stack->idx];
+      int a = stack->vals[--stack->idx];
+      stack->vals[stack->idx++] = a * b;
+      break;
+    }
+    case CODE_DIV: {
+      int b = stack->vals[--stack->idx];
+      int a = stack->vals[--stack->idx];
+      stack->vals[stack->idx++] = a / b;
+      break;
+    }
+    case CODE_MOD: {
+      int b = stack->vals[--stack->idx];
+      int a = stack->vals[--stack->idx];
+      stack->vals[stack->idx++] = a % b;
+      break;
+    }
+    case CODE_SWAP: {
+      int b = stack->vals[stack->idx - 1];
+      int a = stack->vals[stack->idx - 2];
+      stack->vals[stack->idx - 1] = a;
+      stack->vals[stack->idx - 2] = b;
+      break;
+    }
+    case CODE_DUP: {
+      int a = stack->vals[stack->idx - 1];
+      stack->vals[stack->idx++] = a;
+      break;
+    }
+    case CODE_DROP: {
+      stack->idx--;
+      break;
+    }
+    case CODE_PRINT: {
+      for (int i = 0; i < stack->idx; i++) {
+        printf("[%d] = %d\n", i, stack->vals[i]);
+      }
+      break;
+    }
+    case CODE_UDF: {
+      int value = codes[code_idx++];
+      exec_code(stack, defs, defs[value].defn);
+      break;
+    }
+    case CODE_EXIT:
+      return;
+    }
+  }
+}
+
+int compile_command(userdef_t *defs, char *command) {
+  for (int i = 0; i < COMMAND_CNT; i++) {
+    if (defs[i].used && !strcmp(command, defs[i].name)) {
+      return i;
+    }
+  }
+
+  if (!strcmp(command, "+")) {
+    return -CODE_ADD;
+  } else if (!strcmp(command, "-")) {
+    return -CODE_SUB;
+  } else if (!strcmp(command, "*")) {
+    return -CODE_MUL;
+  } else if (!strcmp(command, "/")) {
+    return -CODE_DIV;
+  } else if (!strcmp(command, "%")) {
+    return -CODE_MOD;
+  } else if (!strcmp(command, "swap")) {
+    return -CODE_SWAP;
+  } else if (!strcmp(command, "dup")) {
+    return -CODE_DUP;
+  } else if (!strcmp(command, "drop")) {
+    return -CODE_DROP;
+  } else if (!strcmp(command, "print")) {
+    return -CODE_PRINT;
+  } else {
+    return -CODE_EXIT;
+  }
+}
+
+void compile_line(userdef_t *defs, char *codes, char *line, int cmd_start) {
+  bool line_done = false;
+  bool word_done = false;
+  wordtype_t word_type = WORD_NONE;
+
+  char command[128];
+  int command_idx = 0;
+  int number = 0;
+  int code_idx = 0;
+
+  int i = cmd_start;
+  while (!line_done) {
+    char ch = line[i];
+
+    if (!ch) {
+      line_done = true;
+      word_done = true;
+    } else if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
+      word_done = true;
+    } else if (ch >= '0' && ch <= '9') {
+      word_type = WORD_INT;
+      number = (number * 10) + (ch - '0');
+    } else {
+      word_type = WORD_COMMAND;
+      command[command_idx++] = ch;
+    }
+
+    if (word_done) {
+      switch (word_type) {
+      case WORD_INT:
+        if (abs(number) <= (1 << 7) - 1) {
+          codes[code_idx++] = CODE_I8;
+          codes[code_idx++] = number & 0xFF;
+        } else if (abs(number) <= (1 << 15) - 1) {
+          codes[code_idx++] = CODE_I16;
+          codes[code_idx++] = (number >> 8) & 0xFF;
+          codes[code_idx++] = number & 0xFF;
+        } else {
+          codes[code_idx++] = CODE_I32;
+          codes[code_idx++] = (number >> 24) & 0xFF;
+          codes[code_idx++] = (number >> 16) & 0xFF;
+          codes[code_idx++] = (number >> 8) & 0xFF;
+          codes[code_idx++] = number & 0xFF;
+        }
+        break;
+      case WORD_COMMAND: {
+        command[command_idx++] = 0;
+
+        int command_code = compile_command(defs, command);
+        if (command_code >= 0) {
+          codes[code_idx++] = CODE_UDF;
+          codes[code_idx++] = (char)command_code;
+        } else {
+          codes[code_idx++] = (char)-command_code;
+        }
+        break;
+      }
+      case WORD_NONE:
+        break;
+      }
+
+      number = 0;
+      command_idx = 0;
+      word_type = WORD_NONE;
+      word_done = false;
+    }
+
+    i++;
+  }
+
+  codes[code_idx++] = CODE_EXIT;
+}
+
+int main() {
+  char line[LINE_SZ];
+  char def[COMMAND_SZ];
+  stack_t stack = {0};
+  userdef_t userdefs[COMMAND_CNT] = {0};
+  char line_defn[LINE_SZ];
+
+  while (fgets(line, LINE_SZ, stdin) != NULL) {
+    if (line[0] == ':') {
+      int def_target;
+      for (def_target = 0; def_target < COMMAND_CNT; def_target++) {
+        if (!userdefs[def_target].used)
+          break;
+      }
+
+      userdefs[def_target].used = true;
+      int start_cmd;
+      int end_cmd;
+
+      for (int i = 1; i < LINE_SZ; i++) {
+        if (line[i] != ' ' && line[i] != '\n' && line[i] != '\r' &&
+            line[i] != '\t') {
+          start_cmd = i;
+          break;
+        }
+      }
+
+      for (int i = start_cmd; i < LINE_SZ; i++) {
+        if (line[i] == ' ' || line[i] == '\n' || line[i] == '\r' ||
+            line[i] == '\t') {
+          end_cmd = i;
+          break;
+        } else {
+          userdefs[def_target].name[i - start_cmd] = line[i];
+        }
+      }
+
+      compile_line(userdefs, userdefs[def_target].defn, line, end_cmd);
+    } else {
+      compile_line(userdefs, line_defn, line, 0);
+      exec_code(&stack, userdefs, line_defn);
+    }
+  }
+
+  return 0;
+}
+```
diff --git a/rpnjit-2.md b/rpnjit-2.md
@@ -0,0 +1,1055 @@
+In the previous post we walked through constructing a basic programmable RPN
+calculator, from the most basic form with no user definitions at all, to a more
+advanced interpreter which runs user code after it's been compiled to byte-code.
+
+We can use the byte-code calculator as a skeleton for our JIT calculator, but
+getting to that point requires a lot of build-up to discuss issues of instruction
+encoding and the subtleties of integrating our C driver with code that we're
+going to produce and execute at runtime. 
+
+# x64 Instruction Layouts
+
+There are two big options we can take in determining how user-defined functions
+and primitives work together. One is similar to the method we've been using up
+to this point, where user-defined code is implemented in our target language
+(machine code in this case) and primitives are implemented in C:
+
+```
+Driver (parser/compiler)       C
++-- User-defined code          x64
+    +-- Primitive operations   C
+```
+
+The problem with this approach is that C has a very specific calling convention
+which we would rather avoid embedding into our compiled output. Since the C
+compiler is allowed to clobber several registers we may want to use and has a
+specific expected stack layout, our machine code would need to interface with
+that calling convention. That bloats the target code and makes it harder to
+write our JIT compiler.
+
+The alternative would be to inline the primitive operations into the code we
+generate for user functions, so that there's no separation and no need to adhere
+to the C calling convention:
+
+```
+Driver (parser/compiler)                    C
++-- User-defined code + Primitives          x64
+```
+
+This leaves us with the burden of understanding the encoding of more
+instructions than we would need to know for the first approach, but it's a
+worthy trade-off to keep our compiler simple.
+
+The instructions that we're going to need here are:
+
+- 32-bit addition, subtraction, multiplication, division and modulo
+- Loading constant values
+- Pushing and popping values from the data stack
+
+One quirk of a lot of CPU architectures is that division and modulo are fused
+into one operation that calculates both. That means that we really just need
+to know how to do division, and how to pull out the division result or the
+remainder result as need be.
+
+In addition to knowing the instructions involved, we'll also have to pick out
+registers so that we know what the operations will be using. Since we'll be
+going to this level of detail it will make sense to write out each primitive
+operation as a series of x64 assembly instructions and then hand-compile them
+to figure out what should go in our program.
+
+The first simple operation is addition which takes two values off of the stack,
+adds them, and then pushes the result back onto the stack. At this point we've
+left the idea of "the stack" implicit, but we'll have to decide now where the
+stack is actually going to live. We'll also have to decide where to keep the
+current stack offset pointer. 
+
+%rdi and %rsi are good choices for the stack top and offset because they will
+be easy to fill directly from the driver-side C code. The C calling convention
+on Linux specifies that the first argument goes into %rdi and the second into
+%rsi, so it will be easy to call a UDF from C: 
+`((void(*)(int*)) udf_code)(stack_top, stack_idx)`
+
+Keeping that in mind, we'll write addition this way:
+
+```
+decl %esi
+movl (%rdi, %rsi, 4), %eax
+decl %esi
+movl (%rdi, %rsi, 4), %edx
+addl %eax, %edx
+movl %edx, (%rdi, %rsi, 4)
+incl %esi
+```
+
+In x64 instructions that operate on 64-bit operands require a special prefix
+called the RAX prefix. To avoid having to write that down, we use 32-bit
+increments and decrements when managing the stack offset. This is safe to do
+partially because the stack offset is not very large, and partially because
+x64 will clear the upper 32-bits of a 64-bit register when you write to the
+bottom 32-bits. In this case, decrementing %esi will clear the upper 32-bits
+of %rsi
+
+Now we have to compile this into machine code. This will require some knowledge
+of not only how instructions like dec and mov are encoded, but also about the
+scaled addressing scheme we're using here. The main reference for all these
+encodings is the IA-64 Architecture Manual which you can get from Intel. I won't
+go into the details of these encodings, but suffice to say they were by far the
+most difficult part to decipher!
+
+```
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 04 B7 // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+01 C2    // ADD ModRM(%edx, %eax)
+89 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+FF C6    // INC ModRM(%esi, opcode: 0)
+```
+
+Subtraction is very similar to addition:
+
+```
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 04 B7 // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+29 C2    // SUB ModRM(%edx, %eax)
+89 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+FF C6    // INC ModRM(%esi, opcode: 0)
+```
+
+Multiplication:
+
+```
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 04 B7 // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+0F AF D0 // IMUL ModRM(%edx, %eax)
+89 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+FF C6    // INC ModRM(%esi, opcode: 0)
+```
+
+Signed division is special because there are no two argument forms like the
+other operations. There's only one argument which serves as the second half
+of the division, with the first half and the result/remainder going into
+fixed registers.
+
+That means that we're going to have to adjust our assembly template slightly
+to ensure that edx is clear:
+
+```
+decl %esi
+movl (%rdi, %rsi, 4), %ecx
+decl %esi
+movl (%rdi, %rsi, 4), %eax
+xorl %edx, %edx
+idiv %ecx
+movl %eax, (%rdi, %rsi, 4)
+incl %esi
+```
+
+Which we can encode as:
+
+```
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 0C B7 // MOV ModRM(%ecx) SIB(%rdi + 4*%rsi)
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 04 B7 // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+31 D2    // XOR ModRM(%edx, %edx)
+F7 F9    // IDIV ModRM(%ecx, opcode: 7)
+89 04 B7 // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+FF C6    // INC ModRM(%esi, opcode: 0)
+```
+
+Modulo is almost the same as division, except that we take the opposite result
+register:
+
+```
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 0C B7 // MOV ModRM(%ecx) SIB(%rdi + 4*%rsi)
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 04 B7 // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+31 D2    // XOR ModRM(%edx, %edx)
+F7 F9    // IDIV ModRM(%ecx, opcode: 7)
+89 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+FF C6    // INC ModRM(%esi, opcode: 0)
+```
+
+The stack operations are all very simple, and in fact we've seen most of them at
+this point. First, drop:
+
+```
+FF CE      // DEC ModRM(%esi, opcode: 1)
+```
+
+Then dup:
+
+```
+FF CE    // DEC ModRM(%esi, opcode: 1)
+8B 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+FF C6    // INC ModRM(%esi, opcode: 0)
+89 14 B7 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+FF C6    // INC ModRM(%esi, opcode: 0)
+```
+
+And finally swap, which we'll implement like this:
+
+```
+8B 54 B7 FC // MOV ModRM(%edx) SIB(%rdi + 4*%rsi) - 4
+8B 44 B7 F8 // MOV ModRM(%eax) SIB(%rdi + 4*%rsi) - 8
+89 44 B7 FC // MOV ModRM(%eax) SIB(%rdi + 4*%rsi) - 4
+89 54 B7 F8 // MOV ModRM(%edx) SIB(%rdi + 4*%rsi) - 8
+```
+
+We also have to implement pushing 32-bit integer constants, which we'll do like
+this:
+
+```
+B8 XX XX XX XX // MOV(%eax) IMM32(x)
+89 04 B7 // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+FF C6    // INC ModRM(%esi, opcode: 0)
+```
+
+# Calling Conventions
+
+With all of the "direct" primitives down, we can focus on the two remaining hard
+cases: calling user-defined functions and implementing print. 
+
+Since we're implementing our own function calls, we'll have to decide on our own
+calling convention. We could borrow the one from C, but there's not much point
+since we don't have things like local variables or efficient register allocation
+which C's calling convention is explicitly designed to support. Instead, we can
+avoid most of that work by setting a couple of ground rules:
+
+- %rdi always contains the address of the top of the stack (the least recently
+  pushed value)
+
+- %rsi always contains the current stack offset in terms of integer cells
+
+## Encoding UDF
+
+If we keep to these rules by not using %rdi or %rsi except for these purposes
+then the calling convention is easy. Just use the native call and ret
+instructions to manage the return address and don't clobber %rdi or %rsi
+in functions. We satisfied the second of those rules in the above encodings
+so we just have to implement the first here:
+
+```
+movabs %rax, <udf-jit-address>
+call *%rax
+```
+
+Which assembles into:
+
+```
+48 B8 XX XX XX XX XX XX XX XX // REX.W MOVABS(%rax) <udf-jit-address-be>
+FF D0                         // CALL ModRM(%rax, opcode: 2)
+```
+
+On the other end, returning from a function is just the `ret` instruction, which
+is encoded as:
+
+```
+C3 // RET
+```
+
+## Encoding Print
+
+We're in a different situation with print because we have to deal with the C
+calling convention, regardless of how we handle it. To avoid bloating the
+generated code with the assembly version of the printing loop, we'll write that
+bit in C and pass the address of the function down. 
+
+That means that we'll need another reserved register. %rbp is a good choice for
+this purpose since it's usually used for implementing local variables, which our
+language doesn't support. Also, the C calling convention says that called
+functions must preserve it which means that there's one less register we have
+to take care of.
+
+Since %rbp is taken care of by the callee, all we have to do is save %rdi and
+%rsi. We don't even have to assign them different values. Because the print
+function will have the prototype `void print(int *stack, int offset)`, the stack
+will go in %rdi (where it already is) and the offset will go in %rsi (where it
+also already is).
+
+The only other concern we have to worry about here is stack alignment. When we
+call print the stack has to be aligned to 16 bytes, which won't automatically
+be the case if we just pushed %rdi and %rsi with no padding:
+
+```
+0  ------------ previous function alignment boundary
+8  return addr  (pushed implicitly by call)
+16 saved %rdi
+24 saved %rsi
+```
+
+We'll need an extra 8 bytes to get the right alignment:
+
+```
+0  ------------ previous function alignment boundary
+8  return addr  (pushed implicitly by call)
+16 saved %rdi
+24 saved %rsi
+32 padding
+```
+
+With all that in mind, this is what the complete call to print looks like:
+
+```
+pushq %rdi
+pushq %rsi
+subq %rsp, 8 
+call *%rbp
+addq %rsp, 8
+popq %rsi
+popq %rdi
+```
+
+After assembling, it looks like this:
+
+```
+57                            // PUSH(%rdi)
+56                            // PUSH(%rsi)
+48 83 EC 08                   // REX.W SUB ModRM(%rsp, opcode: 5)
+FF D5                         // CALL ModRM(%rax, opcode: 2)
+48 83 C4 08                   // REX.W ADD ModRM(%rsp, opcode: 0)
+5E                            // POP(%rsi)
+5F                            // POP(%rdi)
+```
+
+## Gluing Our Calling Convention With The C Calling Convention
+
+Because we made the choice to use %rbp for storing the address of print, we'll
+have to do a few things to make sure that C code can call our JIT output safely:
+
+- We'll have to preserve the %rbp from the outer code, since the C calling
+  convention requires it and we'll definitely be overwriting it
+
+- We'll have to take in the address of print via the third standard argument
+  register, %rdx, and move it into %rbp
+
+Since this stub is going to be the same everywhere, it makes sense to make it
+generic. That means that we'll actually be taking 4 arguments instead of 3:
+
+- The stack base address (%rdi)
+
+- The current stack offset (%rsi)
+
+- The address of the print function (%rdx)
+
+- The address of the JIT code to chain to (%rcx)
+
+We'll also have to get the current stack offset back out of this function, which
+just requires copying the stack offset into %eax and using it as our value.
+
+We can use this as our stub:
+
+```
+pushq %rbp
+movq %rdx, %rbp
+call *%rcx
+popq %rbp
+movq %rsi, %rax
+ret
+```
+
+Which turns into this after assembling:
+
+```
+55       // PUSH(%rbp)
+48 89 D5 // REX.W MOV ModRM(%rbp, %rdx)
+FF D1    // CALL ModRM(%rcx, opcode: 2)
+5D       // POP(%rbp)
+48 89 F0 // REX.W MOV ModRM(%rax, %rsi)
+C3       // RET
+```
+
+## Missing Padding?
+
+After the discussion of padding in the previous sections, one thing stands out
+about the UDF calling convention we established earlier. It doesn't ensure that
+the padding of the stack is preserved across calls.
+
+Consider what happens if we have a definition like this:
+
+```
+: myprint print
+: dbgsquare dup * myprint
+```
+
+If we invoke it then the call stack will start out 16-byte aligned at the end
+of our JIT transfer stub:
+
+```
+0   ---------- C code alignment boundary
+8   return addr
+16  saved %rbp
+```
+
+Then we'll get into dbqsquare and get out of alignment. Everything's going to
+plan so far: if we called print directly within dbqsquare, the padding we add
+would align us properly again. 
+
+```
+0   ---------- C code alignment boundary
+8   return addr
+16  saved %rbp
+24  stub addr
+```
+
+Instead we call myprint and suddenly we're back in alignment again! At this
+point it's clear what the problem is: if we want to ensure that we have a
+specific alignment at one point, we have to preserve that alignment all
+the way to that point. Otherwise different code paths could put the stack
+in and out of alignment at times we don't expect.
+
+```
+0   ---------- C code alignment boundary
+8   return addr
+16  saved %rbp
+24  stub addr
+32  dbgsquare addr
+```
+
+The easy way to fix this is to advance the stack pointer at the start of
+every user-defined function and fix it before we return. This means that
+our function prologue (which we didn't previously have before!) looks like
+this:
+
+```
+48 83 EC 08 // REX.W SUB ModRM(%rsp, opcode: 5)
+```
+
+Our return code looks like this:
+
+```
+48 83 C4 08 // REX.W ADD ModRM(%rsp, opcode: 0)
+C3          // RET
+```
+
+This simplifies the code we use to call other functions though. Now that
+the stack is properly aligned by the function prologue, each individual
+call doesn't have to worry about setting up the alignment on its own:
+
+```
+57    // PUSH(%rdi)
+56    // PUSH(%rsi)
+FF D5 // CALL ModRM(%rbp, opcode: 2)
+5E    // POP(%rsi)
+5F    // POP(%rdi)
+```
+
+Let's run through the stack mapping exercise one more time to make sure
+that this change does actually fix things:
+
+```
+0   ---------- C code alignment boundary
+8   return addr
+16  saved %rbp
+24  stub addr
+32  padding
+40  dbgsquare addr
+48  padding
+```
+
+Success!
+
+# Invoking JIT Code At Runtime
+
+At this point the last piece of our puzzle is how to figure out how to store
+this code in such a way that we can execute it. The obvious solution would be to
+use stack memory:
+
+```
+#include <stdio.h>
+
+void write_machinecode(char *buffer) {
+    // MOVL %eax, 42 | MOV(%eax) IMM32(42)
+    //   B8 2A 00 00 00
+    // RET
+    //   C3
+    buffer[0] = 0xb8;
+    buffer[1] = 0x2a;
+    buffer[2] = 0x00;
+    buffer[3] = 0x00;
+    buffer[4] = 0x00;
+    buffer[5] = 0xc3;
+}
+
+int main() {
+    char code[6];
+    write_machinecode(code);
+
+    int (*fptr)() = (int (*)()) code;
+    int x = (*fptr)();
+    printf("code = %d\n", x);
+}
+```
+
+The problem is that this doesn't work if you try it:
+
+```
+$ gcc rpn4.c -o rpn4
+$ ./rpn4
+segmentation fault
+```
+
+The reason for this failure is due to a policy known as
+[W^X](https://en.wikipedia.org/wiki/W%5EX). W^X is meant to avoid attacks where
+somebody can exploit a buffer overflow by writing some machine code (or
+shellcode as the security community calls it) to the stack and then run that
+code. The reason this interferes here is that we're taking that exact strategy
+to run our own code, minus the part about it being an exploit.
+
+The usual way to avoid this is to use something called mprotect, which lets us
+set permissions on a chunk of memory that we own. We can't do it with malloc
+because the permissions are on the level of a memory page, and malloc won't
+necessarily hand us back our own page. Instead we'll have to use mmap:
+
+```
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+void write_machinecode(char *buffer) {
+    // MOVL %eax, 42 | MOV(%eax) IMM32(42)
+    //   B8 2A 00 00 00
+    // RET
+    //   C3
+    buffer[0] = 0xb8;
+    buffer[1] = 0x2a;
+    buffer[2] = 0x00;
+    buffer[3] = 0x00;
+    buffer[4] = 0x00;
+    buffer[5] = 0xc3;
+}
+
+int main() {
+    char *code = mmap(NULL, 6, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    write_machinecode(code);
+
+    int (*fptr)() = (int (*)()) code;
+    int x = (*fptr)();
+    printf("code = %d\n", x);
+}
+```
+
+Note that we map the memory with full permissions upfront. This is generally
+considered bad practice since that defeats the point of W^X (if someone can
+get a reference to the page, they can run arbitrary code) but we're going
+to ignore that concern for brevity.
+
+# Bringing It All Together
+
+Now that we have all the pieces in place, we can bring them together into the
+final result: a programmable RPN calculator backed by a JIT compiler:
+
+```
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#define LINE_SZ 4096
+#define COMMAND_SZ 128
+#define COMMAND_CNT 32
+
+typedef enum { WORD_NONE, WORD_INT, WORD_COMMAND } wordtype_t;
+
+typedef struct {
+  bool used;
+  char name[COMMAND_SZ];
+  unsigned char *defn;
+} userdef_t;
+
+void print(int *stack_top, int stack_offset) {
+  for (int i = 0; i < stack_offset; i++) {
+    printf("[%d] = %d\n", i, stack_top[i]);
+  }
+}
+
+int compile_command(userdef_t *defs, char *command, unsigned char *code_base,
+                    int code_idx) {
+  for (int i = 0; i < COMMAND_CNT; i++) {
+    if (defs[i].used && !strcmp(command, defs[i].name)) {
+        // REX.W MOVABS(%rax) <udf-address>
+        code_base[code_idx++] = 0x48;
+        code_base[code_idx++] = 0XB8;
+
+        *((long *)(code_base + code_idx)) = (long)defs[i].defn;
+        code_idx += 8;
+
+        // CALL *%rax
+        code_base[code_idx++] = 0XFF;
+        code_base[code_idx++] = 0XD0;
+        return code_idx;
+    }
+  }
+
+  if (!strcmp(command, "+")) {
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x04;
+    code_base[code_idx++] = 0xB7;
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // ADD ModRM(%edx, %eax)
+    code_base[code_idx++] = 0x01;
+    code_base[code_idx++] = 0xC2;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x89;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // INC ModRM(%esi, opcode: 0)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xC6;
+    return code_idx;
+  } else if (!strcmp(command, "-")) {
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x04;
+    code_base[code_idx++] = 0xB7;
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // SUB ModRM(%edx, %eax)
+    code_base[code_idx++] = 0x29;
+    code_base[code_idx++] = 0xC2;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x89;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // INC ModRM(%esi, opcode: 0)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xC6;
+    return code_idx;
+  } else if (!strcmp(command, "*")) {
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x04;
+    code_base[code_idx++] = 0xB7;
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // IMUL ModRM(%edx, %eax)
+    code_base[code_idx++] = 0x0F;
+    code_base[code_idx++] = 0xAF;
+    code_base[code_idx++] = 0xD0;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x89;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // INC ModRM(%esi, opcode: 0)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xC6;
+    return code_idx;
+  } else if (!strcmp(command, "/")) {
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%ecx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x0C;
+    code_base[code_idx++] = 0xB7;
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x04;
+    code_base[code_idx++] = 0xB7;
+    // XOR ModRM(%edx, %edx)
+    code_base[code_idx++] = 0x31;
+    code_base[code_idx++] = 0xD2;
+    // IDIV ModRM(%ecx, opcode: 7)
+    code_base[code_idx++] = 0xF7;
+    code_base[code_idx++] = 0xF9;
+    // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x89;
+    code_base[code_idx++] = 0x04;
+    code_base[code_idx++] = 0xB7;
+    // INC ModRM(%esi, opcode: 0)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xC6;
+    return code_idx;
+  } else if (!strcmp(command, "%")) {
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%ecx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x0C;
+    code_base[code_idx++] = 0xB7;
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x04;
+    code_base[code_idx++] = 0xB7;
+    // XOR ModRM(%edx, %edx)
+    code_base[code_idx++] = 0x31;
+    code_base[code_idx++] = 0xD2;
+    // IDIV ModRM(%ecx, opcode: 7)
+    code_base[code_idx++] = 0xF7;
+    code_base[code_idx++] = 0xF9;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x89;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // INC ModRM(%esi, opcode: 0)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xC6;
+    return code_idx;
+  } else if (!strcmp(command, "swap")) {
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi) - 4
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x54;
+    code_base[code_idx++] = 0xB7;
+    code_base[code_idx++] = 0xFC;
+    // MOV ModRM(%eax) SIB(%rdi + 4*%rsi) - 8
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x44;
+    code_base[code_idx++] = 0xB7;
+    code_base[code_idx++] = 0xF8;
+    // MOV ModRM(%eax) SIB(%rdi + 4*%rsi) - 4
+    code_base[code_idx++] = 0x89;
+    code_base[code_idx++] = 0x44;
+    code_base[code_idx++] = 0xB7;
+    code_base[code_idx++] = 0xFC;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi) - 8
+    code_base[code_idx++] = 0x89;
+    code_base[code_idx++] = 0x54;
+    code_base[code_idx++] = 0xB7;
+    code_base[code_idx++] = 0xF8;
+    return code_idx;
+  } else if (!strcmp(command, "dup")) {
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x8B;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // INC ModRM(%esi, opcode: 0)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xC6;
+    // MOV ModRM(%edx) SIB(%rdi + 4*%rsi)
+    code_base[code_idx++] = 0x89;
+    code_base[code_idx++] = 0x14;
+    code_base[code_idx++] = 0xB7;
+    // INC ModRM(%esi, opcode: 0)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xC6;
+    return code_idx;
+  } else if (!strcmp(command, "drop")) {
+    // DEC ModRM(%esi, opcode: 1)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xCE;
+    return code_idx;
+  } else if (!strcmp(command, "print")) {
+    // PUSH(%rdi)
+    code_base[code_idx++] = 0x57;
+    // PUSH(%rsi)
+    code_base[code_idx++] = 0x56;
+    // CALL ModRM(%rbp, opcode: 2)
+    code_base[code_idx++] = 0xFF;
+    code_base[code_idx++] = 0xD5;
+    // POP(%rsi)
+    code_base[code_idx++] = 0x5E;
+    // POP(%rdi)
+    code_base[code_idx++] = 0x5F;
+    return code_idx;
+  } else {
+    // REX.W ADD ModRM(%rsp, opcode: 0)
+    code_base[code_idx++] = 0x48;
+    code_base[code_idx++] = 0x83;
+    code_base[code_idx++] = 0xC4;
+    code_base[code_idx++] = 0x08;
+    // RET
+    code_base[code_idx++] = 0xC3;
+    return code_idx;
+  }
+}
+
+void compile_line(userdef_t *defs, unsigned char *code_base, char *line, int cmd_start) {
+  bool line_done = false;
+  bool word_done = false;
+  wordtype_t word_type = WORD_NONE;
+
+  char command[128];
+  int command_idx = 0;
+  int number = 0;
+  int code_idx = 0;
+
+  // REX.W SUB ModRM(%rsp, opcode: 5)
+  code_base[code_idx++] = 0x48;
+  code_base[code_idx++] = 0x83;
+  code_base[code_idx++] = 0xEC;
+  code_base[code_idx++] = 0x08;
+
+  int i = cmd_start;
+  while (!line_done) {
+    char ch = line[i];
+
+    if (!ch) {
+      line_done = true;
+      word_done = true;
+    } else if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
+      word_done = true;
+    } else if (ch >= '0' && ch <= '9') {
+      word_type = WORD_INT;
+      number = (number * 10) + (ch - '0');
+    } else {
+      word_type = WORD_COMMAND;
+      command[command_idx++] = ch;
+    }
+
+    if (word_done) {
+      switch (word_type) {
+      case WORD_INT:
+        // MOV(%eax) IMM32(x)
+        code_base[code_idx++] = 0xB8;
+        *((int*) (code_base + code_idx)) = number;
+        code_idx += 4;
+        // MOV ModRM(%eax) SIB(%rdi + 4*%rsi)
+        code_base[code_idx++] = 0x89;
+        code_base[code_idx++] = 0x04;
+        code_base[code_idx++] = 0xB7;
+        // INC ModRM(%esi, opcode: 0)
+        code_base[code_idx++] = 0xFF;
+        code_base[code_idx++] = 0xC6;
+        break;
+      case WORD_COMMAND: {
+        command[command_idx++] = 0;
+        code_idx = compile_command(defs, command, code_base, code_idx);
+        break;
+      }
+      case WORD_NONE:
+        break;
+      }
+
+      number = 0;
+      command_idx = 0;
+      word_type = WORD_NONE;
+      word_done = false;
+    }
+
+    i++;
+  }
+
+  // REX.W ADD ModRM(%rsp, opcode: 0)
+  code_base[code_idx++] = 0x48;
+  code_base[code_idx++] = 0x83;
+  code_base[code_idx++] = 0xC4;
+  code_base[code_idx++] = 0x08;
+  // RET
+  code_base[code_idx++] = 0xC3;
+}
+
+int main() {
+  char line[LINE_SZ];
+  char def[COMMAND_SZ];
+  int stack[4096];
+  int stack_offset = 0;
+
+  unsigned char* glue_page = mmap(NULL, 11, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  int glue_offset = 0;
+  // PUSH(%rbp)
+  glue_page[glue_offset++] = 0x55;
+  // REX.W MOV ModRM(%rbp, %rdx)
+  glue_page[glue_offset++] = 0x48;
+  glue_page[glue_offset++] = 0x89;
+  glue_page[glue_offset++] = 0xD5;
+  // CALL ModRM(%rcx, opcode: 2)
+  glue_page[glue_offset++] = 0xFF;
+  glue_page[glue_offset++] = 0xD1;
+  // POP(%rbp)
+  glue_page[glue_offset++] = 0x5D;
+  // REX.W MOV ModRM(%rax, %rsi)
+  glue_page[glue_offset++] = 0x48;
+  glue_page[glue_offset++] = 0x89;
+  glue_page[glue_offset++] = 0xF0;
+  // RET
+  glue_page[glue_offset++] = 0xC3;
+  int (*glue_func)(int*, int, void(*)(int*, int), unsigned char*) = (void*) glue_page;
+
+  unsigned char *interp_page = mmap(NULL, LINE_SZ, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+  userdef_t userdefs[COMMAND_CNT] = {0};
+  char line_defn[LINE_SZ];
+
+  while (fgets(line, LINE_SZ, stdin) != NULL) {
+    if (line[0] == ':') {
+      int def_target;
+      for (def_target = 0; def_target < COMMAND_CNT; def_target++) {
+        if (!userdefs[def_target].used)
+          break;
+      }
+
+      userdefs[def_target].used = true;
+      userdefs[def_target].defn = mmap(NULL, LINE_SZ, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      int start_cmd;
+      int end_cmd;
+
+      for (int i = 1; i < LINE_SZ; i++) {
+        if (line[i] != ' ' && line[i] != '\n' && line[i] != '\r' &&
+            line[i] != '\t') {
+          start_cmd = i;
+          break;
+        }
+      }
+
+      for (int i = start_cmd; i < LINE_SZ; i++) {
+        if (line[i] == ' ' || line[i] == '\n' || line[i] == '\r' ||
+            line[i] == '\t') {
+          end_cmd = i;
+          break;
+        } else {
+          userdefs[def_target].name[i - start_cmd] = line[i];
+        }
+      }
+
+      compile_line(userdefs, userdefs[def_target].defn, line, end_cmd);
+    } else {
+      compile_line(userdefs, interp_page, line, 0);
+      stack_offset = (*glue_func)(stack, stack_offset, print, interp_page);
+    }
+  }
+
+  return 0;
+}
+```
+
+It doesn't add any extra capabilities, but knowing how it works internally it's
+cool just to watch it function:
+
+```
+$ gcc -g rpn5.c -o rpn5
+$ ./rpn5
+: square dup *
+6 square
+print
+[0] = 36
+```
+
+It's even more interesting single-stepping through the generated assembly in
+GDB; it's entirely clear that we've left C land by the fact that GDB can't even
+decipher our calling convention:
+
+```
+$ gdb ./rpn5
+(gdb) b 392
+(gdb) r
+: square dup *
+6 square print
+
+Breakpoint 1, main () at rpn5.c:392
+392           stack_offset = (*glue_func)(stack, stack_offset, print, interp_page);
+(gdb) layout asm
+(gdb) si # Repeat this until you're past the callq instruction
+(gdb) bt
+#0  0x00007ffff7fcf000 in ?? ()
+#1  0x0000555555556622 in main () at rpn5.c:392
+(gdb) si # Continue until you're past the next callq instruction
+(gdb) bt
+0x00007ffff7fcf001 in ?? ()
+0x00007ffff7fcf004 in ?? ()
+0x00007ffff7fce000 in ?? ()
+(gdb) q
+```
+
+# Performance
+
+Looking back, we've certainly ended up with a more complex program here than we
+started with.  The justification for this increase in complexity has been
+performance, that at each level by  trading code simplicity for more efficient
+repesentations we could run user programs more efficiently.
+
+The question is, has that actually happened? Or have we been deluding ourselves?
+
+First we need a good test case. I'm going to be taking another page out of the
+security playbook here and make a highly recursive test based upon the
+[Billion Laughs Attack](https://en.wikipedia.org/wiki/Billion_laughs_attack).
+The core idea of this test case is that by combining several layers of recursion
+we can easily run the same test millions of times without having to write all
+of those invocations:
+
+```
+$ cat stress.txt
+: stress 1 1 + drop
+: stressA stress stress stress stress stress stress stress stress stress stress
+: stressB stressA stressA stressA stressA stressA stressA stressA stressA stressA stressA
+: stressC stressB stressB stressB stressB stressB stressB stressB stressB stressB stressB
+: stressD stressC stressC stressC stressC stressC stressC stressC stressC stressC stressC
+: stressE stressD stressD stressD stressD stressD stressD stressD stressD stressD stressD
+: stressF stressE stressE stressE stressE stressE stressE stressE stressE stressE stressE
+: stressG stressF stressF stressF stressF stressF stressF stressF stressF stressF stressF
+: stressH stressG stressG stressG stressG stressG stressG stressG stressG stressG stressG
+stressH
+```
+
+First, let's run it against the naive interpreter. We'll recompile it with full
+optimizations to give it a fighting chance:
+
+```
+$ gcc -O3 rpn2.c -o rpn2
+$ time rpn2 < stress.txt
+./rpn2 < stress.txt  23.07s user 0.00s system 99% cpu 23.101 total
+```
+
+Ouch, this test pegs one of the cores of my i5-6500 for over 23 seconds! This
+leaves a lot of room for improvement with the bytecode interpreter:
+
+```
+$ gcc -O3 rpn3.c -o rpn3
+$ time rpn3 < stress.txt
+./rpn3 < stress.txt  1.28s user 0.00s system 99% cpu 1.282 total
+```
+
+This is already a substantial improvment from the naive interpreter. It turns
+out that parsing and reparsing all those command lines really has a cost!
+
+Finally, the JIT compiler:
+
+```
+$ gcc -O3 rpn5.c -o rpn5
+$ time rpn5 < stress.txt
+./rpn5 < stress.txt  0.19s user 0.00s system 99% cpu 0.192 total
+```
+
+I had to double check this to make sure that it was actually running the code I
+had provided. A fraction of a second seems way too fast for doing this operation
+100 million times. However, after a trip through GDB this time does indeed check
+out - it performs the sum and drops the result each time.
+
+Just to give an overview of this speed difference, we can line up the numbers
+together to see how dramatic the result is:
+
+- rpn2: 23.101s / 1.000x
+- rpn3:  1.282s / 0.055x
+- rpn5:  0.192s / 0.008x
No results found