From b0a387d5a1d58da53918222112200c9c0f8a1ad0 Mon Sep 17 00:00:00 2001
From: Peter Fidelman <peter.s.fidelman>
Date: Sun, 29 May 2022 11:18:35 -0700
Subject: [PATCH] Replace frustration.rs with literate-programmed version that
 matches the README.  Use idiomatic Rust constructors.

---
 README.md      |   44 +-
 frustration.rs | 3196 ++++++++++++++++++++++++++----------------------
 2 files changed, 1765 insertions(+), 1475 deletions(-)
diff --git a/README.md b/README.md
index 826376c..1b45266 100644
--- a/README.md
+++ b/README.md
@@ -199,6 +199,18 @@ out with print!().
         self.tos = (self.tos.wrapping_sub(1)) & (N - 1);
         return val;
     }
+```
+
+Finally, here is a function that creates a new stack.
+Because these are circular stacks it doesn't matter where top-of-stack
+(tos) starts off pointing.  I arbitrarily set it to the highest index so
+the first value pushed will wind up at index 0, again because this
+makes the stack look nicer when printed out.
+
+```rust
+    fn new() -> Stack<N> {
+        return Stack {tos: N-1, mem: [0; N]};
+    }
 }
 ```
 ### Designing a stack CPU
@@ -243,25 +255,21 @@ struct Core {
 }
 ```
 
-Now for a helper to initialize the CPU.
+Finally, let's write a function that creates and returns a CPU for us to use.
 
 ```rust
-fn new_core() -> Core {
-    let c = Core {
-        ram: [0; ADDRESS_SPACE],
-        ip: 0,
-        dstack: Stack {tos: 15, mem: [0; 16]},
-        rstack: Stack {tos: 31, mem: [0; 32]}};
+use std::convert::TryInto;
 
-    return c;
-}
+impl Core {
+    fn new() -> Core {
+        return Core {
+            ram: [0; ADDRESS_SPACE],
+            ip: 0,
+            dstack: Stack::new(),
+            rstack: Stack::new()}
+    }
 ```
 
-Because these are circular stacks it doesn't matter where top-of-stack
-(tos) starts off pointing.  I arbitrarily set it to the highest index so
-the first value pushed will wind up at index 0, again because this
-makes the stack look nicer when printed out.
-
 ## 1.1 - The instruction set
 
 Now we have a CPU sitting there but it does nothing.
@@ -290,13 +298,9 @@ So now we will make the CPU do those things.
 We'll start off by teaching it how to access memory, and then we will
 define the instruction set.
 
-```rust
-impl Core {
-```
-
 ### Memory access
 
-Start with a helper to read a number from the specified memory address.
+Now let's write a function to read a number from the specified memory address.
 
 ```rust
     fn load(&self, addr: u16) -> u16 {
@@ -2642,7 +2646,7 @@ Finally, start the machine.
 ```rust
 fn main() {
     /* Create the machine */
-    let mut c = new_core();
+    let mut c = Core::new();
 
     /* Put the dictionary into memory */
     build_dictionary(&mut c);
diff --git a/frustration.rs b/frustration.rs
index 0f13117..7c016fe 100644
--- a/frustration.rs
+++ b/frustration.rs
@@ -1,106 +1,158 @@
-/* Project URL: https://gitlab.cs.washington.edu/fidelp/frustration */
+//@ Project URL: https://gitlab.cs.washington.edu/fidelp/frustration
+//@
+//@ Frustration - Escaping a Turing Tar Pit with Forth
+//@
+//@ # What is this file?
+//@
+//@ This is a tutorial that will show you how to bootstrap an interactive
+//@ programming environment from a small amount of code.
+//@
+//@ First we will design a virtual computer.
+//@
+//@ Then we will design software to run on that computer, to enable REPL-style
+//@ interactive programming.
+//@
+//@ A REPL is a
+//@ "[Read, Evaluate, Print loop](https://en.wikipedia.org/wiki/Repl)".
+//@ A REPL lets you type code at
+//@ the keyboard and immediately get a result back.  You can also define
+//@ functions, including functions that change how the environment works in
+//@ fundamental ways.
+//@
+//@ # What is Forth?
+//@
+//@ Forth is the programming language we will use with our computer.
+//@
+//@ Forth was invented by Chuck Moore in the 1960s as a tool for quickly
+//@ coming to grips with new computer systems.
+//@
+//@ > "Let us imagine a situation in which you have access to
+//@ > your computer. I mean sole user sitting at the board with
+//@ > all the lights, for some hours at a time. This is
+//@ > admittedly an atypical situation, but one that can
+//@ > always be arranged if you are competent, press hard, and
+//@ > will work odd hours. Can you and the computer write a
+//@ > program? Can you write a program that didn't descend from
+//@ > a pre-existing program? You can learn a bit and have a
+//@ > lot of fun trying."
+//@ > 
+//@ > -- Chuck Moore,
+//@ > ["Programming a Problem-Oriented Language"](https://colorforth.github.io/POL.htm),
+//@ > 1970
+//@
+//@ As you will see, it does not take much work to get Forth running on a
+//@ new machine, including a machine with a completely unfamiliar instruction
+//@ set.
+//@
+//@ But before we can do any of that we will need a machine.  Let's make one.
+//@
+//@ # Table of Contents
+//@ - Part 1 - The Computer
+//@   - 1.0 - Designing the CPU
+//@     - Defining a stack
+//@     - Designing a stack CPU
+//@   - 1.1 - The instruction set
+//@     - Memory access
+//@     - Designing the instruction set
+//@       - The CALL instruction
+//@       - Data processing instructions
+//@       - The LITERAL instruction
+//@     - Making the CPU run
+//@       - Return-stack instructions
+//@       - Memory instructions
+//@       - Stack shuffling instructions
+//@       - Conditional skip instruction
+//@       - Arithmetic and logic
+//@       - Input/output
+//@ - Part 2 - The Program
+//@     - Designing the Forth dictionary
+//@     - Tools for building the Forth dictionary
+//@     - Building the Forth dictionary
+//@       - Subroutine threading
+//@       - key
+//@       - emit
+//@       - subtraction
+//@       - 0= (compare-to-zero)
+//@       - = (equals)
+//@   - 2.1 - The lexer
+//@     - Skipping whitespace
+//@     - Reading characters into a buffer
+//@       - over
+//@       - 2dup
+//@       - The input buffer
+//@       - min
+//@       - c@ and c! (byte-by-byte memory access)
+//@       - Filling the input buffer
+//@       - word
+//@   - 2.2 - Dictionary lookup
+//@       - latest
+//@       - find
+//@       - ' (quote)
+//@   - 2.3 - The outer interpreter
+//@       - here
+//@       - Achieving interactivity
+//@       - immediate
+//@       - [ and ]
+//@       - smudge and unsmudge
+//@       - , (comma)
+//@       - number
+//@       - literal
+//@   - 2.4 - Defining subroutines
+//@     - create
+//@     - : (define word)
+//@     - ; (end of definition)
+//@   - Miscellanea
+//@ - Part 3 - Using the interactive programming environment
+//@
+//@ # Part 1 - The Computer
+//@
+//@ ## 1.0 - Designing the CPU
+//@
+//@ This computer will have a 16-bit CPU.  It will be able to access
+//@ 2^16 (65536) memory locations, numbered 0 to 65535.
+//@ Each of these locations, 0 to 65535, is called a "memory address".
 
-use std::io;
-use std::io::Read;
-use std::io::Write;
-use std::convert::TryInto;
-
-/* What is this file?
- *
- * This is a tutorial that will show you how to bootstrap an interactive
- * programming environment from a small amount of code.
- *
- * First we will design a virtual computer.
- *
- * Then we will design software to run on that computer, to enable REPL-style
- * interactive programming.
- *
- * A REPL is a "Read, Evaluate, Print loop".  A REPL lets you type code at
- * the keyboard and immediately get a result back.  You can also define
- * functions, including functions that change how the environment works in
- * fundamental ways.
- */
-
-/* What is Forth?
- *
- * Forth is the programming language we will use with our computer.
- *
- * Forth was invented by Chuck Moore in the 1960s as a tool for quickly
- * coming to grips with new computer systems.
- *
- * "Let us imagine a situation in which you have access to
- * your computer. I mean sole user sitting at the board with
- * all the lights, for some hours at a time. This is
- * admittedly an atypical situation, but one that can
- * always be arranged if you are competent, press hard, and
- * will work odd hours. Can you and the computer write a
- * program? Can you write a program that didn't descend from
- * a pre-existing program? You can learn a bit and have a
- * lot of fun trying."
-
- * -- Chuck Moore, "Programming a Problem-Oriented Language", 1970
- *    https://colorforth.github.io/POL.htm
- *
- * As you will see, it does not take much work to get Forth running on a
- * new machine, including a machine with a completely unfamiliar instruction
- * set.
- *
- * But before we can do any of that we will need a machine.  Let's make one.
- */
-
-/* Table of Contents
- *      Part 1 - The Computer
- *          Part 1a - The instruction set
-
- *      Part 2 - The Program
- *          Part 2a - The lexer
- *          Part 2b - The outer interpreter
-
- *      Part 3 - Using the interactive programming environment
- */
-
-/* ---------------------------------------------------------------------------
- *                           Part 1 - The Computer
- * ------------------------------------------------------------------------- */
-
-/* This computer will have a 16-bit CPU.  It will be able to access
- * 2^16 (65536) memory locations, numbered 0 to 65535.
- * Each of these locations, 0 to 65535, is called a "memory address".
- */
 const ADDRESS_SPACE: usize = 65536;
 
-/* The job of a CPU is to load numbers from memory, do math or logic on them,
- * then write the resulting number back into memory.
- *
- * The CPU needs a temporary place to hold numbers while it is working with
- * them.
- *
- * In most CPUs, this place is called a "register".  Registers work like
- * variables in a programming language but there are only a few of them
- * (most CPUs have between 1 and 32).
- *
- * On 64-bit ARM the registers are named  r0, r1, ..., r15.
- * On 64-bit Intel they are instead named rax, rbx, ....
- * Just in case those names ring any bells.
- *
- * Having immediate access to dozens of registers is quite handy, but it means
- * many choices are available to the programmer, or more likely, to the
- * compiler.  And making good choices is Hard.  A lot of work goes into
- * deciding what variable to store in what register ("register allocation") and
- * when to dump register contents back into memory ("spilling").
- *
- * Our CPU avoids these problems by not having registers; instead we store
- * numbers in a stack.
- * - Putting a number onto the top of the stack is called "push".
- * - Taking the most recent number off the top of the stack is called "pop".
- *
- * The CPU can only access the value that was most recently pushed onto the
- * stack.  This may seem like a big limitation right now but you will see ways
- * of dealing with it.
- *
- * The choice to use a stack instead of registers makes our CPU a
- * "stack machine" as opposed to a "register machine".
- */
+//@ The job of a CPU is to load numbers from memory, do math or logic on them,
+//@ then write the resulting number back into memory.
+//@
+//@ The CPU needs a temporary place to hold numbers while it is working with
+//@ them.
+//@
+//@ In most CPUs, this place is called a "register".  Registers work like
+//@ variables in a programming language but there are only a few of them
+//@ (most CPUs have between 1 and 32).
+//@
+//@ - On 64-bit ARM the registers are named  x0, x1, ..., x30.
+//@ - On 64-bit Intel they are instead named rax, rbx, ....
+//@
+//@ Just in case those names ring any bells.
+//@
+//@ Having immediate access to dozens of registers is quite handy, but it means
+//@ many choices are available to the programmer, or more likely, to the
+//@ compiler.  And making good choices is Hard.  A lot of work goes into
+//@ deciding what variable to store in what register
+//@ ("[register allocation](https://en.wikipedia.org/wiki/Register_allocation)")
+//@ and when to dump register contents back into memory ("spilling").
+//@
+//@ Our CPU avoids these problems by not having registers; instead we store
+//@ numbers in a stack.
+//@
+//@ - Putting a number onto the top of the stack is called "push".
+//@ - Taking the most recent number off the top of the stack is called "pop".
+//@
+//@ The CPU can only access the value that was most recently pushed onto the
+//@ stack.  This may seem like a big limitation right now but you will see ways
+//@ of dealing with it.
+//@
+//@ The choice to use a stack instead of registers makes our CPU a
+//@ "[stack machine](https://en.wikipedia.org/wiki/Stack_machine)"
+//@ as opposed to a "register machine".
+//@
+//@ ### Defining a stack
+//@
+//@ This stack is fixed-size and can hold N values.
 
 #[derive(Debug)]
 struct Stack<const N: usize> {
@@ -108,314 +160,345 @@ struct Stack<const N: usize> {
     tos: usize  /* top-of-stack */
 }
 
+//@ First we'll need a function to add a number to the stack.
+//@
+//@ When a fixed-size stack fills up, there is a failure case
+//@ (stack overflow) that must be handled somehow.
+//@
+//@ This particular stack is a circular stack, meaning that if
+//@ it ever fills up, it will discard the oldest entry instead of
+//@ signaling an error.  The lack of error handling makes the CPU
+//@ simpler.
+
 impl<const N: usize> Stack<N> {
-    /* Add a number to the stack. */
     fn push(&mut self, val: u16) {
         self.tos = (self.tos.wrapping_add(1)) & (N - 1);
-
-        /* This stack is fixed-size and can hold N values.
-         *
-         * When a fixed-size stack fills up, there is a failure case
-         * (stack overflow) that must be handled somehow.
-         *
-         * This particular stack is a circular stack, meaning that if
-         * it ever fills up, it will discard the oldest entry instead of
-         * signaling an error.  The lack of error handling makes the CPU
-         * simpler.
-         */
-
         self.mem[self.tos] = val;
     }
 
-    /* Return the most recently pushed number. */
+//@ We'll also need a function to remove & return the most recently pushed
+//@ number.
+
     fn pop(&mut self) -> u16 {
         let val = self.mem[self.tos];
         self.mem[self.tos] = 0;
 
-        /* You don't have to set the value back to zero.  I am only doing
-         * this because it makes makes the stack look nicer when dumped
-         * out with print!().
-         */
+//@ You don't have to set the value back to zero.  I am only doing
+//@ this because it makes the stack look nicer when dumped
+//@ out with print!().
 
         self.tos = (self.tos.wrapping_sub(1)) & (N - 1);
         return val;
     }
-}
 
-/* Now that we have a stack let's use one!  Or two?
- *
- * Why two stacks?
- *
- * The first stack is called the "data stack" and is used instead of
- * registers, as already described.
- *
- * The second stack will be called the "return stack".  This one holds
- * subroutine return addresses.  Don't worry if you don't know what that
- * means; we'll get to it later when we talk about the instruction set.
- *
- * In addition to stacks we are going to give the CPU a couple more things:
- *
- * 1. An "instruction pointer", which holds the memory address of the next
- *    instruction that the CPU will execute.
- *
- * 2. To make life simpler we put main memory straight on "the CPU" even
- *    though in a real computer, RAM would be off-chip and accessed through a
- *    data bus.
- */
+//@ Finally, here is a function that creates a new stack.
+//@ Because these are circular stacks it doesn't matter where top-of-stack
+//@ (tos) starts off pointing.  I arbitrarily set it to the highest index so
+//@ the first value pushed will wind up at index 0, again because this
+//@ makes the stack look nicer when printed out.
+
+    fn new() -> Stack<N> {
+        return Stack {tos: N-1, mem: [0; N]};
+    }
+}
+//@ ### Designing a stack CPU
+//@
+//@ Now that we have a stack let's use one in our CPU!  Or two?
+//@
+//@ Why two stacks?
+//@
+//@ The first stack is called the "data stack" and is used instead of
+//@ registers, as already described.
+//@
+//@ The second stack will be called the "return stack".  This one holds
+//@ subroutine return addresses.  Don't worry if you don't know what that
+//@ means; we'll get to it later when we talk about the instruction set.
+//@
+//@ In addition to stacks we are going to give the CPU a couple more things:
+//@
+//@ 1. An "instruction pointer", which holds the memory address of the next
+//@    instruction that the CPU will execute.
+//@
+//@ 2. To make life simpler we put main memory straight on "the CPU" even
+//@    though in a real computer, RAM would be off-chip and accessed through a
+//@    data bus.
+//@
+//@ In our memory, each of the 65536 possible memory addresses will store one
+//@ 8-bit byte (u8 data type in Rust).  This makes it a 65536 byte (64 KB)
+//@ memory.  We could have chosen to make each memory address store 16 bits
+//@ instead.  That would make this a "word-addressed memory".  Instead we are
+//@ going with the
+//@ "[byte-addressed memory](https://en.wikipedia.org/wiki/Byte_addressing)"
+//@ that is more conventional in today's
+//@ computers.  This choice is arbitrary.
+//@
+//@ Let's add those things to the CPU.
 
 struct Core {
     ram: [u8; ADDRESS_SPACE],
-
-    /* In our memory, each of the 65536 possible memory addresses will store
-     * one 8-bit byte (u8 data type in Rust).  This makes it a 65536 byte
-     * (64 KB) memory.
-     *
-     * We could have chosen to make each memory address store 16-bits instead.
-     * That would make this a "word-addressed memory".
-     *
-     * Instead we are going with the "byte-addressed memory" that is more
-     * conventional in today's computers.  This choice is arbitrary.
-     */
-
     ip: u16,  /* instruction pointer */
     dstack: Stack<16>, /* data stack */
     rstack: Stack<32>  /* return stack */
 }
 
-/* Helper to initialize the CPU.
- * There is probably a better idiom for this but I am bad at rust */
-fn new_core() -> Core {
-    let c = Core {
-        ram: [0; ADDRESS_SPACE],
-        ip: 0,
-        dstack: Stack {tos: 15, mem: [0; 16]},
-        rstack: Stack {tos: 31, mem: [0; 32]}};
+//@ Finally, let's write a function that creates and returns a CPU for us to use.
 
-    /* Because these are circular stacks it doesn't matter where top-of-stack
-     * starts off pointing.  I arbitrarily set it to the highest index so
-     * the first value pushed will wind up at index 0, again because this
-     * makes the stack look nicer when printed out.
-     */
-
-    return c;
-}
-
-/* ---------------------------------------------------------------------------
- * Part 1a - The instruction set
- * ------------------------------------------------------------------------- */
-
-/* Now we have a CPU sitting there but it does nothing.
- *
- * A working CPU would execute a list of instructions.  An instruction is
- * a number that is a command for the CPU.  For example:
- *
- * 65522 might mean "add the top two values on the data stack".
- * 65524 might mean "invert the bits of the top value on the data stack".
- *
- * The map of instruction-to-behavior comes from the CPU's
- * "instruction set" i.e. the set of all possible instructions and their
- * behaviors.
- *
- * Normally you program a CPU by putting instructions into memory and then
- * telling the CPU the memory address where it can find the first instruction.
- *
- * The CPU will:
- * 1. Fetch the instruction (load it from memory)
- * 2. Decode the instruction (look it up in the instruction set)
- * 3. Execute that instruction (do the thing the instruction set said to do)
- * 4. Move on to the next instruction and repeat.
- *
- * So now we will make the CPU do those things.
- * We'll start off by teaching it how to access memory, and then we will
- * define the instruction set.
- */
+use std::convert::TryInto;
 
 impl Core {
-    /* Helper to read a number from the specified memory address. */
-    fn load(&self, addr: u16) -> u16 {
-        let a = addr as usize;
-
-        /* We immediately run into trouble because we are using byte-addressed
-         * memory as mentioned earlier.
-         *
-         * Each memory location stores 8 bits (a byte)
-         *
-         * Our CPU operates on 16 bit values and we want each memory operation
-         * to read/write 16 bits at a time for efficiency reasons.
-         *
-         * What do we do?
-         *
-         * This CPU chooses to do the following:
-         * - Read the low  byte of the 16-bit number from address a
-         * - Read the high byte of the 16-bit number from address a+1
-         *
-         * 16 bit number in CPU: [00000000 00000001]        = 1
-         *                        |        |
-         *                        |        memory address a = 1
-         *                        |
-         *                        memory address a+1        = 0
-         *
-         * This is called "little endian" because the low byte comes first.
-         *
-         * We could have just as easily done the opposite:
-         * - Read the high byte of the 16-bit number from address a
-         * - Read the low  byte of the 16-bit number from address a+1
-         *
-         * 16 bit number in CPU: [00000000 00000001]          = 1
-         *                        |        |
-         *                        |        memory address a+1 = 1
-         *                        |
-         *                        memory address a            = 0
-         *
-         * This is called "big endian" because the high byte comes first.
-         */
-
-        return u16::from_le_bytes(self.ram[a..=a+1].try_into().unwrap());
-
-        /* The le in this function call stands for little-endian. */
+    fn new() -> Core {
+        return Core {
+            ram: [0; ADDRESS_SPACE],
+            ip: 0,
+            dstack: Stack::new(),
+            rstack: Stack::new()}
     }
 
-    /* Helper to write a number to the specified memory address. */
+//@ ## 1.1 - The instruction set
+//@
+//@ Now we have a CPU sitting there but it does nothing.
+//@
+//@ A working CPU would execute a list of instructions.  An instruction is
+//@ a number that is a command for the CPU.  For example:
+//@
+//@ - 65522 might mean "add the top two values on the data stack".
+//@ - 65524 might mean "invert the bits of the top value on the data stack".
+//@
+//@ The map of instruction-to-behavior comes from the CPU's
+//@ "instruction set" i.e. the set of all possible instructions and their
+//@ behaviors.
+//@
+//@ Normally you program a CPU by putting instructions into memory and then
+//@ telling the CPU the memory address where it can find the first instruction.
+//@
+//@ The CPU will:
+//@
+//@ 1. Fetch the instruction (load it from memory)
+//@ 2. Decode the instruction (look it up in the instruction set)
+//@ 3. Execute that instruction (do the thing the instruction set said to do)
+//@ 4. Move on to the next instruction and repeat.
+//@
+//@ So now we will make the CPU do those things.
+//@ We'll start off by teaching it how to access memory, and then we will
+//@ define the instruction set.
+//@
+//@ ### Memory access
+//@
+//@ Now let's write a function to read a number from the specified memory address.
+
+    fn load(&self, addr: u16) -> u16 {
+
+//@ We immediately run into trouble because we are using byte-addressed
+//@ memory as mentioned earlier.
+//@
+//@ Each memory location stores 8 bits (a byte).
+//@
+//@ Our CPU operates on 16 bit values and we want each memory operation
+//@ to read/write 16 bits at a time for efficiency reasons.
+//@
+//@ What do we do?
+//@
+//@ This CPU chooses to do the following:
+//@
+//@ - Read the low byte of the 16-bit number from address a
+//@ - Read the high byte of the 16-bit number from address a+1
+//@
+//@ ```
+//@ 16 bit number in CPU: [00000000 00000001]        = 1
+//@                        |        |
+//@                        |        memory address a = 1
+//@                        |
+//@                        memory address a+1        = 0
+//@ ```
+//@
+//@ This is called
+//@ "[little endian](https://en.wikipedia.org/wiki/Endianness)"
+//@ because the low byte comes first.
+//@
+//@ We could have just as easily done the opposite:
+//@
+//@ - Read the high byte of the 16-bit number from address a
+//@ - Read the low  byte of the 16-bit number from address a+1
+//@
+//@ ```
+//@ 16 bit number in CPU: [00000000 00000001]          = 1
+//@                        |        |
+//@                        |        memory address a+1 = 1
+//@                        |
+//@                        memory address a            = 0
+//@ ```
+//@
+//@ This is called "big endian" because the high byte comes first.
+//@
+//@ The "le" in the function call below stands for little-endian.
+
+        let a = addr as usize;
+        return u16::from_le_bytes(self.ram[a..=a+1].try_into().unwrap());
+    }
+
+//@ Writing to memory is very similar; it just works in the opposite direction.
+
     fn store(&mut self, addr: u16, val: u16) {
         let a = addr as usize;
         self.ram[a..=a+1].copy_from_slice(&val.to_le_bytes());
     }
 
-    /* With that taken care of, we can get around to defining the CPU's
-     * instruction set.
-     *
-     * Each instruction on this CPU will be the same size, 16 bits, for
-     * the following reasons:
-     *
-     * 1. Instruction fetch always completes in 1 read.  You never have to
-     *    go back and fetch more bytes.
-     *
-     * 2. If you put the first instruction at an even numbered address then
-     *    you know all the rest of the instructions will also be at even
-     *    numbered addresses.  I will take advantage of this later.
-     *
-     * 3. A variable length encoding would save space but 2 bytes per
-     *    instruction is already pretty small so it doesn't matter very much.
-     *
-     * Here are the instructions I picked.
-     *
-     *  CALL
-     *  ------------------------------------------------------------+----
-     *  | n   n   n   n   n   n   n   n   n   n   n   n   n   n   n | 0 |
-     *  ------------------------------------------------------------+----
-     *
-     * What CALL does:
-     * ---------------
-     *  - Push instruction pointer onto the return stack.
-     *  - Set instruction pointer to address nnnnnnnnnnnnnnn0.
-     *
-     * This lets you call a subroutine at any even numbered address
-     * from 0 to 65534.
-     *
-     * Why this is useful:
-     * -------------------
-     * Together with the return stack, CALL lets you call subroutines.
-     *
-     * A subroutine is a list of instructions that does something
-     * useful and then returns control to the caller.
-     *
-     * For example:
-     *
-     * Address   Instruction   Meaning
-     * 100 ->           200    Call 200
-     * 102 ->           ???    Add the top two values on the data stack.
-     * ...
-     * 200 ->           ???    Push the value 3 onto the data stack
-     * 202 ->           ???    Push the value 4 onto the data stack
-     * 204 ->           ???    Return to caller
-     *
-     * Don't worry about the other instructions I am using here.  I will
-     * define them later.
-     *
-     * I mostly want to point out the three instructions that I put
-     * at address 200 because they are a subroutine,
-     * a small self contained piece of code (6 bytes) that
-     * performs a specific task.
-     *
-     * Do you think it's cool that you can count exactly how many bytes it
-     * took?  I think it's cool.
-     *
-     * Here is what happens when the CPU begins execution at address 100.
-     *
-     * Address   Data stack   Return stack
-     * 100       []           []    <--- About to call subroutine...
-     * 200       []           [102]
-     * 202       [3]          [102]
-     * 204       [3 4]        [102] <--- About to return from subroutine...
-     * 102       [3 4]        []
-     * 104       [5]          []
-     *
-     * The return stack is there to make sure that returning from a subroutine
-     * goes back to where it came from.  We will talk more about the return
-     * stack later when we talk about the RET instruction.
-     *
-     * Limitations of CALL:
-     * --------------------
-     * This CPU cannot call an instruction that starts at an odd address.
-     * a.k.a. "unaligned call" is impossible.
-     *
-     * At first this seems like a limitation, but it really isn't.
-     * If you put the first instruction at an even numbered address then
-     * all the rest of the instructions will also be at even numbered
-     * addresses.  So this works fine.
-     *
-     * Of course if you intersperse instructions and data in memory...
-     *            _________
-     *  ________ |_________| _____________
-     * |________|    Data   |_____________|
-     * Instructions         More instructions
-     *
-     * ...then you will have to be careful to make sure the second block
-     * of instructions also starts at an even numbered address.
-     * You might need to include an extra byte of data as "padding".
-     *
-     *  Data processing instructions
-     *  --------------------------------------------+---------------+----
-     *  | 1   1   1   1   1   1   1   1   1   1   1 | x   x   x   x | 0 |
-     *  --------------------------------------------+---------------+----
-     * Sixteen of the even numbers are reserved for additional instructions
-     * that will be be described later.
-     *
-     * The even numbers 1111111111100000 to 1111111111111110 (65504 to 65534)
-     * are reserved for these instructions.  This means that CALL 65504 through
-     * CALL 65534 are not possible.  Put another way, it is not possible to
-     * call a subroutine living in the top 32 bytes of memory.  This is not a
-     * very severe limitation.
-     *
-     *  LITERAL
-     *  ------------------------------------------------------------+----
-     *  | n   n   n   n   n   n   n   n   n   n   n   n   n   n   n | 1 |
-     *  ------------------------------------------------------------+----
-     *
-     * What LITERAL does
-     * -----------------
-     * - Place the value 0nnnnnnnnnnnnnnn on the data stack.
-     *
-     * Why this is useful:
-     * -------------------
-     * Program will often need to deal with constant numbers.
-     * For example, you might want to add 2 to a memory address (to move
-     * on to the next even-numbered address) or add 32 to a character code
-     * (to convert an uppercase letter to lowercase).  These constants have
-     * to come from somewhere.
-     *
-     * Limitations of LITERAL:
-     * -----------------------
-     * To differentiate it from a call, this instruction is always an
-     * odd number.  The trailing 1 is discarded before placing the number on
-     * the data stack.  This missing bit means that only 2^15 values can be
-     * represented (0 to 32767).  32768 on up cannot be stored directly.
-     * You would need to do some follow-up math to get these numbers.
-     * The most direct way is to use the INV instruction, described later.
-     */
-
-     /* Now that the instruction set is generally described
-      * let's look at the code that implements it */
+//@ With that taken care of, we can get around to defining the CPU's
+//@ instruction set.
+//@
+//@ ### Designing the instruction set
+//@
+//@ Each instruction on this CPU will be the same size, 16 bits, for
+//@ the following reasons:
+//@
+//@ 1. Instruction fetch always completes in 1 read.  You never have to
+//@    go back and fetch more bytes.
+//@
+//@ 2. If you put the first instruction at an even numbered address then
+//@    you know all the rest of the instructions will also be at even
+//@    numbered addresses.  I will take advantage of this later.
+//@
+//@ 3. A variable length encoding would save space but 2 bytes per
+//@    instruction is already pretty small so it doesn't matter very much.
+//@
+//@ Here are the instructions I picked.
+//@
+//@ #### The CALL instruction
+//@
+//@ ```
+//@ CALL
+//@ ------------------------------------------------------------+----
+//@ | n   n   n   n   n   n   n   n   n   n   n   n   n   n   n | 0 |
+//@ ------------------------------------------------------------+----
+//@ ```
+//@
+//@ ##### What CALL does
+//@
+//@ - Push instruction pointer onto the return stack.
+//@ - Set instruction pointer to address nnnnnnnnnnnnnnn0.
+//@
+//@ This lets you call a subroutine at any even numbered address
+//@ from 0 to 65534.
+//@
+//@ ##### Why this is useful
+//@
+//@ Together with the return stack, CALL lets you call subroutines.
+//@
+//@ A subroutine is a list of instructions that does something
+//@ useful and then returns control to the caller.
+//@
+//@ For example:
+//@
+//@ ```
+//@ Address   Instruction   Meaning
+//@ 100 ->           200    Call 200
+//@ 102 ->           ???    Add the top two values on the data stack.
+//@ ...
+//@ 200 ->           ???    Push the value 3 onto the data stack
+//@ 202 ->           ???    Push the value 4 onto the data stack
+//@ 204 ->           ???    Return to caller
+//@ ```
+//@
+//@ Don't worry about the other instructions I am using here.  I will
+//@ define them later.
+//@
+//@ I mostly want to point out the three instructions that I put
+//@ at address 200 because they are a subroutine,
+//@ a small self contained piece of code (6 bytes) that
+//@ performs a specific task.
+//@
+//@ Do you think it's cool that you can count exactly how many bytes it
+//@ took?  I think it's cool.
+//@
+//@ Here is what happens when the CPU begins execution at address 100.
+//@
+//@ ```
+//@ Address   Data stack   Return stack
+//@ 100       []           []    <--- About to call subroutine...
+//@ 200       []           [102]
+//@ 202       [3]          [102]
+//@ 204       [3 4]        [102] <--- About to return from subroutine...
+//@ 102       [3 4]        []
+//@ 104       [7]          []
+//@ ```
+//@
+//@ The return stack is there to make sure that returning from a subroutine
+//@ goes back to where it came from.  We will talk more about the return
+//@ stack later when we talk about the RET instruction.
+//@
+//@ ##### Limitations of CALL:
+//@
+//@ This CPU cannot call an instruction that starts at an odd address.
+//@
+//@ At first this seems like a limitation, but it really isn't.
+//@ If you put the first instruction at an even numbered address then
+//@ all the rest of the instructions will also be at even numbered
+//@ addresses.  So this works fine.
+//@
+//@ Of course if you intersperse instructions and data in memory...
+//@
+//@ ```
+//@            _________
+//@  ________ |_________| _____________
+//@ |________|    Data   |_____________|
+//@ Instructions         More instructions
+//@ ```
+//@
+//@ ...then you will have to be careful to make sure the second block
+//@ of instructions also starts at an even numbered address.
+//@ You might need to include an extra byte of data as
+//@ "[padding](https://en.wikipedia.org/wiki/Data_structure_alignment#Data_structure_padding)".
+//@
+//@ #### Data processing instructions
+//@ ```
+//@ Data processing instructions
+//@ --------------------------------------------+---------------+----
+//@ | 1   1   1   1   1   1   1   1   1   1   1 | x   x   x   x | 0 |
+//@ --------------------------------------------+---------------+----
+//@ ```
+//@
+//@ Sixteen of the even numbers are reserved for additional instructions
+//@ that will be described later.
+//@
+//@ The even numbers 1111111111100000 to 1111111111111110 (65504 to 65534)
+//@ are reserved for these instructions.  This means that CALL 65504 through
+//@ CALL 65534 are not possible.  Put another way, it is not possible to
+//@ call a subroutine living in the top 32 bytes of memory.  This is not a
+//@ very severe limitation.
+//@
+//@ #### The LITERAL instruction
+//@ ```
+//@ LITERAL
+//@ ------------------------------------------------------------+----
+//@ | n   n   n   n   n   n   n   n   n   n   n   n   n   n   n | 1 |
+//@ ------------------------------------------------------------+----
+//@ ```
+//@
+//@ ##### What LITERAL does
+//@
+//@ - Place the value 0nnnnnnnnnnnnnnn on the data stack.
+//@
+//@ ##### Why this is useful:
+//@
+//@ Programs will often need to deal with constant numbers.
+//@ For example, you might want to add 2 to a memory address (to move
+//@ on to the next even-numbered address) or add 32 to a character code
+//@ (to convert an uppercase letter to lowercase).  These constants have
+//@ to come from somewhere.
+//@
+//@ ##### Limitations of LITERAL:
+//@
+//@ To differentiate it from a call, this instruction is always an
+//@ odd number.  The trailing 1 is discarded before placing the number on
+//@ the data stack.  This missing bit means that only 2^15 values can be
+//@ represented (0 to 32767).  32768 on up cannot be stored directly.
+//@ You would need to do some follow-up math to get these numbers.
+//@ The most direct way is to use the INV instruction, described later.
+//@
+//@ ### Making the CPU run
+//@
+//@ Now that the instruction set is generally described let's look at
+//@ the code that implements it.
 
     fn step(&mut self) {
 
@@ -428,6 +511,7 @@ impl Core {
         /* 2. Decode and execute the instruction */
 
         if (opcode >= 0xffe0) && (opcode & 1 == 0) {
+
             /* Data processing instruction */
 
             PRIMITIVES[((opcode - 0xffe0) >> 1) as usize](self);
@@ -444,6 +528,7 @@ impl Core {
              * The table will be described below, and these instructions
              * explained.
              */
+
         }
         else if (opcode & 1) == 1 {
             /* Literal */
@@ -457,7 +542,11 @@ impl Core {
     }
 }
 
-/* The names of the 16 remaining CPU instructions */
+//@ The CALL and LITERAL instructions are directly handled above.
+//@
+//@ The 16 data processing instructions are each assigned a number in the
+//@ appropriate range that we carved out for them...
+
 enum Op {
     RET = 0xffe0, TOR = 0xffe2, RTO = 0xffe4, LD  = 0xffe6,
     ST  = 0xffe8, DUP = 0xffea, SWP = 0xffec, DRP = 0xffee,
@@ -465,29 +554,38 @@ enum Op {
     AND = 0xfff8, INV = 0xfffa, GEQ = 0xfffc, IO  = 0xfffe,
 }
 
+//@ ...which is then looked up in the table below.  This table gives each
+//@ instruction its unique behavior.
+
 type Primitive = fn(&mut Core);
 
-/* A table of functions for each of the 16 remaining CPU instructions */
 const PRIMITIVES: [Primitive; 16] = [
-    /* Return-stack instructions */
+
+//@ #### Return-stack instructions
+
     | x | {
         /* RET - Return from subroutine */
         x.ip = x.rstack.pop()
     },
+
     | x | {
         /* TOR - Transfer number from data stack to return stack */
         x.rstack.push(x.dstack.pop())
     },
+
     | x | {
         /* RTO - Transfer number from return stack to data stack */
         x.dstack.push(x.rstack.pop())
     },
-    /* Memory instructions */
+
+//@ #### Memory instructions
+
     | x | {
         /* LD - Load number from memory address specified on the data stack */
         let a = x.dstack.pop();
         x.dstack.push(x.load(a));
     },
+
     | x | {
         /* ST - Store number to memory address specified on the data stack */
         let a = x.dstack.pop();
@@ -495,18 +593,17 @@ const PRIMITIVES: [Primitive; 16] = [
         x.store(a, v);
     },
 
-    /* Stack shuffling instructions
-     *
-     * Remember the problem of "register allocation" mentioned earlier,
-     * and how stack machines are supposed to avoid that problem?  Well,
-     * nothing comes for free.  Stack machines can only process the top
-     * value(s) on the stack.  So sometimes you will have to do some work
-     * to "unbury" a crucial value and move it to the top of the stack.
-     * That's what these instructions are for.
-     *
-     * Their use will become more obvious when we start programming the
-     * machine, soon.
-     */
+//@ #### Stack shuffling instructions
+//@
+//@ Remember the problem of "register allocation" mentioned earlier,
+//@ and how stack machines are supposed to avoid that problem?  Well,
+//@ nothing comes for free.  Stack machines can only process the top
+//@ value(s) on the stack.  So sometimes you will have to do some work
+//@ to "unbury" a crucial value and move it to the top of the stack.
+//@ That's what these instructions are for.
+//@
+//@ Their use will become more obvious when we start programming the
+//@ machine, soon.
 
     | x | {
         /* DUP - Duplicate the top number on the data stack */
@@ -514,6 +611,7 @@ const PRIMITIVES: [Primitive; 16] = [
         x.dstack.push(v);
         x.dstack.push(v);
     },
+
     | x | {
         /* SWP - Exchange the top two numbers on the data stack */
         let v1 = x.dstack.pop();
@@ -521,35 +619,40 @@ const PRIMITIVES: [Primitive; 16] = [
         x.dstack.push(v1);
         x.dstack.push(v2);
     },
+
     | x | {
         /* DRP - Discard the top number on the data stack */
         let _ = x.dstack.pop();
     },
-    /* Conditional skip instruction */
+
+//@ #### Conditional skip instruction
+//@
+//@ We only have one of these: "Q".  This is the only "decision-making"
+//@ instruction that our CPU has.  This means that all "if-then" logic,
+//@ counted loops, etc.  will be built using Q.
+
     | x | {
         /* Q - If the top number on the data stack is zero, skip the next
-         * instruction.
-         *
-         * Note Q is the only "decision-making" instruction that our CPU
-         * has.  This means that all "if-then" logic, counted loops, etc.
-         * will be built using Q.
-         */
+         * instruction. */
 
         let f = x.dstack.pop();
         if f == 0 {
             x.ip = x.ip.wrapping_add(2)
-
-            /* Because all of our instructions are two bytes, adding two
-             * to the instruction pointer skips the next instruction. */
-        };
+        }
     },
-    /* Arithmetic and logic */
+
+//@ Because all of our instructions are two bytes, adding two to the
+//@ instruction pointer skips the next instruction.
+//@
+//@ #### Arithmetic and logic
+
     | x | {
         /* ADD - Sum the top two numbers on the data stack. */
         let v1 = x.dstack.pop();
         let v2 = x.dstack.pop();
         x.dstack.push(v1.wrapping_add(v2));
     },
+
     | x | {
         /* SFT - Bit shift number left or right by the specified amount.
          * A positive shift amount will shift left, negative will shift right.
@@ -567,65 +670,77 @@ const PRIMITIVES: [Primitive; 16] = [
             }
         );
     },
+
     | x | { // OR - Bitwise-or the top two numbers on the data stack.
         let v1 = x.dstack.pop();
         let v2 = x.dstack.pop();
         x.dstack.push(v1 | v2);
     },
+
     | x | { // AND - Bitwise-and the top two numbers on the data stack.
         let v1 = x.dstack.pop();
         let v2 = x.dstack.pop();
         x.dstack.push(v1 & v2);
     },
+
     | x | { // INV - Bitwise-invert the top number on the data stack.
         let v1 = x.dstack.pop();
         x.dstack.push(!v1);
-
-        /* You can use the INV instruction to compensate for the LITERAL
-         * instruction's inability to encode constants 32768 to 65535.
-         * Use two instructions instead:
-         * - LITERAL the complement of your desired constant
-         * - INV
-         *
-         * For example, LITERAL(0) INV yields 65535 (signed -1)
-         * For example, LITERAL(1) INV yields 65534 (signed -2)
-         * etc.
-         */
     },
+
+//@ You can use the INV instruction to compensate for the LITERAL
+//@ instruction's inability to encode constants 32768 to 65535,
+//@ a.k.a. the
+//@ [signed](https://en.wikipedia.org/wiki/Two%27s_complement)
+//@ negative numbers.
+//@
+//@ Use two instructions instead:
+//@
+//@ - LITERAL the complement of your desired constant
+//@ - INV
+//@
+//@ For example,
+//@
+//@ - LITERAL(0) INV yields 65535 (signed -1)
+//@ - LITERAL(1) INV yields 65534 (signed -2)
+//@ - etc.
+
     | x | { // GEQ - Unsigned-compare the top two items on the data stack.
         let v2 = x.dstack.pop();
         let v1 = x.dstack.pop();
         x.dstack.push(if v1 >= v2 { 0xffff } else { 0 });
     },
 
-    /* Input/output.
-     *
-     * The CPU needs some way to communicate with the outside world.
-     *
-     * Some machines use memory mapped IO where certain memory addresses are
-     * routed to hardware devices instead of main memory.  This machine already
-     * has the full 64K of memory connected so no address space is readily
-     * available for hardware devices.
-     *
-     * Instead we define a separate input-output space of 65536 possible
-     * locations.  Each of these possible locations is called an IO "port".
-     *
-     * For a real CPU you could hook up hardware such as a serial
-     * transmitter that sends data to a computer terminal, or just an
-     * output pin controller that is wired to a light bulb.
-     *
-     * This is a fake software CPU so I am going to hook it up to
-     * stdin and stdout.
-     */
+//@ #### Input/output
+//@
+//@ The CPU needs some way to communicate with the outside world.
+//@
+//@ Some machines use memory mapped IO where certain memory addresses are
+//@ routed to hardware devices instead of main memory.  This machine already
+//@ has the full 64K of memory connected so no address space is readily
+//@ available for hardware devices.
+//@ Instead we define a separate input-output space of 65536 possible
+//@ locations.  Each of these possible locations is called an IO
+//@ "[port](https://en.wikipedia.org/wiki/IO_port)".
 
     | x | { // IO - Write/read a number from/to input/output port.
         let port = x.dstack.pop();
 
-        /* I'm loosely following a pattern in which even ports are inputs
-         * and odd ports are outputs.  But each port acts different.
-         * In a hardware CPU this would not be suitable but it is fine for
-         * a software emulation.
-         */
+//@ For a real CPU you could hook up hardware such as a serial
+//@ transmitter that sends data to a computer terminal, or just an
+//@ output pin controller that is wired to a light bulb.
+//@
+//@ This is a fake software CPU so I am going to hook it up to
+//@ [stdin and stdout](https://en.wikipedia.org/wiki/Standard_streams).
+
+        use std::io;
+        use std::io::Read;
+        use std::io::Write;
+
+//@ I'm loosely following a pattern in which even ports are inputs
+//@ and odd ports are outputs.  But each port acts differently.
+//@ In a hardware CPU this would not be suitable but it is fine for
+//@ a software emulation.
 
         match port {
             0 => {
@@ -646,126 +761,128 @@ const PRIMITIVES: [Primitive; 16] = [
                 /* Dump CPU status.
                  * Like the front panel with the blinking lights that Chuck
                  * talked about. */
-
                 println!("{:?} {:?}", x.ip, x.dstack);
                 let _ = io::stdout().flush();
             }
             _ => {}
         }
     }
+
+//@ That's all the CPU instructions we'll need.
+
 ];
 
-/* ---------------------------------------------------------------------------
- *                            Part 2 - The Program
- * ------------------------------------------------------------------------- */
-
-/* You now have an unfamiliar computer with no software.  Can you and the
- * computer write a program?
- *
- * The first program is the hardest to write because you don't have any tools
- * to help write it.  The computer itself is going to be no help.  Without any
- * program it will sit there doing nothing.
- *
- * What should the first program be?
- * A natural choice would be a tool that helps you program more easily.
- *
- * An interactive programming environment needs to let you do 2 things:
- *
- * 1. Call subroutines by typing their name at the keyboard
- * 2. Define new subroutines in terms of existing ones
- *
- * Begin with step 1:
- * Call subroutines by typing their name at the keyboard
- *
- * This is where we will meet Forth.
- *
- * Our interactive programming environment will be a small language in the
- * Forth family.  If you want to learn how to implement a full featured Forth,
- * please read Jonesforth, and Brad Rodriguez' series of articles "Moving
- * Forth".  The small Forth I write below will probably help you understand
- * those Forths a little better.
- *
- * Forth organizes all the computer's memory as a "dictionary" of subroutines.
- * The point of the dictionary is to give each subroutine a name so you
- * can run a subroutine by typing its name.  The computer will look up its
- * address for you and call it.
- *
- * The dictionary starts at a low address and grows towards high addresses.
- * It is organized as a linked list, like this:
- *
- * [Link field][Name][Code .......... ]
- *  ^
- *  |
- * [Link field][Name][Code ...... ]
- *  ^
- *  |
- * [Link field][Name][Code ............... ]
- *
- * The reason it is a linked list is to allow each list entry to be a
- * different length.
- *
- * Each dictionary entry contains three things:
- *
- * - "Link field": The address of the previous dictionary entry.
- *                 For the first dictionary entry this field is 0.
- *
- * - "Name": A few letters to name this dictionary entry.
- *           Later you will type this name at the keyboard to call up
- *           this dictionary entry.
- *
- * - "Code": A subroutine to execute when you call up this dictionary
- *           entry.  This is a list of CPU instructions.  Note that one
- *           of the CPU instructions is "call".  So you can have a subroutine
- *           that call other subroutines, or calls itself.
- *
- *           This code should end with a return (RET) instruction.
- *
- *           Example subroutine:
- *
- *           Number Instruction  Meaning
- *           ------ -----------  -------
- *           7      Literal(3)   Push the value 3 onto the data stack
- *           9      Literal(4)   Push the value 4 onto the data stack
- *           65504  RET          Return to caller
- *
- * A linked list is not a very fast data structure but this doesn't really
- * matter because dictionary lookup doesn't need to be fast.  Lookups are
- * for converting text you typed at the keyboard to subroutine addresses.
- * You can't type very fast compared to a computer so this lookup doesn't
- * need to be fast.
- *
- * In addition to the linked list itself, you will need a couple of
- * variables to keep track of where the dictionary is in memory:
- *
- * - Dictionary pointer:  The address of the newest dictionary entry.
- * - Here:                The address of the first unused memory location,
- *                        which comes just after the newest dictionary entry.
- *
- * [Link field][Name][Code .......... ]
- *  ^
- *  |
- * [Link field][Name][Code ...... ]
- *  ^
- *  |
- * [Link field][Name][Code ............... ]
- *  ^                                       ^
- *  |                                       |
- * [Dictionary pointer]                    [Here]
- *
- * To create our Forth interactive programmming environment, we will start
- * by defining subroutines that:
- * - read names from the keyboard
- * - look up and execute dictionary entries by name
- *
- * We will put these subroutines themselves in the dictionary so they are
- * available for use once our interactive environment is up and running!
- *
- * If you were sitting in front of a minicomputer in 196x you would need
- * to create the dictionary with pencil and paper, but in 20xx we will
- * write a Rust program to help create the dictionary.
- *
- * First we need to keep track of where the dictionary is:
- */
+//@ # Part 2 - The Program
+//@
+//@ You now have an unfamiliar computer with no software.  Can you and the
+//@ computer write a program?
+//@
+//@ The first program is the hardest to write because you don't have any tools
+//@ to help write it.  The computer itself is going to be no help.  Without any
+//@ program it will sit there doing nothing.
+//@
+//@ What should the first program be?
+//@ A natural choice would be a tool that helps you program more easily.
+//@
+//@ An interactive programming environment needs to let you do 2 things:
+//@
+//@ 1. Call subroutines by typing their name at the keyboard
+//@ 2. Define new subroutines in terms of existing ones
+//@
+//@ Begin with step 1:
+//@ Call subroutines by typing their name at the keyboard
+//@
+//@ This is where we will meet Forth.
+//@
+//@ Our interactive programming environment will be a small language in the
+//@ Forth family.  If you want to learn how to implement a full featured Forth,
+//@ please read
+//@ [Jonesforth](http://git.annexia.org/?p=jonesforth.git;a=blob;f=jonesforth.S),
+//@ and Brad Rodriguez' series of articles
+//@ "[Moving Forth](http://www.bradrodriguez.com/papers/index.html)".
+//@ The small Forth I write below will probably help you understand
+//@ those Forths a little better.
+//@
+//@ Forth organizes all the computer's memory as a "dictionary" of subroutines.
+//@ The point of the dictionary is to give each subroutine a name so you
+//@ can run a subroutine by typing its name.  The computer will look up its
+//@ address for you and call it.
+//@
+//@ ### Designing the Forth dictionary
+//@
+//@ The dictionary starts at a low address and grows towards high addresses.
+//@ It is organized as a
+//@ [linked list](https://en.wikipedia.org/wiki/Linked_list), like this:
+//@
+//@ ```
+//@ [Link field][Name][Code .......... ]
+//@  ^
+//@  |
+//@ [Link field][Name][Code ...... ]
+//@  ^
+//@  |
+//@ [Link field][Name][Code ............... ]
+//@ ```
+//@
+//@ The reason it is a linked list is to allow each list entry to be a
+//@ different length.
+//@
+//@ Each dictionary entry contains three things:
+//@
+//@ - "Link field": The address of the previous dictionary entry.
+//@                 For the first dictionary entry this field is 0.
+//@
+//@ - "Name": A few letters to name this dictionary entry.
+//@           Later you will type this name at the keyboard to call up
+//@           this dictionary entry.
+//@
+//@ - "Code": A subroutine to execute when you call up this dictionary
+//@           entry.  This is a list of CPU instructions.  Note that one
+//@           of the CPU instructions is "call".  So you can have a subroutine
+//@           that calls other subroutines, or calls itself.  This code should
+//@           end with a return (RET) instruction.  Here is an example subroutine:
+//@
+//@ ```
+//@ Number Instruction  Meaning
+//@ ------ -----------  -------
+//@ 7      Literal(3)   Push the value 3 onto the data stack
+//@ 9      Literal(4)   Push the value 4 onto the data stack
+//@ 65504  RET          Return to caller
+//@ ```
+//@
+//@ A linked list is not a very fast data structure but this doesn't really
+//@ matter because dictionary lookup doesn't need to be fast.  Lookups are
+//@ for converting text you typed at the keyboard to subroutine addresses.
+//@ You can't type very fast compared to a computer so this lookup doesn't
+//@ need to be fast.
+//@
+//@ In addition to the linked list itself, you will need a couple of
+//@ variables to keep track of where the dictionary is in memory:
+//@
+//@ - Dictionary pointer:  The address of the newest dictionary entry.
+//@ - Here:                The address of the first unused memory location,
+//@                        which comes just after the newest dictionary entry.
+//@
+//@ ```
+//@ [Link field][Name][Code .......... ]
+//@  ^
+//@  |
+//@ [Link field][Name][Code ...... ]
+//@  ^
+//@  |
+//@ [Link field][Name][Code ............... ]
+//@  ^                                       ^
+//@  |                                       |
+//@ [Dictionary pointer]                    [Here]
+//@ ```
+//@
+//@ ### Tools for building the Forth dictionary
+//@
+//@ If you were sitting in front of a minicomputer in 196x you would need
+//@ to create the dictionary with pencil and paper, but in 20xx we will
+//@ write a Rust program to help create the dictionary.
+//@
+//@ First we need to keep track of where the dictionary is:
 
 struct Dict<'a> {
     dp: u16,   // The dictionary pointer
@@ -775,7 +892,7 @@ struct Dict<'a> {
                      // us easy access to the memory.
 }
 
-/* Helpers to help put new routines in the dictionary */
+//@ Now we can write functions in Rust to help us build the dictionary.
 
 enum Item {
     Literal(u16),
@@ -812,35 +929,43 @@ impl Dict<'_> {
         }
     }
 
-    /* Helper to append a "name" field to the dictionary.  To save space and
-     * to make each dictionary header a consistent size, I am choosing to not
-     * store every letter of the name.  Instead I am storing only the length of
-     * the name and then the first three letters of the name.
-     *
-     * That means these two names will compare equal:
-     * - ALLOW (-> 5ALL)
-     * - ALLOT (-> 5ALL)
-     *
-     * Even though their first three letters are the same, these two names
-     * will compare unequal because they are different lengths:
-     * - FORTH (-> 5FOR)
-     * - FORGET (-> 6FOR)
-     *
-     * If a name is shorter than 3 letters it is padded out with spaces.
-     * - X (-> 1X  )
-     *
-     * You can see that the name field is always four bytes regardless
-     * of how many letters are in the name, and the link field is two bytes.
-     * This means a dictionary header in this Forth is always six bytes.
-     */
+    /* Helper to append a "name" field to the dictionary. */
+
+//@ The "name" field bears a closer look.  To make each dictionary header a
+//@ consistent size, I am choosing to not store every letter of the name.
+//@ Instead I am storing only the length of the name and then the first
+//@ three letters of the name.
+//@
+//@ That means these two names will compare equal:
+//@
+//@ - ALLOW (-> 5ALL)
+//@ - ALLOT (-> 5ALL)
+//@
+//@ Even though their first three letters are the same, these two names
+//@ will compare unequal because they are different lengths:
+//@
+//@ - FORTH (-> 5FOR)
+//@ - FORGET (-> 6FOR)
+//@
+//@ If a name is shorter than 3 letters it is padded out with spaces.
+//@
+//@ - X (-> `1X  `)
+//@
+//@ You can see that the name field is always four bytes regardless
+//@ of how many letters are in the name, and the link field is two bytes.
+//@ This means a dictionary header in this Forth is always six bytes.
 
     fn name(&mut self, n: u8, val: [u8; 3]) {
         /* Store the length and the first character */
         self.comma(n as u16 | ((val[0] as u16) << 8));
+
         /* Store the next two characters */
         self.comma(val[1] as u16 | ((val[2] as u16) << 8));
     }
 
+//@ Finally, a function that appends a new link field to the dictionary,
+//@ pointing to the previous dictionary entry.
+
     /* Helper to append a new link field to the dictionary and update the
      * dictionary pointer appropriately. */
 
@@ -851,7 +976,20 @@ impl Dict<'_> {
     }
 }
 
-/* Now we can start building the dictionary. */
+
+//@ Now we can start building the dictionary.
+//@
+//@ To create our Forth interactive programming environment, we will start
+//@ by defining subroutines that:
+//@
+//@ - read names from the keyboard
+//@ - look up and execute dictionary entries by name
+//@
+//@ We will put these subroutines themselves in the dictionary so they are
+//@ available for use once our interactive environment is up and running!
+//@
+//@ ### Building the Forth dictionary
+
 fn build_dictionary(c: &mut Core) {
     use Op::*;
     use Item::*;
@@ -859,100 +997,118 @@ fn build_dictionary(c: &mut Core) {
     let mut d = Dict {
         dp: 0,  /* Nothing in the dictionary yet */
 
-        here: 2,  /* Reserve address 0 as an "entry point", i.e. where the
+        here: 2,  /* Reserve address 0 as the "reset vector", i.e. where the
                      CPU will jump to start running Forth.  We don't have a
                      Forth interpreter yet so we'll leave address 0 alone for
                      now and start the dictionary at address 2 instead. */
         c: c
     };
 
-    /* Consider the following facts:
-     * - The CPU knows how to execute a bunch of instructions strung together.
-     * - Forth consists of a bunch of subroutine calls strung together.
-     * - Subroutine CALL is a valid instruction of our CPU.
-     *
-     * This means that we can immediately begin programming our machine in
-     * a language resembling Forth, just by writing a list of subroutine
-     * calls into the dictionary.
-     *
-     * The line between "machine code program" and "Forth program" is
-     * very blurry.  To illustrate:
-     *
-     * Here is a subroutine consisting of a few instructions strung together.
-     *
-     *       Instruction Number  Meaning
-     *       ----------- ------  -------
-     *       Literal(3)  7       Push the value 3 onto the data stack
-     *       Literal(4)  9       Push the value 4 onto the data stack
-     *       RET         65504   Return to caller
-     *
-     * Here is a Forth subroutine consisting of a few subroutine calls strung
-     * together.
-     *       Call        Number  Meaning
-     *       ----------- ------  -------
-     *       S1          1230    Call subroutine S1 which happens to live
-     *                           at address 1230
-     *       S2          1250    Call subroutine S2 which happens to live
-     *                           at address 1250
-     *       RET         65504   Return to caller
-     *
-     * Both of these are valid machine code programs (list of numbers that
-     * our CPU can directly execute).
-     *
-     * This duality between CPU instructions and Forth code comes from
-     * an idea called "subroutine threading".  It is a refinement of an
-     * idea called "threaded code".  This has no relation to the kind of
-     * threading that lets you run programs in parallel.  You can read more
-     * about threaded code on Wikipedia or in the other Forth resources I
-     * mentioned earlier (Jonesforth, and Moving Forth by Brad Rodriguez).
-     *
-     * Our new language starts out with the sixteen (well, eighteen)
-     * instructions built into the CPU.  We can string those instructions
-     * together into a new subroutine.  Each new subroutine adds to the
-     * toolbox we have available for making the next new subroutine.
-     * Repeat until you have built what you wanted to build, via
-     * function composition.  This is the idea behind Forth.
-     */
+//@ #### Subroutine threading
+//@
+//@ Consider the following facts:
+//@
+//@ - The CPU knows how to execute a bunch of instructions strung together.
+//@ - Forth consists of a bunch of subroutine calls strung together.
+//@ - Subroutine CALL is a valid instruction of our CPU.
+//@
+//@ This means that we can immediately begin programming our machine in
+//@ a language resembling Forth, just by writing a list of subroutine
+//@ calls into the dictionary.
+//@
+//@ The line between "machine code program" and "Forth program" is
+//@ very blurry.  To illustrate:
+//@
+//@ Here is a subroutine consisting of a few instructions strung together.
+//@
+//@ ```
+//@ Instruction Number  Meaning
+//@ ----------- ------  -------
+//@ Literal(3)  7       Push the value 3 onto the data stack
+//@ Literal(4)  9       Push the value 4 onto the data stack
+//@ RET         65504   Return to caller
+//@ ```
+//@
+//@ Here is a Forth subroutine consisting of a few subroutine calls strung
+//@ together.
+//@
+//@ ```
+//@ Call        Number  Meaning
+//@ ----------- ------  -------
+//@ S1          1230    Call subroutine S1 which happens to live
+//@                     at address 1230
+//@ S2          1250    Call subroutine S2 which happens to live
+//@                     at address 1250
+//@ RET         65504   Return to caller
+//@ ```
+//@
+//@ Both of these are valid machine code programs (lists of numbers that
+//@ our CPU can directly execute).
+//@
+//@ This duality between CPU instructions and Forth code comes from
+//@ an idea called "subroutine threading".  It is a refinement of an
+//@ idea called
+//@ "[threaded code](https://en.wikipedia.org/wiki/Threaded_code)".
+//@ This has no relation to the kind of
+//@ threading that lets you run programs in parallel.  You can read more
+//@ about threaded code on Wikipedia or in the other Forth resources I
+//@ mentioned earlier (Jonesforth, and Moving Forth by Brad Rodriguez).
+//@
+//@ Our new language starts out with the sixteen (well, eighteen)
+//@ instructions built into the CPU.  We can string those instructions
+//@ together into a new subroutine.  Each new subroutine adds to the
+//@ toolbox we have available for making the next new subroutine.
+//@ Repeat until you have built what you wanted to build, via
+//@ function composition.  This is the idea behind Forth.
+//@
+//@ We are going to be writing many series of instructions so let's
+//@ start out by making a Rust macro that makes them easier to type
+//@ and lets us specify a CPU instruction vs. a subroutine call with
+//@ equal ease.
+//@
+//@ The macro below will convert:
+//@
+//@ - `forth!(Literal(2), ADD, RET);`
+//@
+//@ to:
+//@
+//@ - `d.emit(Literal(2));`
+//@ - `d.emit(ADD);`
+//@ - `d.emit(RET);`
+//@
+//@ which you probably recognize as code that will add a new subroutine
+//@ to the dictionary.
 
-    /*
-     * We are going to be writing many series of instructions so let's
-     * start out by making a Rust macro that makes them easier to type
-     * and lets us specify a CPU instruction vs. a subroutine call with
-     * equal ease.
-     *
-     * The macro below will convert:
-     *
-     *     forth!(Literal(2), ADD, RET)
-     *
-     * to:
-     *
-     *     d.emit(Literal(2));
-     *     d.emit(ADD);
-     *     d.emit(RET);
-     *
-     * which you probably recognize as code that will add a new subroutine
-     * to the dictionary.
-     */
     macro_rules! forth {
         ($x:expr) => (d.emit($x));
         ($x:expr, $($y:expr),+) => (d.emit($x); forth!($($y),+))
     }
 
-    /* Now we can add the first subroutine to the dictionary!
-     *
-     * key: Reads a character from the keyboard and places its character
-     * code on the stack.
-     *
-     * There is a tradition of writing stack comments for Forth subroutines
-     * to describe the stack effect of executing the subroutine.
-     * They look like this: key ( -- n )
-     *
-     * Read as: key does not take any parameters off the stack, and leaves
-     * one new number pushed onto the stack.
-     *
-     * Also remember that a dictionary entry looks like this:
-     * [Link field][Name][Code .......... ]
-     */
+//@ Now we can add the first subroutine to the dictionary!
+//@
+//@ #### key
+//@ "key" reads a character from the keyboard and places its character
+//@ code on the stack.
+//@
+//@ There is a tradition of writing stack comments for Forth subroutines
+//@ to describe the stack effect of executing the subroutine.
+//@ They look like this:
+//@
+//@ ```
+//@ key ( -- n )
+//@ ```
+//@
+//@ Read as: key does not take any parameters off the stack, and leaves
+//@ one new number pushed onto the stack.
+//@
+//@ Also remember that a dictionary entry looks like this:
+//@
+//@ ```
+//@ [Link field][Name][Code .......... ]
+//@ ```
+//@
+//@ Given all of the above, we are now ready to define "key" and add it to
+//@ the dictionary.
 
     // key ( -- n )
     d.entry();           /* Compile the link field into the dictionary */
@@ -976,45 +1132,54 @@ fn build_dictionary(c: &mut Core) {
         RET              /* Compile a RET instruction */
     );
 
-    /* We have now compiled the "key" subroutine into the dictionary.
-     * [Link field][Name][Code .......... ]
-     *        0000  3key  1, 65534, 65504
-     *
-     * The next subroutine we will make is "emit".  This is a companion
-     * to "key" that works in the opposite direction.
-     *
-     * key ( -- n ) reads a character from stdin and pushes it to the stack.
-     * emit ( n -- ) pops a character from the stack and writes it to stdout.
-     */
+//@ We have now compiled the "key" subroutine into the dictionary.
+//@ It takes twelve bytes of memory and is laid out as shown below:
+//@
+//@ ```
+//@ [Link field][Name][Code .......... ]
+//@        0000  3key  1, 65534, 65504
+//@ ```
+//@
+//@ #### emit
+//@
+//@ The next subroutine we will make is "emit".  This is a companion
+//@ to "key" that works in the opposite direction.
+//@
+//@ - key ( -- n ) reads a character from stdin and pushes it to the stack.
+//@ - emit ( n -- ) pops a character from the stack and writes it to stdout.
 
     // emit ( n -- )
     d.entry(); d.name(4, *b"emi");  let emit = d.here;
-    forth!(Literal(1), IO, RET);
+    forth!(
+        Literal(1),
+        IO,
+        RET);
 
-    /* I am tired of saying "subroutine" so many times, so I am going to
-     * introduce a new term.  Remember the goal our language is working
-     * towards -- we want to be able to type a word at the keyboard, and
-     * let the computer look it up in the dictionary and execute the
-     * appropriate code.
-     *
-     * So far we have two named items in the dictionary, call and emit.
-     *
-     * We are going to term a named dictionary item a "word".
-     * This is a Forth tradition.
-     *
-     * So call and emit are "words", or "dictionary words" if you want to be
-     * precise about it.  So far these are the only words we've defined.
-     *
-     * Let's define some more words.
-     */
-
-    /* Our CPU does not have subtraction so let's make subtraction by adding
-     * the two's complement.
-     *
-     * To get the two's complement, do a bitwise invert and add 1.
-     *
-     * This will be the most complicated Forth that we've written so far
-     * so let's walk through step by step. */
+//@ I am tired of saying "subroutine" so many times, so I am going to
+//@ introduce a new term.  Remember the goal our language is working
+//@ towards -- we want to be able to type a word at the keyboard, and
+//@ let the computer look it up in the dictionary and execute the
+//@ appropriate code.
+//@
+//@ So far we have two named items in the dictionary, key and emit.
+//@
+//@ We are going to term a named dictionary item a "word".  This is a
+//@ Forth tradition.  So key and emit are "words", or "dictionary words"
+//@ if you want to be precise about it.  So far these are the only words
+//@ we've defined.
+//@
+//@ Let's define some more words.
+//@
+//@ #### - (subtraction)
+//@
+//@ Our CPU does not have subtraction so let's make subtraction by adding
+//@ the
+//@ [two's complement](https://en.wikipedia.org/wiki/Two%27s_complement).
+//@
+//@ To get the two's complement, do a bitwise invert and add 1.
+//@
+//@ This will be the most complicated Forth that we've written so far
+//@ so let's walk through step by step.
 
     // - ( a b -- a-b )
     d.entry(); d.name(1, *b"-  ");  let sub = d.here;
@@ -1038,429 +1203,484 @@ fn build_dictionary(c: &mut Core) {
         RET         /* Done, return to caller, leaving n on the data stack. */
     );
 
-    /* Writing it out like that takes a lot of space.  Normally Forth code
-     * is written on a single line, like this:
-     *
-     * INV 1 ADD ADD RET
-     *
-     * Looking at it this way, it's easy to see the new word we just
-     * created (-) is made from 5 instructions.  It's pretty typical for
-     * a Forth word to be made of 2-7 of them.  Beyond that length, things
-     * get successively harder to understand, and it becomes a good idea
-     * to split some work off into helper words.
-     *
-     * We will see an example of this below.
-     */
-
-    /* Our next word will be useful for Boolean logic.
-     *
-     * 0= ( n -- f )
-     *
-     * In a stack comment, "f" means "flag", a.k.a. Boolean value.
-     * By Forth convention, zero is false and any nonzero value is true.
-     * However the "best" value to use for a true flag is 65535 (all ones)
-     * so the bitwise logical operations can double as Boolean logical
-     * operations.
-     *
-     * So what 0= does is:
-     * - if n=0,    leave on the stack f=65535
-     * - otherwise, leave on the stack f=0
-     *
-     * It is like C's ! operator.
-     *
-     * In Rust this could be implemented as:
-     *
-     * fn zero_eq(n: u16) -> u16 {
-     *     if (n == 0) {
-     *         return 65535;
-     *     } else {
-     *         return 0;
-     *     }
-     * }
-     *
-     * Rust has an if-then and block scope, so this is easy to write.
-     *
-     * The literal translation to a typical register-machine assembly
-     * language would look something like this:
-     *
-     * zero_eq:     compare r0, 0
-     *              jump_eq is_zero
-     *              move    r0, 0
-     *              ret
-     * is_zero:     move    r0, 65535
-     *              ret
-     *
-     * It looks simple but I want to point out a couple things about it
-     * that are not so simple.
-     *
-     * The conditional jump instruction, jump_eq.
-     * ------------------------------------------
-     * Our CPU doesn't have this.  The only decision-making instruction
-     * we have is Q which is a conditional skip.
-     *
-     * Q - If the top number on the data stack is zero, skip the next
-     * instruction.
-     *
-     * A conditional jump can go anywhere.  A conditional skip can only decide
-     * whether or not to skip the next instruction (i.e., it is a fixed forward
-     * jump of 2 bytes).  You cannot give Q a specific address to jump to, the
-     * way jump_eq worked.
-     *
-     * So our CPU does not make it easy to jump around in a long block of
-     * instructions -- our CPU prefers that you use subroutine calls.
-     *
-     * The forward reference
-     * ---------------------
-     * This is another problem.  Think of the job of an assembler which is
-     * converting an assembly language program to machine code.  We are
-     * currently writing our code in a tiny assembler that we made in Rust!  It
-     * is very simple but so far it has worked for us.  The assembler of our
-     * hypothetical register-machine below has a rather nasty problem to solve.
-     *
-     * zero_eq:     compare r0, 0
-     *              jump_eq is_zero  <----- On this line.
-     *              move    r0, 0
-     *              ret
-     * is_zero:     move    r0, 65535
-     *              ret
-     *
-     * It wants to emit a jump to is_zero, but that symbol has not been seen
-     * yet and is unrecognized.  On top of that, the assembler also doesn't yet
-     * know what address is_zero will have, so doesn't know what jump target to
-     * emit.  To successfully assemble that kind of program you would need an
-     * assembler smarter than the assembler we made for ourselves in Rust.
-     *
-     * There are ways to solve this but let's NOT solve it.
-     *
-     * Our CPU has no jump instruction (only call) and our assembler only lets
-     * us call things we already defined.  Instead of removing these
-     * constraints, find a way to write 0= within the constraints.
-     *
-     * Here is a start at solving the problem
-     *
-     * is_nonzero ( -- 0 )
-     *     Literal(0)
-     *     RET
-     *
-     * 0= ( n -- f )
-     *     Q            <-- pop n, if n=0 skip next instruction
-     *     is_nonzero   <-- f=0 is now pushed to stack
-     *     Literal(0)
-     *     INV          <-- f=65535 is now pushed to stack
-     *     RET          <-- Return
-     *
-     * We got rid of the forward reference by defining is_nonzero before it
-     * was used.
-     *
-     * We got rid of the jump instruction by using a subroutine call instead.
-     *
-     * This code is close to working but it doesn't quite work.  The problem
-     * is that is_nonzero gives control back to 0= when done, just like
-     * a subroutine call normally does, and then 0= runs as normal until it
-     * hits the return instruction at the end.
-     * So we wind up executing both the f=0 branch and the f=65535 branch,
-     * instead of just executing the f=0 branch like we wanted in this case.
-     *
-     * It is possible to fix this last problem by adding the instructions
-     * RTO DRP to is_nonzero.
-     *
-     * is_nonzero ( -- 0 )
-     *     RTO          <-- Pop the return address, push to data stack
-     *     DRP          <-- Discard it
-     *     Literal(0)   <-- Put 0 on the data stack
-     *     RET          <-- Return
-     *
-     * Because we popped off and discarded one item from the return stack, the
-     * final RET instruction will not return to 0= any more.  Instead it will
-     * skip one level and return to whoever called 0=.  This has the result of
-     * ending 0= early, which is what we wanted to do.
-     *
-     * 0= ( n -- f )
-     *     Q            <-- pop n, if n=0 skip next instruction
-     *     is_nonzero   <-- this word puts f=0 on the stack then ends 0= early
-     *     Literal(0)
-     *     INV          <-- f=65535 is now pushed to stack
-     *     RET          <-- Return
-     *
-     * I call this pattern "return-from-caller".  It is used occasionally in
-     * real Forth systems.  My dialect of Forth will use it extensively to work
-     * around my CPU's lack of conditional jump.
-     *
-     * Now we've explained how 0= is going to work, let's write it.
-     */
-
-    /* First we define the helper.  It won't be reused, so I am not going
-     * to bother giving it a dictionary header and name for easy lookup later.
-     * Think of it as a private function. */
+//@ Writing it out like that takes a lot of space.  Normally Forth code
+//@ is written on a single line, like this:
+//@
+//@ ```
+//@ INV 1 ADD ADD RET
+//@ ```
+//@
+//@ Looking at it this way, it's easy to see the new word we just
+//@ created (-) is made from 5 instructions.  It's pretty typical for
+//@ a Forth word to be made of 2-7 of them.  Beyond that length, things
+//@ get successively harder to understand, and it becomes a good idea
+//@ to split some work off into helper words.
+//@
+//@ We will see an example of this below.
+//@
+//@ #### 0= (compare-to-zero)
+//@
+//@ Our next word will be useful for Boolean logic.
+//@
+//@ ```
+//@ 0= ( n -- f )
+//@ ```
+//@
+//@ In a stack comment, "f" means "flag", a.k.a. Boolean value.
+//@ By Forth convention, zero is false and any nonzero value is true.
+//@ However the "best" value to use for a true flag is 65535 (all ones)
+//@ so the bitwise logical operations can double as Boolean logical
+//@ operations.
+//@
+//@ So what 0= does is:
+//@
+//@ - if n=0,    leave on the stack f=65535
+//@ - otherwise, leave on the stack f=0
+//@
+//@ It is like C's ! operator.
+//@
+//@ In Rust this could be implemented as:
+//@
+//@ ```
+//@ // example code, not part of our program
+//@ fn zero_eq(n: u16) -> u16 {
+//@     if n == 0 {
+//@         return 65535;
+//@     } else {
+//@         return 0;
+//@     }
+//@ }
+//@ ```
+//@
+//@ Rust has an if-then and block scope, so this is easy to write.
+//@
+//@ The literal translation to a typical register-machine assembly
+//@ language would look something like this:
+//@
+//@ ```
+//@ zero_eq:     compare r0, 0
+//@              jump_eq is_zero
+//@              move    r0, 0
+//@              ret
+//@ is_zero:     move    r0, 65535
+//@              ret
+//@ ```
+//@
+//@ It looks simple but I want to point out a couple things about it
+//@ that are not so simple.
+//@
+//@ ##### The conditional jump instruction, jump_eq.
+//@
+//@ Our CPU doesn't have this.  The only decision-making instruction
+//@ we have is Q which is a conditional skip.
+//@
+//@ Q - If the top number on the data stack is zero, skip the next
+//@ instruction.
+//@
+//@ A conditional jump can go anywhere.  A conditional skip can only decide
+//@ whether or not to skip the next instruction (i.e., it is a fixed forward
+//@ jump of 2 bytes).  You cannot give Q a specific address to jump to, the
+//@ way jump_eq worked.
+//@
+//@ So our CPU does not make it easy to jump around in a long block of
+//@ instructions -- our CPU prefers that you use subroutine calls.
+//@
+//@ ##### The forward reference
+//@
+//@ This is another problem.  Think of the job of an assembler which is
+//@ converting an assembly language program to machine code.  We are
+//@ currently writing our code in a tiny assembler that we made in Rust!  It
+//@ is very simple but so far it has worked for us.  The assembler of our
+//@ hypothetical register-machine below has a rather nasty problem to solve.
+//@
+//@ ```
+//@ zero_eq:     compare r0, 0
+//@              jump_eq is_zero  <----- On this line.
+//@              move    r0, 0
+//@              ret
+//@ is_zero:     move    r0, 65535
+//@              ret
+//@ ```
+//@
+//@ It wants to emit a jump to is_zero, but that symbol has not been seen
+//@ yet and is unrecognized.  On top of that, the assembler also doesn't yet
+//@ know what address is_zero will have, so doesn't know what jump target to
+//@ emit.  To successfully assemble that kind of program you would need an
+//@ assembler
+//@ [smarter](https://en.wikipedia.org/wiki/Assembly_language#Two-pass_assembler)
+//@ than the assembler we made for ourselves in Rust.
+//@
+//@ There are ways to solve this but let's NOT solve it.
+//@
+//@ Our CPU has no jump instruction (only call) and our assembler only lets
+//@ us call things we already defined.  Instead of removing these
+//@ constraints, find a way to write 0= within the constraints.
+//@
+//@ Here is a start at solving the problem:
+//@
+//@ ```
+//@ is_nonzero ( -- 0 )
+//@     Literal(0)
+//@     RET
+//@
+//@ 0= ( n -- f )
+//@     Q            <-- pop n, if n=0 skip next instruction
+//@     is_nonzero   <-- f=0 is now pushed to stack
+//@     Literal(0)
+//@     INV          <-- f=65535 is now pushed to stack
+//@     RET          <-- Return
+//@ ```
+//@
+//@ We got rid of the forward reference by defining is_nonzero before it
+//@ was used.
+//@
+//@ We got rid of the jump instruction by using a subroutine call instead.
+//@
+//@ This code is close to working but it doesn't quite work.  The problem
+//@ is that is_nonzero gives control back to 0= when done, just like
+//@ a subroutine call normally does, and then 0= runs as normal until it
+//@ hits the return instruction at the end.
+//@ So we wind up executing both the f=0 branch and the f=65535 branch,
+//@ instead of just executing the f=0 branch like we wanted in this case.
+//@
+//@ It is possible to fix this last problem by adding the instructions
+//@ RTO DRP to is_nonzero.
+//@
+//@ ```
+//@ is_nonzero ( -- 0 )
+//@     RTO          <-- Pop the return address, push to data stack
+//@     DRP          <-- Discard it
+//@     Literal(0)   <-- Put 0 on the data stack
+//@     RET          <-- Return
+//@ ```
+//@
+//@ Because we popped off and discarded one item from the return stack, the
+//@ final RET instruction will not return to 0= any more.  Instead it will
+//@ skip one level and return to whoever called 0=.  This has the result of
+//@ ending 0= early, which is what we wanted to do.
+//@
+//@ ```
+//@ 0= ( n -- f )
+//@     Q            <-- pop n, if n=0 skip next instruction
+//@     is_nonzero   <-- this word puts f=0 on the stack then ends 0= early
+//@     Literal(0)
+//@     INV          <-- f=65535 is now pushed to stack
+//@     RET          <-- Return
+//@ ```
+//@
+//@ I call this pattern "return-from-caller".  It is used occasionally in
+//@ real Forth systems.  My dialect of Forth will use it extensively to work
+//@ around my CPU's lack of conditional jump.
+//@
+//@ Now that we've explained how 0= is going to work, let's write it.
+//@
+//@ #### 0= (compare-to-zero), for real this time
+//@
+//@ First we define the helper.  It won't be reused, so I am not going
+//@ to bother giving it a dictionary header and name for easy lookup later.
+//@ Think of it as a private function.
 
     let zero = d.here;
     forth!(Literal(0), RTO, DRP, RET);
 
-    /* Now define 0= using the helper. */
+//@ Now define 0= using the helper.
 
     // 0= ( n -- f )
     d.entry(); d.name(2, *b"0= ");  let zero_eq = d.here;
     forth!(Q, zero, Literal(0), INV, RET);
 
-    /* Next let's make a = equality comparison operator, using 0= and subtract.
-     * I call it an "operator" because that's what other languages would
-     * call it, but Forth has no special idea of an "operator".  Everything
-     * is just words. */
+//@ #### = (equals)
+//@
+//@ Next let's make a = equality comparison operator, using 0= and subtract.
+//@ I call it an "operator" because that's what other languages would
+//@ call it, but Forth has no special idea of an "operator".  Everything
+//@ is just words.
 
     // = ( a b -- a=b )
     d.entry(); d.name(1, *b"=  ");  let eq = d.here;
     forth!(sub, zero_eq, RET);
 
-    /* Note that 0= and subtract are both words, not CPU instructions.
-     * This makes = the first "pure" Forth word we have defined, with no
-     * direct dependency on the machine's instruction set.
-     * We could define = as - 0= on a real standards-compliant Forth system
-     * and it would still work.  So Forth gets you to the point of writing
-     * "portable" code really quickly.  Often you can reuse routines early in
-     * bootstrapping even though they were written and tested on a different
-     * machine.  Many languages offer portability but few offer it so quickly.
-     */
-
-    /* -----------------------------------------------------------------------
-     * Part 2a - The lexer
-     *---------------------------------------------------------------------- */
-
-    /* Now that we've got some basics in place let's go back to solving
-     * the real problem of getting our language to read words from the
-     * keyboard.  The first problem we have is that we need some way to
-     * separate words from each other so we know where one word ends and the
-     * next begins.  This problem is called "lexing".  Forth has about the
-     * simplest lexer ever, it just splits on whitespace.  Anything with
-     * character code <=32 is considered whitespace.  Words are delimited by
-     * whitespace.  And that is all the syntax Forth has.
-     *
-     * To read a word from the keyboard you will need to:
-     * 1. Advance past any leading whitespace
-     * 2. Read characters into a buffer until whitespace is seen again.
-     */
-
-    /* Let's start with the "advance past leading whitespace" part
-     *
-     * The "key" word gives us the latest keystroke as an ASCII code.
-     * (Really it is reading utf-8 characters one byte at a time but let's
-     * not get into that right now, pretend the year is 196x, we're sitting
-     * in front of a minicomputer and and utf-8 hasn't been invented yet.)
-     *
-     * ASCII codes 0 to 32 are whitespace or control characters.  Codes
-     * 33 and up are letters, numbers and symbols.  So to skip whitespace
-     * all you need to do is read keys until you get an ASCII code >= 33,
-     * then return that to tell the rest of the program what key code you
-     * saw.
-     *
-     * In Rust this could be implemented as:
-     *
-     * fn skipws() -> u16 {
-     *     loop {
-     *         let c = key();
-     *         if c >= 33 {
-     *             return c;
-     *         }
-     *     }
-     * }
-     *
-     * Rust has a loop keyword, so this is easy to write.
-     * (Alarm bells should be ringing in your head at this point because
-     * we haven't put any looping constructs in our CPU or language.)
-     *
-     * The literal translation to a typical register-machine assembly
-     * language would look something like this:
-     *
-     * skipws:      call key
-     *              compare r0, 32
-     *              jump_le skipws
-     *              ret
-     *
-     * (More alarm bells should be ringing in your head because this is
-     * using conditional jump, which our CPU doesn't have.)
-     *
-     * Like last time, is there a way to solve this without conditional
-     * jump?
-     *
-     * Here is a start at solving the problem:
-     *
-     * skipws ( -- c )
-     *     key          <-- Put keycode on the stack:           ( c )
-     *     DUP          <-- Duplicate top value on the stack:   ( c c )
-     *     Literal(33)  <-- Put 33 on the stack:                ( c c 33 )
-     *     GEQ          <-- Is c >= 33?                         ( c f )
-     *     Q            <-- If so...
-     *     RET          <-- ... return, leaving c on the stack. ( c )
-     *     DRP          <-- Discard c from the stack.           ( )
-     *     skipws       <-- Call skipws again
-     *
-     *  You will notice there is no RET statement at the end of skipws.
-     *  At the end of skipws we call skipws again.  This makes an infinite
-     *  loop.  The only way out of the loop is the RET instruction in the
-     *  middle.  This works similarly to the Rust code that uses a loop { }
-     *  and breaks out when it sees the condition it's looking for.
-     *
-     *  Writing a word that calls itself is called "recursion".
-     *
-     *  This code almost works but there is still something wrong with it.
-     *  Youll notice we were careful to make sure "skipws" removed all items
-     *  it added to the data stack, before it called itself.  Its last two
-     * lines were:
-     *
-     *  DRP    <-- Discard c from the stack
-     *  skipws <-- Call skipws again
-     *
-     *  If we didn't do that, skipws would leave each whitespace character
-     *  it saw, on the data stack, as it looped again and again.
-     *  So instead of returning the first nonwhitespace character it would
-     *  return EVERY character it saw.
-     *
-     * 1st recursion: data stack: ( c1 )
-     * 2nd recursion: data stack: ( c1 c2 )
-     * 3rd recursion: data stack: ( c1 c2 c3 )
-     *
-     * There are problems with this.  It's messy.  The caller has no idea
-     * how many values we are going to leave on the stack, so has no idea
-     * how many to pop off.  Also, we might see more than 16 whitespace
-     * characters in a row, which would make weird things happen because
-     * our CPU's data stack only has room for 16 numbers.
-     *
-     * For these reasons it's better to leave the data stack as we found it,
-     * when we do a recursive call.  That is the reason the last two lines are
-     * DRP, skipws -- it's to stop items building up on the data stack.  The
-     * final pass through this function goes down a different path that does
-     * not DRP, so it leaves something on the data stack -- the last key read.
-     *
-     * The problem skipws still has, is that we haven't taken the same care
-     * with its return stack.
-     *
-     * At the first line of skipws the return stack looks like this:
-     * ( caller )
-     *
-     * That's because skipws must have been called by our CPU's CALL
-     * instruction (we have no other way of calling subroutines!), and the
-     * CALL instruction leaves a return address on the top of the return
-     * stack so RET knows where to return to at the end of the subroutine.
-     *
-     * But we are also using CALL for a different purpose:  to repeat skipws.
-     * Every time we repeat skipws, the CALL instruction will push another
-     * return address to the call stack.
-     *
-     *     DRP                                 return stack:( caller )
-     *     skipws       <-- Call skipws again. return stack:( caller x )
-     *     <-- This location has address x.
-     *
-     * first call:    return stack: ( caller )
-     * 1st recursion: return stack: ( caller x )
-     * 2nd recursion: return stack: ( caller x x )
-     * 3rd recursion: return stack: ( caller x x x )
-     *
-     * Clearly all these x's are garbage.  When we are done with skipws we
-     * want to return to our caller, not to x.
-     *
-     * We could patch over the problem somewhat by putting a RET instruction
-     * at x.
-     *
-     *     DRP                                 return stack:( caller )
-     *     skipws       <-- Call skipws again. return stack:( caller x )
-     *     RET          <-- x
-     *
-     * This yields working recursive code.
-     *
-     * Each time we loop, a useless return address x is left on the return
-     * stack.  When skipws wants to quit, skipws runs a RET instruction, which
-     * transfers control to x.  x is the address of a RET instruction, left on
-     * the stack earler.  So we wind up running RET RET RET ... until we burn
-     * through all x's on the return stack and finally transfer control back to
-     * caller.
-     *
-     * first call:    return stack: ( caller )         data stack: ( )
-     * 1st recursion: return stack: ( caller x )       data stack: ( )
-     * 2nd recursion: return stack: ( caller x x )     data stack: ( )
-     * 3rd recursion: return stack: ( caller x x x )   data stack: ( c )
-     * RET:         : return stack: ( caller x x )     data stack: ( c )
-     * RET:         : return stack: ( caller x )       data stack: ( c )
-     * RET:         : return stack: ( caller )         data stack: ( c )
-     * RET:         < control is passed back to our caller,
-     *                and now they can do stuff with the "c" on the data
-     *                stack, yay >
-     *
-     * This works.  It isn't very fast but we don't care about speed right
-     * now, just about getting our computer to work.
-     *
-     * But there is still a problem.
-     *
-     * Our CPU has a fixed-size circular return stack that can hold 32 numbers.
-     * What happens if you loop 32 times or more?  The return stack fills up
-     * completely with the useless "x" addresses, and the address of caller
-     * is lost.
-     *
-     * recursive call N  :  return stack: ( caller x x x ... x )
-     * recursive call N+1:  return stack: (      x x x x ... x )  :-(
-     *
-     * So skipping 32 or more whitespace characters in a row wouldn't work.
-     * To fix that problem we need to find a way to stop the useless "x"
-     * addresses from building up on the return stack.
-     *
-     * 1st loop: return stack: ( caller )   data stack: ( )
-     * 2nd loop: return stack: ( caller )   data stack: ( )
-     * 3rd loop: return stack: ( caller )   data stack: ( c )
-     * RET:      < control is passed back to our caller >
-     *
-     * The most common solution is called "tail call optimization".
-     * If a function's last instruction is a recursive call, that call can be
-     * replaced with a jump.  On paper this doesn't work very well on our
-     * computer, for two reasons:
-     *
-     * 1. Our CPU has no jump, only call.
-     *
-     * 2. Our assembler, and eventually our interactive environment, would need
-     *    to be smart enough to emit a call sometimes and a jump other times.
-     *    This is the same "look-ahead" problem that we saw with forward
-     *    references -- you don't know that a given CALL will be followed by a
-     *    RET, unless you can see the future.
-     *
-     *    Earlier we decided to keep our assembler very dumb so it would be
-     *    weird to start making it smart now.
-     *
-     * So what are we going to do?
-     *
-     * It is possible to get a very, very dumb caveman version of tail call
-     * optimization, by manually using the "return-from-caller" trick, RTO DRP,
-     * to "get rid of" the x that is pushed on by the skipws CALL.
-     *
-     * skipws ( -- c ) RTO DRP ... Q RET ... skipws
-     *
-     * 1st loop: return stack: ( caller )   data stack: ( )
-     * 2nd loop: return stack: ( )          data stack: ( )
-     * 3rd loop: return stack: ( )          data stack: ( )
-     *
-     * So now recursive calls will leave the return-stack as they found it,
-     * which is good!  We don't have the useless-x problem any more.
-     * Unfortunately, the first pass through skipws discards the original
-     * caller's return address, which we wanted to keep.  There is a quick
-     * hack around that problem: wrap skipws in another subroutine, and
-     * always call it through that wrapper.
-     *
-     * skipws ( -- c ) RTO DRP ... Q RET ... skipws
-     *
-     * wrapper ( -- c ) skipws RET
-     *
-     * The RET in skipws returns from wrapper, but that's ok.
-     *
-     * Finally we are able to write loops, and we did not even need to add
-     * anything to our language or CPU to get that to work, we just needed to
-     * look at things differently.  Learning to look at things differently is a
-     * big part of the Forth philosophy.
-     *
-     * We'll see a better way of solving this problem later, in the file
-     * frustration.4th, but for now this is good enough and we can get back to
-     * solving our original problem, skipping whitespace.
-     */
-
-    /* You should now understand what the next two functions are doing
-     * because we just talked about them at length.  In the real program
-     * I swapped the names of the two functions because I wanted to let the
-     * wrapper have the friendly "skipws" name. */
+//@ Note that 0= and subtract are both words, not CPU instructions.
+//@ This makes = the first "pure" Forth word we have defined, with no
+//@ direct dependency on the machine's instruction set.
+//@ We could define `=` as `-` `0=` on a real standards-compliant Forth system
+//@ and it would still work.  So Forth gets you to the point of writing
+//@ "portable" code really quickly.  Often you can reuse routines early in
+//@ bootstrapping even though they were written and tested on a different
+//@ machine.  Many languages offer portability but few offer it so quickly.
+//@
+//@ ## 2.1 - The lexer
+//@
+//@ Now that we've got some basics in place let's go back to solving
+//@ the real problem of getting our language to read words from the
+//@ keyboard.  The first problem we have is that we need some way to
+//@ separate words from each other so we know where one word ends and the
+//@ next begins.  This problem is called
+//@ "[lexing](https://en.wikipedia.org/wiki/Lexical_analysis)".
+//@ Forth has about the simplest lexer ever: it just splits on whitespace.
+//@ Anything with character code <=32 is considered whitespace.  Words are
+//@ delimited by whitespace.  And that is all the syntax Forth has.
+//@
+//@ To read a word from the keyboard you will need to:
+//@
+//@ 1. Advance past any leading whitespace
+//@ 2. Read characters into a buffer until whitespace is seen again.
+//@
+//@ ### Skipping whitespace
+//@
+//@ Let's start with the "advance past leading whitespace" part.
+//@
+//@ The "key" word gives us the latest keystroke as an ASCII code.
+//@ (Really it is reading utf-8 characters one byte at a time but let's
+//@ not get into that right now, pretend the year is 196x, we're sitting
+//@ in front of a minicomputer and utf-8 hasn't been invented yet.)
+//@
+//@ ASCII codes 0 to 32 are whitespace or control characters.  Codes
+//@ 33 and up are letters, numbers and symbols.  So to skip whitespace
+//@ all you need to do is read keys until you get an ASCII code >= 33,
+//@ then return that to tell the rest of the program what key code you
+//@ saw.
+//@
+//@ In Rust this could be implemented as:
+//@
+//@ ```
+//@ // example code, not part of our program
+//@ fn skipws() -> u16 {
+//@     loop {
+//@         let c = key();
+//@         if c >= 33 {
+//@             return c;
+//@         }
+//@     }
+//@ }
+//@ ```
+//@
+//@ Rust has a loop keyword, so this is easy to write.
+//@ (Alarm bells should be ringing in your head at this point because
+//@ we haven't put any looping constructs in our CPU or language.)
+//@
+//@ The literal translation to a typical register-machine assembly
+//@ language would look something like this:
+//@
+//@ ```
+//@ skipws:      call key
+//@              compare r0, 32
+//@              jump_le skipws
+//@              ret
+//@ ```
+//@
+//@ (More alarm bells should be ringing in your head because this is
+//@ using conditional jump, which our CPU doesn't have.)
+//@
+//@ Like last time, is there a way to solve this without conditional
+//@ jump?
+//@
+//@ Here is a start at solving the problem:
+//@
+//@ ```
+//@ skipws ( -- c )
+//@     key          <-- Put keycode on the stack:           ( c )
+//@     DUP          <-- Duplicate top value on the stack:   ( c c )
+//@     Literal(33)  <-- Put 33 on the stack:                ( c c 33 )
+//@     GEQ          <-- Is c >= 33?                         ( c f )
+//@     Q            <-- If so...
+//@     RET          <-- ... return, leaving c on the stack. ( c )
+//@     DRP          <-- Discard c from the stack.           ( )
+//@     skipws       <-- Call skipws again
+//@ ```
+//@
+//@ You will notice there is no RET statement at the end of skipws.
+//@ At the end of skipws we call skipws again.  This makes an infinite
+//@ loop.  The only way out of the loop is the RET instruction in the
+//@ middle.  This works similarly to the Rust code that uses a loop { }
+//@ and breaks out when it sees the condition it's looking for.
+//@
+//@ Writing a word that calls itself is called
+//@ "[recursion](https://en.wikipedia.org/wiki/Recursive_loop)".
+//@
+//@ This code almost works but there is still something wrong with it.
+//@ You'll notice we were careful to make sure "skipws" removed all items
+//@ it added to the data stack, before it called itself.  Its last two
+//@ lines were:
+//@
+//@ ```
+//@ DRP    <-- Discard c from the stack
+//@ skipws <-- Call skipws again
+//@ ```
+//@
+//@ If we didn't do that, skipws would leave each whitespace character
+//@ it saw, on the data stack, as it looped again and again.
+//@ So instead of returning the first nonwhitespace character it would
+//@ return EVERY character it saw.
+//@
+//@ ```
+//@ 1st recursion: data stack: ( c1 )
+//@ 2nd recursion: data stack: ( c1 c2 )
+//@ 3rd recursion: data stack: ( c1 c2 c3 )
+//@ ```
+//@
+//@ There are problems with this.  It's messy.  The caller has no idea
+//@ how many values we are going to leave on the stack, so has no idea
+//@ how many to pop off.  Also, we might see more than 16 whitespace
+//@ characters in a row, which would make weird things happen because
+//@ our CPU's data stack only has room for 16 numbers.
+//@
+//@ For these reasons it's better to leave the data stack as we found it,
+//@ when we do a recursive call.  That is the reason the last two lines are
+//@ DRP, skipws -- it's to stop items building up on the data stack.  The
+//@ final pass through this function goes down a different path that does
+//@ not DRP, so it leaves something on the data stack -- the last key read.
+//@
+//@ The problem skipws still has is that we haven't taken the same care
+//@ with its return stack.
+//@
+//@ At the first line of skipws the return stack looks like this:
+//@
+//@ ```
+//@ ( caller )
+//@ ```
+//@
+//@ That's because skipws must have been called by our CPU's CALL
+//@ instruction (we have no other way of calling subroutines!), and the
+//@ CALL instruction leaves a return address on the top of the return
+//@ stack so RET knows where to return to at the end of the subroutine.
+//@
+//@ But we are also using CALL for a different purpose:  to repeat skipws.
+//@ Every time we repeat skipws, the CALL instruction will push another
+//@ return address to the return stack.
+//@
+//@ ```
+//@ DRP                                 return stack:( caller )
+//@ skipws       <-- Call skipws again. return stack:( caller x )
+//@ <-- This location has address x.
+//@
+//@ first call:    return stack: ( caller )
+//@ 1st recursion: return stack: ( caller x )
+//@ 2nd recursion: return stack: ( caller x x )
+//@ 3rd recursion: return stack: ( caller x x x )
+//@ ```
+//@
+//@ Clearly all these x's are garbage.  When we are done with skipws we
+//@ want to return to our caller, not to x.
+//@
+//@ We could patch over the problem somewhat by putting a RET instruction
+//@ at x.
+//@
+//@ ```
+//@ DRP                                 return stack:( caller )
+//@ skipws       <-- Call skipws again. return stack:( caller x )
+//@ RET          <-- x
+//@ ```
+//@
+//@ This yields working recursive code.
+//@
+//@ Each time we loop, a useless return address x is left on the return
+//@ stack.  When skipws wants to quit, skipws runs a RET instruction, which
+//@ transfers control to x.  x is the address of a RET instruction, left on
+//@ the stack earlier.  So we wind up running RET RET RET ... until we burn
+//@ through all x's on the return stack and finally transfer control back to
+//@ caller.
+//@
+//@ ```
+//@ first call:    return stack: ( caller )         data stack: ( )
+//@ 1st recursion: return stack: ( caller x )       data stack: ( )
+//@ 2nd recursion: return stack: ( caller x x )     data stack: ( )
+//@ 3rd recursion: return stack: ( caller x x x )   data stack: ( c )
+//@ RET:         : return stack: ( caller x x )     data stack: ( c )
+//@ RET:         : return stack: ( caller x )       data stack: ( c )
+//@ RET:         : return stack: ( caller )         data stack: ( c )
+//@ RET:         < control is passed back to our caller,
+//@                and now they can do stuff with the "c" on the data
+//@                stack, yay >
+//@ ```
+//@
+//@ This works.  It isn't very fast but we don't care about speed right
+//@ now, just about getting our computer to work.
+//@
+//@ But there is still a problem.
+//@
+//@ Our CPU has a fixed-size circular return stack that can hold 32 numbers.
+//@ What happens if you loop 32 times or more?  The return stack fills up
+//@ completely with the useless "x" addresses, and the address of caller
+//@ is lost.
+//@
+//@ ```
+//@ recursive call N  :  return stack: ( caller x x x ... x )
+//@ recursive call N+1:  return stack: (      x x x x ... x )  :-(
+//@ ```
+//@
+//@ So skipping 32 or more whitespace characters in a row wouldn't work.
+//@ To fix that problem we need to find a way to stop the useless "x"
+//@ addresses from building up on the return stack.
+//@
+//@ ```
+//@ 1st loop: return stack: ( caller )   data stack: ( )
+//@ 2nd loop: return stack: ( caller )   data stack: ( )
+//@ 3rd loop: return stack: ( caller )   data stack: ( c )
+//@ RET:      < control is passed back to our caller >
+//@ ```
+//@
+//@ The most common solution is
+//@ "[tail call optimization](https://en.wikipedia.org/wiki/Tail_call)".
+//@ If a function's last instruction is a recursive call, that call can be
+//@ replaced with a jump.  On paper this doesn't work very well on our
+//@ computer, for two reasons:
+//@
+//@ 1. Our CPU has no jump, only call.
+//@
+//@ 2. Our assembler, and eventually our interactive environment, would need
+//@    to be smart enough to emit a call sometimes and a jump other times.
+//@    This is the same "look-ahead" problem that we saw with forward
+//@    references -- you don't know that a given CALL will be followed by a
+//@    RET, unless you can see the future.
+//@
+//@    Earlier we decided to keep our assembler very dumb so it would be
+//@    weird to start making it smart now.
+//@
+//@ So what are we going to do?
+//@
+//@ It is possible to get a very, very dumb caveman version of tail call
+//@ optimization, by manually using the "return-from-caller" trick, RTO DRP,
+//@ to "get rid of" the x that is pushed on by the skipws CALL.
+//@
+//@ ```
+//@ skipws ( -- c ) RTO DRP ... Q RET ... skipws
+//@
+//@ 1st loop: return stack: ( caller )   data stack: ( )
+//@ 2nd loop: return stack: ( )          data stack: ( )
+//@ 3rd loop: return stack: ( )          data stack: ( )
+//@ ```
+//@
+//@ So now recursive calls will leave the return-stack as they found it,
+//@ which is good!  We don't have the useless-x problem any more.
+//@ Unfortunately, the first pass through skipws discards the original
+//@ caller's return address, which we wanted to keep.  There is a quick
+//@ hack around that problem: wrap skipws in another subroutine, and
+//@ always call it through that wrapper.
+//@
+//@ ```
+//@ skipws ( -- c ) RTO DRP ... Q RET ... skipws
+//@
+//@ wrapper ( -- c ) skipws RET
+//@ ```
+//@
+//@ The RET in skipws returns from wrapper, but that's ok.
+//@
+//@ Finally we are able to write loops, and we did not even need to add
+//@ anything to our language or CPU to get that to work, we just needed to
+//@ look at things differently.  Learning to look at things differently is a
+//@ big part of the Forth philosophy.
+//@
+//@ We'll see a better way of solving this problem later, in the file
+//@ frustration.4th, but for now this is good enough and we can get back to
+//@ solving our original problem, skipping whitespace.
+//@
+//@ ### Skipping whitespace (for real this time)
+//@
+//@ You should now understand what the next two functions are doing
+//@ because we just talked about them at length.  In the real program
+//@ I swapped the names of the two functions because I wanted to let the
+//@ wrapper have the friendly "skipws" name.
 
     let skip_helper = d.here;
     forth!(RTO, DRP, key, DUP, Literal(33), GEQ, Q, RET, DRP, skip_helper);
@@ -1469,38 +1689,41 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(6, *b"ski");  let skipws = d.here;
     forth!(skip_helper);
 
-    /* Step 1 of the lexer is now working!
-     * We can now discard whitespace characters typed at the keyboard,
-     * i.e. advance to the first character of a word.
-     */
-
-    /* The next stage of lexing is once again going to be more complicated than
-     * any code we've written before, so we are going to need some more helper
-     * words.
-     *
-     * Until now, we have been able to structure our code in such a way that
-     * the next value we need is conveniently stored at the top of the stack.
-     * The most we've had to do is either DUPlicate this value or DRP it
-     * because it's no longer needed.  In more complicated code, sometimes we
-     * will need to "dig through" the values on the stack to surface the one we
-     * want to use next.  This is inefficient and ugly so we will do it as
-     * little as possible, but it will soon be necessary.
-     *
-     * The CPU instruction SWP does stack shuffling by swapping the first
-     * two values on the data stack.  We already have SWP (it's built into the
-     * CPU) but I will write out its stack effect below as a recap of what it
-     * does.
-     *
-     * SWP ( a b -- b a ).
-     *
-     * The problem with SWP is that it can only reach the top two values
-     * on the stack.  If you wanted to dig further, you couldn't do it with
-     * SWP.
-     *
-     * One way of digging further is by using the RTO and TOR instructions
-     * as demonstrated below in the "over" word.
-     */
-
+//@ Step 1 of the lexer is now working!
+//@ We can now discard whitespace characters typed at the keyboard,
+//@ i.e. advance to the first character of a word.
+//@
+//@ ### Reading characters into a buffer
+//@
+//@ The next stage of lexing is once again going to be more complicated than
+//@ any code we've written before, so we are going to need some more helper
+//@ words.
+//@
+//@ Until now, we have been able to structure our code in such a way that
+//@ the next value we need is conveniently stored at the top of the stack.
+//@ The most we've had to do is either DUPlicate this value or DRP it
+//@ because it's no longer needed.  In more complicated code, sometimes we
+//@ will need to "dig through" the values on the stack to surface the one we
+//@ want to use next.  This is inefficient and ugly so we will do it as
+//@ little as possible, but it will soon be necessary.
+//@
+//@ The CPU instruction SWP does stack shuffling by swapping the first
+//@ two values on the data stack.  We already have SWP (it's built into the
+//@ CPU) but I will write out its stack effect below as a recap of what it
+//@ does.
+//@
+//@ ```
+//@ SWP ( a b -- b a ).
+//@ ```
+//@
+//@ The problem with SWP is that it can only reach the top two values
+//@ on the stack.  If you wanted to dig further, you couldn't do it with
+//@ SWP.
+//@
+//@ One way of digging further is by using the RTO and TOR instructions
+//@ as demonstrated below in the "over" word.
+//@
+//@ #### over
     // over ( a b -- a b a )
     d.entry(); d.name(4, *b"ove");  let over = d.here;
     forth!(TOR,  /* data stack: ( a )      return stack: ( caller b ) */
@@ -1509,74 +1732,81 @@ fn build_dictionary(c: &mut Core) {
            SWP,  /* data stack: ( a b a )  return stack: ( caller ) */
            RET);
 
-    /* "over" is a good building block for further stack shuffling words. */
-
+//@ "over" is a good building block for further stack shuffling words.
+//@
+//@ #### 2dup
     // 2dup ( a b -- a b a b )
     d.entry(); d.name(4, *b"2du");  let twodup = d.here;
     forth!(over, over, RET);
 
-    /* Now we can get back to writing the lexer.  Step 2 of lexing is "Read
-     * characters into a buffer until whitespace is seen again", and once that
-     * works we will be done writing the lexer!
-     *
-     * Start by setting aside the word input buffer.  We'll format it as Nabcde
-     * where N is the number of characters stored.
-     */
+//@ #### The input buffer
+//@
+//@ Now we can get back to writing the lexer.  Step 2 of lexing is "Read
+//@ characters into a buffer until whitespace is seen again", and once that
+//@ works we will be done writing the lexer!
+//@
+//@ Start by setting aside the word input buffer.  We'll format it as Nabcde
+//@ where N is the number of characters stored.
 
     let word_buf = d.here;
     d.allot(6);
 
-    /* It may seem strange to be plopping this down in the middle of the
-     * dictionary but it will work fine, just as long as we're setting aside
-     * an even number of bytes.  As mentioned earlier, if you intersperse
-     * instructions and data in memory...
-     *            _________
-     *  ________ |_________| _____________
-     * |________|    Data   |_____________|
-     * Instructions         More instructions
-     *
-     * ...then you will have to be careful to make sure the second block
-     * of instructions also starts at an even numbered address.
-     * You might need to include an extra byte of data as "padding".
-     *
-     * In this case we set aside one byte for length and five bytes for
-     * characters, which is a total of six bytes, so no padding is needed.
-     */
-
-    /* We are about to do some buffer handling so we want bounds checking.
-     * Let's write a min-value word.  It will look at the top two items
-     * on the stack and return whichever is less.
-     *
-     * This word is simple enough that I'm not going to walk through it
-     * like I did with some of the earlier words.  If you want to understand
-     * how it works I recommend walking through it on paper or in your head.
-     * With a little practice this will become as natural as walking through
-     * code in any other language.
-     */
+//@ It may seem strange to be plopping this down in the middle of the
+//@ dictionary but it will work fine, just as long as we're setting aside
+//@ an even number of bytes.  As mentioned earlier, if you intersperse
+//@ instructions and data in memory...
+//@
+//@ ```
+//@            _________
+//@  ________ |_________| _____________
+//@ |________|    Data   |_____________|
+//@ Instructions         More instructions
+//@ ```
+//@
+//@ ...then you will have to be careful to make sure the second block
+//@ of instructions also starts at an even numbered address.
+//@ You might need to include an extra byte of data as "padding".
+//@
+//@ In this case we set aside one byte for length and five bytes for
+//@ characters, which is a total of six bytes, so no padding is needed.
+//@
+//@ We are about to do some buffer handling so we want bounds checking.
+//@ Let's write a min-value word.  It will look at the top two items
+//@ on the stack and return whichever is less.
+//@
+//@ This word is simple enough that I'm not going to walk through it
+//@ like I did with some of the earlier words.  If you want to understand
+//@ how it works I recommend walking through it on paper or in your head.
+//@ With a little practice this will become as natural as walking through
+//@ code in any other language.
+//@
+//@ #### min
 
     // min ( a b -- n )
     d.entry(); d.name(3, *b"min");  let min = d.here;
     forth!(twodup, GEQ, Q, SWP, DRP, RET);
 
-    /* We want to access the buffer byte-by-byte, but our machine only
-     * accesses memory 16 bits at a time.
-     *
-     * Reading one byte at a time is pretty easy, just do a 16-bit read and
-     * discard the high byte with Literal(0xFF) AND. */
+//@ #### c@ and c! (byte-by-byte memory access)
+//@
+//@ We want to access the buffer byte-by-byte, but our machine only
+//@ accesses memory 16 bits at a time.
+//@
+//@ Reading one byte at a time is pretty easy, just do a 16-bit read and
+//@ discard the high byte with Literal(0xFF) AND.
 
     // c@ ( a -- n )
     d.entry(); d.name(2, *b"c@ ");  let cld = d.here;
     forth!(LD, Literal(0xff), AND, RET);
 
-    /* To write one byte at a time, we'll take the approach of reading two
-     * bytes, editing just the low byte, and then writing the full two-byte
-     * value back to memory.  The high byte gets unnecessarily rewritten but
-     * we are writing back its old value so no one will know the difference.
-     *
-     * If our CPU was multi-core, or had interrupts, there could be some
-     * problems with this approach (search the Internet for "non-atomic
-     * read-modify-write"), but ours isn't, so we are fine.
-     */
+//@ To write one byte at a time, we'll take the approach of reading two
+//@ bytes, editing just the low byte, and then writing the full two-byte
+//@ value back to memory.  The high byte gets unnecessarily rewritten but
+//@ we are writing back its old value so no one will know the difference.
+//@
+//@ If our CPU was multi-core, or had interrupts, there could be some
+//@ problems with this approach (search the Internet for
+//@ "[non-atomic read-modify-write](https://en.wikipedia.org/wiki/Linearizability)"),
+//@ but ours is neither multi-core nor interrupt-driven, so we are fine.
 
     // c! ( n a -- )
     d.entry(); d.name(2, *b"c! ");  let cst = d.here;
@@ -1590,6 +1820,10 @@ fn build_dictionary(c: &mut Core) {
            ST,                  /* ( )                    r: ( caller )   */
            RET);
 
+//@ #### Filling the input buffer
+//@
+//@ Now we have everything we need to fill the input buffer one byte at a time:
+
     /* Load 1 letter into the buffer. */
     let stchar = d.here;
     forth!(Literal(word_buf), cld,  /* Retrieve the first byte of the buffer,
@@ -1620,12 +1854,9 @@ fn build_dictionary(c: &mut Core) {
            cst,  /* Store the letter in the buffer */
            RET);
 
-    /* Function to load letters into buffer until whitespace is hit again.
-     * Return the whitespace character that was seen.
-     *
-     * This will tail-recursively call the function we just wrote, until
-     * whitespace is seen again (a character code that is <= 32).
-     */
+//@ Parsing a whole word is not much harder.  Just tail-recursively call
+//@ the function we just wrote, until whitespace is seen again (a character
+//@ code that is <= 32).
 
     let getcs_helper = d.here;
     forth!(RTO, DRP, /* The "return-from-caller" trick */
@@ -1633,20 +1864,23 @@ fn build_dictionary(c: &mut Core) {
            key, DUP, Literal(32), SWP, GEQ, Q, RET,
            getcs_helper);
 
+//@ This also returns the whitespace character that was seen,
+//@ although we won't do much with it.
+
     // getcs ( -- c )
     d.entry(); d.name(5, *b"get");  let getcs = d.here;
     forth!(getcs_helper, RET);
 
-    /* The lexer is almost done, now we'll write the word that the rest of the
-     * program will use to call it.
-     *
-     * This word is named "word".
-     *
-     * First, it clears word_buf by setting its length byte to 0 and
-     * padding out the first three name bytes by setting them to 32 (space).
-     *
-     * Then, reads a word from the keyboard into the word_buf.
-     */
+//@ #### word
+//@
+//@ The lexer is almost done, now we'll write the word that the rest of the
+//@ program will use to call it.
+//@
+//@ This word is named "word".
+//@
+//@ First, it clears word_buf by setting its length byte to 0 and
+//@ padding out the first three name bytes by setting them to 32 (space).
+//@ Then it reads a word from the keyboard into word_buf.
 
     // word ( -- )
     d.entry(); d.name(4, *b"wor");  let word = d.here;
@@ -1669,25 +1903,27 @@ fn build_dictionary(c: &mut Core) {
                       so drop it */
         RET);
 
-    /* The lexer is now complete: we can read space-delimited words from
-     * the keyboard.
-     *
-     * This took a long while, because we had to figure out how to do things
-     * like branching and looping, while also figuring out how to write the
-     * lexer itself.
-     * But now our dictionary is filled with useful helper words so our next
-     * steps will be faster to write.
-     */
-
-    /* Let's move on to dictionary lookup, so we can do something useful with
-     * the space-delimited words we now know how to read from the keyboard.
-     *
-     * To do dictionary lookup we first need to keep track of where the
-     * dictionary is, so let's teach Forth about the dictionary pointer (dp)
-     * variable that we've so far been tracking in Rust.
-     *
-     * The traditional Forth name for this variable is "latest".
-     */
+//@ The lexer is now complete: we can read space-delimited words from
+//@ the keyboard.
+//@
+//@ This took a long while, because we had to figure out how to do things
+//@ like branching and looping, while also figuring out how to write the
+//@ lexer itself.
+//@ But now our dictionary is filled with useful helper words so our next
+//@ steps will be faster to write.
+//@
+//@ ## 2.2 - Dictionary lookup
+//@
+//@ Let's move on to dictionary lookup, so we can do something useful with
+//@ the space-delimited words we now know how to read from the keyboard.
+//@
+//@ ### latest
+//@
+//@ To do dictionary lookup we first need to keep track of where the
+//@ dictionary is, so let's teach Forth about the dictionary pointer (dp)
+//@ variable that we've so far been tracking in Rust.
+//@
+//@ The traditional Forth name for this variable is "latest".
 
     // latest ( -- a )
     /* Address of "latest" variable.  This variable stores the address of
@@ -1696,29 +1932,32 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(6, *b"lat");  let latest = d.here;
     forth!(Literal(latest_ptr), RET);
 
-    /* Now we will write "find" which is the word that does dictionary
-     * lookup.  Dictionary lookup is a linked list traversal starting
-     * at latest (the end of the dictionary).  For each dictionary entry, we
-     * compare its name against the name that "word" placed in the input
-     * buffer.  If it matches, we return the address of this dictionary entry's
-     * code field.  Otherwise we advance to the previous dictionary entry and
-     * try again.  If we don't match anything before we hit address 0 (the
-     * start of the dictionary) that means the name in the input buffer
-     * was not found in the dictionary.
-     *
-     * The stack effect of find will be:
-     *
-     * find ( -- xt|0 )
-     *
-     * It's time to explain a couple more conventions often used in stack
-     * effect comments:
-     *
-     * - xt is "execution token".  In our Forth, "execution token" just means
-     *   the address of some code.
-     *
-     * - A vertical bar | means "or".  So find will return either an execution
-     *   token, or 0 if no execution token is found.
-     */
+//@ ### find
+//@
+//@ Now we will write "find" which is the word that does dictionary
+//@ lookup.  Dictionary lookup is a linked list traversal starting
+//@ at latest (the end of the dictionary).  For each dictionary entry, we
+//@ compare its name against the name that "word" placed in the input
+//@ buffer.  If it matches, we return the address of this dictionary entry's
+//@ code field.  Otherwise we advance to the previous dictionary entry and
+//@ try again.  If we don't match anything before we hit address 0 (the
+//@ start of the dictionary) that means the name in the input buffer
+//@ was not found in the dictionary.
+//@
+//@ The stack effect of find will be:
+//@
+//@ ```
+//@ find ( -- xt|0 )
+//@ ```
+//@
+//@ It's time to explain a couple more conventions often used in stack
+//@ effect comments:
+//@
+//@ - xt is "execution token".  In our Forth, "execution token" just means
+//@   the address of some code.
+//@
+//@ - A vertical bar | means "or".  So find will return either an execution
+//@   token, or 0 if no execution token is found.
 
     /* Helper word ( a -- f )
      */
@@ -1767,48 +2006,47 @@ fn build_dictionary(c: &mut Core) {
         DUP, matches, Q, matched,    /* Match - return the code address */
         LD, find_helper);            /* Try the next one */
 
-    /* And find itself is just a wrapper around the tail-recursive
-     * find_helper word. */
+//@ And find itself is just a wrapper around the tail-recursive
+//@ find_helper word.
 
     // find ( -- xt|0 )
     d.entry(); d.name(4, *b"fin");  let find = d.here;
     forth!(latest, LD, find_helper);
 
-    /* The ' (quote) word reads the next word from the keyboard and then looks
-     * it up in the dictionary.  It works very similarly to the "address-of"
-     * operator in C.  ' fn in Forth is like &fn in C.
-     */
+//@ ### ' (quote)
+//@
+//@ The ' (quote) word reads the next word from the keyboard and then looks
+//@ it up in the dictionary.  It works very similarly to the "address-of"
+//@ operator in C.  ' fn in Forth is like &fn in C.
 
     // ' ( -- xt|0 )
     d.entry(); d.name(1, *b"'  ");  let quote = d.here;
     forth!(word, find, RET);
 
-    /* -----------------------------------------------------------------------
-     * Part 2b - The outer interpreter
-     *---------------------------------------------------------------------- */
-
-    /* We can now look up a subroutine in the dictionary by typing its name
-     * at the keyboard.
-     *
-     * Remember that an interactive programming environment needs to let you
-     * do two things:
-     *
-     * 1. Call subroutines by typing their name at the keyboard
-     * 2. Define new subroutines in terms of existing ones
-     *
-     * We're also going to succumb to temptation at this point and add a third
-     * feature to our language.
-     *
-     * 3. Push numbers onto the data stack by typing them at the keyboard
-     *
-     * We haven't achieved any of these three goals yet, but we now have all
-     * of the building blocks we need to do so.
-     */
-
-    /* To add words to the dictionary we'll need to keep track of where the
-     * end of the dictionary is, so let's teach Forth about the "here"
-     * variable that we've so far been tracking in Rust.
-     */
+//@ ## 2.3 - The outer interpreter
+//@
+//@ We can now look up a subroutine in the dictionary by typing its name
+//@ at the keyboard.
+//@
+//@ Remember that an interactive programming environment needs to let you
+//@ do two things:
+//@
+//@ 1. Call subroutines by typing their name at the keyboard
+//@ 2. Define new subroutines in terms of existing ones
+//@
+//@ We're also going to succumb to temptation at this point and add a third
+//@ feature to our language.
+//@
+//@ - 3. Push numbers onto the data stack by typing them at the keyboard
+//@
+//@ We haven't achieved any of these three goals yet, but we now have all
+//@ of the building blocks we need to do so.
+//@
+//@ To add words to the dictionary we'll need to keep track of where the
+//@ end of the dictionary is, so let's teach Forth about the "here"
+//@ variable that we've so far been tracking in Rust.
+//@
+//@ ### here
 
     // here ( -- a )
     /* Address of "here" variable.  This variable stores the address of
@@ -1817,28 +2055,30 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(4, *b"her");  let here = d.here;
     forth!(Literal(here_ptr), RET);
 
-    /* Let's talk a little bit about how we are going to make our Forth
-     * interactive.  We want to do one of two things:
-     *
-     * 1. Call subroutines by typing their name at the keyboard
-     * 2. Define new subroutines in terms of existing ones
-     *
-     * Both of these things are structurally similar.  We can solve either
-     * problem by reading a list of words from the keyboard and doing something
-     * with each word.
-     *
-     * First we look up the word in the dictionary, then we either:
-     * 1. Execute it right now        (if we are in interpreting mode).
-     * 2. Append it to the dictionary (if we are in compiling mode).
-     *
-     * Numbers can be handled in a similar way.  If we encounter a number
-     * in interpreting mode, we'll put it on the stack.  If we encounter a
-     * number in compiling mode, we'll compile a LITERAL instruction that
-     * will put the number on the stack when executed.
-     *
-     * It seems a pretty good bet that we'll be able to solve our problem
-     * with an interpreting/compiling mode flag, so let's make one.
-     */
+//@ ### Achieving interactivity
+//@
+//@ Let's talk a little bit about how we are going to make our Forth
+//@ interactive.  We want to do one of two things:
+//@
+//@ 1. Call subroutines by typing their name at the keyboard
+//@ 2. Define new subroutines in terms of existing ones
+//@
+//@ Both of these things are structurally similar.  We can solve either
+//@ problem by reading a list of words from the keyboard and doing something
+//@ with each word.
+//@
+//@ First we look up the word in the dictionary, then we either:
+//@
+//@ 1. Execute it right now        (if we are in interpreting mode).
+//@ 2. Append it to the dictionary (if we are in compiling mode).
+//@
+//@ Numbers can be handled in a similar way.  If we encounter a number
+//@ in interpreting mode, we'll put it on the stack.  If we encounter a
+//@ number in compiling mode, we'll compile a LITERAL instruction that
+//@ will put the number on the stack when executed.
+//@
+//@ It seems a pretty good bet that we'll be able to solve our problem
+//@ with an interpreting/compiling mode flag, so let's make one.
 
     // state ( -- a )
     /* Address of "state" variable.  This variable stores -1 if
@@ -1847,38 +2087,46 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(5, *b"sta");  let state = d.here;
     forth!(Literal(state_ptr), RET);
 
-    /* We need a way of switching between interpreting and compiling mode.
-     *
-     * If you are interpreting, this is easy -- just write 0 to state.
-     *
-     * If you are compiling, it is not so easy to go back into interpreting
-     * mode, because everything you type gets compiled.  There is no way to
-     * execute a word when you are in compiling mode, so you are stuck
-     * compiling forever.
-     *
-     * What if there was a way to execute a word in compiling mode?
-     *
-     * We will define a special category of words called "immediate" words
-     * that are executed whenever they are seen, even if you are in compiling
-     * mode.
-     *
-     * We will mark a word as "immediate" by setting the high bit of the
-     * length byte, in the name field of its dictionary entry.
-     *
-     * ----+---+---+---+---+---+---+---+
-     * | i | n | n | n | n | n | n | n |
-     * ----+---+---+---+---+---+---+---+
-     * - nnnnnnn = length (0 to 127)
-     * - i       = "immediate" bit (1 = immediate, 0 = ordinary)
-     *
-     * Do you remember the bit math in "find" that I told you to not worry
-     * about just yet?
-     *
-     * Literal(0x0080), INV, AND
-     *
-     * This math was masking out the "immediate" flag so it would not interfere
-     * with dictionary lookup.
-     */
+//@ We need a way of switching between interpreting and compiling mode.
+//@
+//@ If you are interpreting, this is easy -- just write 0 to state.
+//@
+//@ If you are compiling, it is not so easy to go back into interpreting
+//@ mode, because everything you type gets compiled.  There is no way to
+//@ execute a word when you are in compiling mode, so you are stuck
+//@ compiling forever.
+//@
+//@ What if there was a way to execute a word in compiling mode?
+//@
+//@ We will define a special category of words called "immediate" words
+//@ that are executed whenever they are seen, even if you are in compiling
+//@ mode.
+//@
+//@ We will mark a word as "immediate" by setting the high bit of the
+//@ length byte, in the name field of its dictionary entry.
+//@
+//@ ```
+//@ ----+---+---+---+---+---+---+---+
+//@ | i | n | n | n | n | n | n | n |
+//@ ----+---+---+---+---+---+---+---+
+//@ ```
+//@
+//@ - nnnnnnn = length (0 to 127)
+//@ - i       = "immediate" bit (1 = immediate, 0 = ordinary)
+//@
+//@ Do you remember the bit math in "find" that I told you to not worry
+//@ about just yet?
+//@
+//@ ```
+//@ Literal(0x0080), INV, AND
+//@ ```
+//@
+//@ This math was
+//@ [masking out](https://en.wikipedia.org/wiki/Bit_mask)
+//@ the "immediate" flag so it would not interfere
+//@ with dictionary lookup.
+//@
+//@ ### immediate
 
     /* Helper function to get the address of the latest dictionary entry */
     let word_addr = d.here;
@@ -1889,8 +2137,10 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(9, *b"imm");
     forth!(word_addr, DUP, LD, Literal(0x0080), OR, SWP, ST, RET);
 
-    /* Now we can define words to switch between interpreting and compiling
-     * mode.  The names [ and ] are traditional Forth names. */
+//@ ### [ and ]
+//@
+//@ Now we can define words to switch between interpreting and compiling
+//@ mode.  The names [ and ] are traditional Forth names.
 
     // [ ( -- )
     d.entry();
@@ -1908,9 +2158,10 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(1 | 0x80, *b"]  ");  let rbracket = d.here;
     forth!(Literal(0), state, ST, RET);
 
-    /* By setting / unsetting a different bit of the name field we can
-     * temporarily hide a word from name lookups.  We will talk more
-     * about this later. */
+//@ ### smudge and unsmudge
+//@
+//@ By setting a different bit of the name field we can temporarily hide a
+//@ word from name lookups.  We will talk more about this later.
 
     // smudge ( -- )
     d.entry(); d.name(6 | 0x80, *b"smu");  let smudge = d.here;
@@ -1920,26 +2171,30 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(8 | 0x80, *b"uns");  let unsmudge = d.here;
     forth!(word_addr, DUP, LD, Literal(0x0040), INV, AND, SWP, ST, RET);
 
-    /* Now let's make a word that appends to the dictionary.
-     * We have had a Rust helper function for this for a long time.
-     * The word below is the same thing but callable from Forth. */
+//@ ### , (comma)
+//@
+//@ Now let's make a word that appends to the dictionary.
+//@ We have had a Rust helper function for this for a long time.
+//@ The word below is the same thing but callable from Forth.
 
     // , ( n -- )
     d.entry(); d.name(1, *b",  "); let comma = d.here;
     forth!(here, LD, ST,
            here, LD, Literal(2), ADD, here, ST, RET);
 
-    /* We will read numbers the same way we read words:  from the input
-     * buffer.  This, incidentally, is why we chose to reserve space for five
-     * characters in the input buffer, even though we only needed to store
-     * three for word lookup.  The largest 16-bit number will fit in five
-     * decimal digits.
-     *
-     * Our numbers will be base-10.  To build up a base-10 number digit by
-     * digit, we'll need to be able to multiply by 10.  Our CPU has no multiply
-     * but it does have bit shift, which can be used to multiply or divide an
-     * unsigned value by any power of two.
-     */
+//@ ### number
+//@
+//@ We will read numbers the same way we read words:  from the input
+//@ buffer.  This, incidentally, is why we chose to reserve space for five
+//@ characters in the input buffer, even though we only needed to store
+//@ three for word lookup.  The largest 16-bit number will fit in five
+//@ decimal digits.
+//@
+//@ Our numbers will be base-10.  To build up a base-10 number digit by
+//@ digit, we'll need to be able to multiply by 10.  Our CPU has no multiply
+//@ but it does have bit shift, which can be used to multiply or divide an
+//@ unsigned value by any power of two.
+
 
     // x10 ( n -- n*10 )
     d.entry(); d.name(3, *b"x10");  let x10 = d.here;
@@ -1948,8 +2203,8 @@ fn build_dictionary(c: &mut Core) {
         ADD, ADD,                  /* (n*8) + n + n = (n*10) */
         RET);
 
-    /* Now we can write a word that goes through the input buffer
-     * character by character and converts it to an integer on the stack. */
+//@ Now we can write a word that goes through the input buffer
+//@ character by character and converts it to an integer on the stack.
 
     /* Helper function to clear junk off the stack. */
     let end_num = d.here;
@@ -1983,6 +2238,14 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(6, *b"num");  let number = d.here;
     forth!(Literal(0), Literal(1), number_helper);
 
+//@ ### literal
+//@
+//@ To compile an integer, we'll want to convert it to a LITERAL
+//@ instruction in the dictionary.  Bear in mind that only numbers 0-32767
+//@ can be directly stored in a LITERAL instruction.  This code makes no
+//@ attempt to automatically perform the LITERAL INV trick -- that's left
+//@ up to the programmer.
+
     /* Compile a number */
     d.entry(); d.name(3, *b"lit");  let lit = d.here;
     forth!(DUP, ADD, Literal(1), ADD, comma, RET);
@@ -2003,6 +2266,10 @@ fn build_dictionary(c: &mut Core) {
         /* and then return-from-caller. */
         RTO, DRP, RET);
 
+//@ Similarly, to compile a word, we'll want to convert from an execution
+//@ token (xt) on the stack to a CALL instruction in the dictionary --
+//@ unless it's an immediate word, which we need to execute right now.
+
     // Helper function to compile a call ( xt -- xt? )
     let try_compile_call = d.here;
     forth!(
@@ -2038,18 +2305,19 @@ fn build_dictionary(c: &mut Core) {
         /* otherwise, execute it. */
         execute, RET);
 
-    /* Forth can have very good error handling.  This Forth does not.
-     * If we try to look up a word in the dictionary and can't find it,
-     * and if the word also can't be parsed as an number,
-     * then we print out a ? and move on to the next word.
-     *
-     * This helper function does some stack cleanup, prints the ?, then
-     * uses the return-from-caller trick to move on to the next word.
-     */
+//@ Forth can have very good error handling.  This Forth does not.
+//@ If we try to look up a word in the dictionary and can't find it,
+//@ and if the word also can't be parsed as a number,
+//@ then we print out a ? and move on to the next word.
+//@
+//@ This helper function does some stack cleanup, prints the ?, then
+//@ uses the return-from-caller trick to move on to the next word.
+
     let bad = d.here;
     forth!(DRP, Literal(63), emit, RTO, DRP, RET);
 
-    /* Figure out what to do with the contents of the input buffer.  */
+//@ Given all that, here's an all-in-one subroutine that figures out what to do
+//@ with the contents of the input buffer.
 
     // dispatch ( xt -- )
     d.entry(); d.name(9, *b"int");  let dispatch = d.here;
@@ -2064,24 +2332,22 @@ fn build_dictionary(c: &mut Core) {
         /* If it is a number, treat it as a number. */
         try_compile_lit, RET);
 
-    /* And now we can write the main interpreter/compiler loop.
-     *
-     * This is the top-level code for our entire Forth system!
-     *
-     * Forth names this "quit", for the reason that calling "quit" in
-     * the middle of a compiled program is a reasonable way to bring
-     * you back to top-level.
-     *
-     * "quit" is called the "outer interpreter" because it is the outermost
-     * interpreter loop that Forth uses.  Some Forth implementations also
-     * use an "inner interpreter" to execute their threaded code.  Our Forth
-     * does not have an inner interpreter because we used subroutine
-     * threading, making our threaded code a list of subroutine calls that
-     * can be directly executed by the CPU.
-     *
-     * Let's look at what "quit" does.  We've already done all the hard work
-     * so it can be quite short.
-     */
+//@ And now we can write the main interpreter/compiler loop.
+//@ This is the top-level code for our entire Forth system!
+//@ Forth names this "quit", because you'd expect putting the word
+//@ "quit" in the middle of a compiled program to bring you back
+//@ to top-level.
+//@
+//@ "quit" is called the "outer interpreter" because it is the outermost
+//@ interpreter loop that Forth uses.  Some Forth implementations also
+//@ use an "inner interpreter" to execute their threaded code.  Our Forth
+//@ does not have an inner interpreter because we used subroutine
+//@ threading, making our threaded code a list of subroutine calls that
+//@ can be directly executed by the CPU.
+//@
+//@ Let's look at what "quit" does.  We've already done all the hard work
+//@ so it can be quite short.
+
 
     // quit ( -- )
     d.entry(); d.name(4, *b"qui");  let quit = d.here;
@@ -2092,30 +2358,31 @@ fn build_dictionary(c: &mut Core) {
         dispatch, /* Figure out what to do with the word */
 
         quit      /* Repeat forever */
-
-        /* You might have noticed that "quit" isn't tail-recursive -- it
-         * just calls itself normally.  "quit" is never supposed to return
-         * so it doesn't matter for it to properly maintain the return stack.
-         * It will just fill up the circular stack and wrap around.  That's
-         * fine.
-         */
     );
 
-    /* We now have an interpreter that can compile or execute code!!!
-     *
-     * We have now succeeded at:
-     *
-     * 1. Call subroutines by typing their name at the keyboard
-     * 3. Push numbers onto the data stack by typing them at the keyboard
-     *
-     * But there are still a few more words we'll need if we want to:
-     *
-     * 2. Define new subroutines in terms of existing ones
-     *
-     * Let's take care of that now.
-     */
-
-    /* Here is a word to create a new dictionary header. */
+//@ You might have noticed that "quit" isn't tail-recursive -- it
+//@ just calls itself normally.  "quit" is never supposed to return
+//@ so it doesn't need to properly maintain the return stack.
+//@ It will just fill up the circular stack and wrap around.  That's
+//@ fine.
+//@
+//@ We now have an interpreter that can compile or execute code!!!
+//@
+//@ We have now succeeded at:
+//@
+//@ - 1. Call subroutines by typing their name at the keyboard
+//@ - 3. Push numbers onto the data stack by typing them at the keyboard
+//@
+//@ ## 2.4 - Defining subroutines
+//@
+//@ There are still a few more words we'll need if we want to:
+//@
+//@ - 2. Define new subroutines in terms of existing ones
+//@
+//@ Let's take care of that now.
+//@
+//@ #### create
+//@ Here is a word to create a new dictionary header.
 
     // create ( -- )
     d.entry(); d.name(6, *b"cre");  let create = d.here;
@@ -2130,7 +2397,9 @@ fn build_dictionary(c: &mut Core) {
 
         RET);
 
-    /* And now, here is the word to compile a new Forth word. */
+//@ #### : (define word)
+//@
+//@ Here is the word to compile a new Forth word.
 
     // : ( -- )
     d.entry(); d.name(1, *b":  ");
@@ -2146,10 +2415,12 @@ fn build_dictionary(c: &mut Core) {
         rbracket,
         RET);
 
-    /* And here is ;, the "end" marker that ends the Forth word.
-     * Note that ; is immediate, as it has to switch us from compiling mode
-     * back into interpreting mode.
-     */
+//@ #### ; (end of definition)
+//@
+//@ Finally, here is semicolon, the "end" marker that ends the Forth word.
+//@ Note that ; is immediate, as it has to switch us from compiling mode
+//@ back into interpreting mode.
+
     // ; ( -- )
     d.entry(); d.name(1 | 0x80, *b";  ");
     forth!(
@@ -2166,12 +2437,14 @@ fn build_dictionary(c: &mut Core) {
 
         RET);
 
-    /* Put the CPU instructions into dictionary words so we can call them
-     * interactively from Forth.  Instructions that modify the return stack
-     * need special care, because otherwise they will mess up the
-     * wrapper we created for them, instead of acting on the caller
-     * the way they are supposed to.
-     */
+//@ ### Miscellanea
+//@
+//@ Wrap up the CPU instructions into dictionary words so we can call them
+//@ interactively from Forth.  Instructions that modify the return stack
+//@ need special care, because otherwise they will mess up the
+//@ wrapper we created for them, instead of acting on the caller
+//@ the way they are supposed to.
+
     d.entry(); d.name(3, *b"ret"); forth!(RTO, DRP, RET);
     d.entry(); d.name(2, *b">r "); forth!(RTO, SWP, TOR, TOR, RET);
     d.entry(); d.name(2, *b"r> "); forth!(RTO, RTO, SWP, TOR, RET);
@@ -2192,72 +2465,85 @@ fn build_dictionary(c: &mut Core) {
     d.entry(); d.name(3, *b"u>="); forth!(GEQ, RET);
     d.entry(); d.name(2, *b"io "); forth!(IO, RET);
 
-    /* Update Forth's "latest" and "here" variables to match the ones
-     * we've been tracking in Rust.
-     */
+//@ Update Forth's "latest" and "here" variables to match the ones
+//@ we've been tracking in Rust.
+
     d.c.store(latest_ptr, d.dp);
     d.c.store(here_ptr, d.here);
 
-    /* Start out in interpreting mode.
-    */
+//@ Start out in interpreting mode.
+
     d.c.store(state_ptr, 0xffff);
 
-    /* The "entry point" should be the top level interpreter word "quit".
-    */
+//@ Put a call to the outer interpreter at the CPU's
+//@ [reset vector](https://en.wikipedia.org/wiki/Reset_vector).
+
     d.c.store(0, quit);
 }
 
+//@ Finally, start the machine.
+
 fn main() {
     /* Create the machine */
-    let mut c = new_core();
+    let mut c = Core::new();
 
     /* Put the dictionary into memory */
     build_dictionary(&mut c);
 
-    /* Run Forth */
+    /* Start running the CPU from the reset vector */
     c.ip = 0;
     loop {
         c.step();
     }
 }
 
-/* ---------------------------------------------------------------------------
- *           Part 3 - Using the interactive programming environment
- * ------------------------------------------------------------------------- */
-
-/* "The next step is a problem-oriented-language. By permitting
- * the program to dynamically modify its control language, we
- * mark a qualitative change in capability. We also change our
- * attention from the program to the language it implements.
- * This is an important, and dangerous, diversion. For it's
- * easy to lose sight of the problem amidst the beauty of the
- * solution."
- *
- * -- Chuck Moore, "Programming a Problem-Oriented Language", 1970
- */
-
-/* Now we can start programming in "real" Forth, not a weird macro language
- * inside Rust.
- *
- * You can compile our Forth computer with:
- *     rustc frustration.rs
- *
- * You can run our Forth computer with:
- *     ./frustration
- *
- * However, I recommend loading a Forth program (frustration.4th, provided)
- * which does a few more setup steps before letting you loose.
- *
- *     cat frustration.4th - | ./frustration
- *
- * The line above is a good way to run Frustration if you're using Linux.
- * It concatenates together frustration.4th and - (stdin).  This means you
- * can type commands once frustration.4th has been executed.
- *
- * There is a shell script supplied that will do all of the above for you.
- *
- *     bash build.sh
- *
- * Please read frustration.4th if you want to learn more about how to
- * use Forth.
- */
+//@ ## Part 3 - Using the interactive programming environment
+//@
+//@ > "The next step is a problem-oriented-language. By permitting
+//@ > the program to dynamically modify its control language, we
+//@ > mark a qualitative change in capability. We also change our
+//@ > attention from the program to the language it implements.
+//@ > This is an important, and dangerous, diversion. For it's
+//@ > easy to lose sight of the problem amidst the beauty of the
+//@ > solution."
+//@ > 
+//@ > -- Chuck Moore,
+//@ > ["Programming a Problem-Oriented Language"](https://colorforth.github.io/POL.htm),
+//@ > 1970
+//@
+//@
+//@ Now we can start programming in "real" Forth, not a weird macro language
+//@ inside Rust.
+//@
+//@ You can compile our Forth computer with:
+//@
+//@ ```
+//@     rustc frustration.rs
+//@ ```
+//@
+//@ You can run our Forth computer with:
+//@
+//@ ```
+//@     ./frustration
+//@ ```
+//@
+//@ However, I recommend loading a Forth program (frustration.4th, provided)
+//@ which does a few more setup steps before letting you loose.
+//@
+//@ ```
+//@     cat frustration.4th - | ./frustration
+//@ ```
+//@
+//@ The line above is a good way to run Frustration if you're using Linux.
+//@ It concatenates together frustration.4th and - (stdin).  This means you
+//@ can type commands once frustration.4th has been executed.
+//@
+//@ There is a shell script supplied that will do all of the above for you.
+//@
+//@ ```
+//@     bash build.sh
+//@ ```
+//@
+//@ Please read
+//@ [frustration.4th](./frustration.4th)
+//@ if you want to learn more about how to use Forth.