DmitrySoshnikov · March 27, 2024 07:24 · Sep 1, 2017 · Sep 1, 2017 · Sep 7, 2015 · Sep 6, 2015
diff --git a/LL1-parser-first-follow-sets.js b/LL1-parser-first-follow-sets.js
@@ -28,7 +28,7 @@
  * of `A`, and whenever we have symbol `A` on the stack, and symbol `a` in
  * the buffer, we should use `A -> aB` production. If instead we have a
  * non-terminal on the right hand side, like e.g. in `A -> BC`, then in order
- * to calculate first set of `A`, we should calculate first set of `B`, and
+ * to calculate first set of `A`, we should calculate first set of `BC`, and
  * then merge it to `A`.
  *
  * Follow sets:

diff --git a/LL1-parser-first-follow-sets.js b/LL1-parser-first-follow-sets.js
@@ -1,6 +1,9 @@
 /**
  * LL(1) parser. Building parsing table, part 1: First and Follow sets.
  *
+ * NOTICE: see full implementation in the Syntax tool, here:
+ * https://github.com/DmitrySoshnikov/syntax/blob/master/src/sets-generator.js
+ *
  * by Dmitry Soshnikov <[email protected]>
  * MIT Style License
  *

diff --git a/LL1-parser-first-follow-sets.js b/LL1-parser-first-follow-sets.js
@@ -142,7 +142,7 @@ function getProductionsForSymbol(symbol) {
 }
 
 /**
- * Given production `S -> F`, returns `F`.
+ * Given production `S -> F`, returns `S`.
  */
 function getLHS(production) {
   return production.split('->')[0].replace(/\s+/g, '');

diff --git a/LL1-parser-first-follow-sets.js b/LL1-parser-first-follow-sets.js
@@ -1,19 +1,42 @@
 /**
  * LL(1) parser. Building parsing table, part 1: First and Follow sets.
- * 
+ *
+ * by Dmitry Soshnikov <[email protected]>
+ * MIT Style License
+ *
  * An LL(1)-parser is a top-down, fast predictive non-recursive parser,
  * which uses a state-machine parsing table instead of recursive calls
  * to productions as in a recursive descent parser.
  *
  * We described the work of such a parser in:
  * https://gist.github.com/DmitrySoshnikov/29f7a9425cdab69ea68f
- * 
+ *
  * There we used manually pre-built parsing table. In this diff we implement
  * an automatic solution for generating a parsing table, and consider the
  * first part of it: building First and Follow sets.
  *
- * by Dmitry Soshnikov <[email protected]>
- * MIT Style License
+ * First and Follow sets are used to determine which next production to use
+ * if we have symbol `A` on the stack, and symbol `a` in the buffer.
+ *
+ * First sets:
+ *
+ * First sets are everything that stands in the first position in a derivation.
+ * If we have a production `A -> aB`, then the symbol `a` is in the first set
+ * of `A`, and whenever we have symbol `A` on the stack, and symbol `a` in
+ * the buffer, we should use `A -> aB` production. If instead we have a
+ * non-terminal on the right hand side, like e.g. in `A -> BC`, then in order
+ * to calculate first set of `A`, we should calculate first set of `B`, and
+ * then merge it to `A`.
+ *
+ * Follow sets:
+ *
+ * Follow sets are used when a symbol can be `ε` ("empty" symbol known as
+ * epsilon). In a productions like: `A -> bXa`, if `X` can be `ε`, then it'll
+ * be eliminated, and we still will be able to derive `a` which follows `X`.
+ * So we say that `a` is in follow set of `X`.
+ *
+ * These are the basic rules. We'll cover all the details in the implementation
+ * below.
  */
 
 // Special "empty" symbol.

diff --git a/LL1-parser-first-follow-sets.js b/LL1-parser-first-follow-sets.js
@@ -38,7 +38,7 @@ var followSets = {};
 
 function buildFirstSets(grammar) {
   firstSets = {};
-  buildSet(firstOf, firstSets);
+  buildSet(firstOf);
 }
 
 function firstOf(symbol) {
@@ -146,7 +146,7 @@ function getRHS(production) {
 
 function buildFollowSets(grammar) {
   followSets = {};
-  buildSet(followOf, followSets);
+  buildSet(followOf);
 }
 
 function followOf(symbol) {
@@ -211,7 +211,7 @@ function followOf(symbol) {
   return follow;
 }
 
-function buildSet(builder, set) {
+function buildSet(builder) {
   for (var k in grammar) {
     builder(grammar[k][0]);
   }

diff --git a/LL1-parser-first-follow-sets.js b/LL1-parser-first-follow-sets.js
@@ -1,13 +1,13 @@
 /**
  * LL(1) parser. Building parsing table, part 1: First and Follow sets.
+ * 
+ * An LL(1)-parser is a top-down, fast predictive non-recursive parser,
+ * which uses a state-machine parsing table instead of recursive calls
+ * to productions as in a recursive descent parser.
  *
- * We described the work of a top-down LL(1) parser in:
+ * We described the work of such a parser in:
  * https://gist.github.com/DmitrySoshnikov/29f7a9425cdab69ea68f
  * 
- * LL(1)-parser is a fast predictive non-recursive parser, which uses
- * a state-machine parsing table instead of recursive calls for productions
- * as in a recursive descent parser.
- * 
  * There we used manually pre-built parsing table. In this diff we implement
  * an automatic solution for generating a parsing table, and consider the
  * first part of it: building First and Follow sets.

diff --git a/LL1-parser-first-follow-sets.js b/LL1-parser-first-follow-sets.js
@@ -4,7 +4,7 @@
  * We described the work of a top-down LL(1) parser in:
  * https://gist.github.com/DmitrySoshnikov/29f7a9425cdab69ea68f
  * 
- * LL(1)-parser is a fast predictive, non-recursive, parser, which uses
+ * LL(1)-parser is a fast predictive non-recursive parser, which uses
  * a state-machine parsing table instead of recursive calls for productions
  * as in a recursive descent parser.
  * 

diff --git a/LL1-parser-first-follow-sets.js b/LL1-parser-first-follow-sets.js
@@ -1,12 +1,16 @@
 /**
  * LL(1) parser. Building parsing table, part 1: First and Follow sets.
  *
- * We described work of a top-down LL(1) parser in:
+ * We described the work of a top-down LL(1) parser in:
  * https://gist.github.com/DmitrySoshnikov/29f7a9425cdab69ea68f
- *
- * There we used manually pre-built parsing table. In this diff we
- * implement an automatic solution for generating a parsing table, and
- * consider the first part of it: building First and Follow sets.
+ * 
+ * LL(1)-parser is a fast predictive, non-recursive, parser, which uses
+ * a state-machine parsing table instead of recursive calls for productions
+ * as in a recursive descent parser.
+ * 
+ * There we used manually pre-built parsing table. In this diff we implement
+ * an automatic solution for generating a parsing table, and consider the
+ * first part of it: building First and Follow sets.
  *
  * by Dmitry Soshnikov <[email protected]>
  * MIT Style License

diff --git a/LL1-parser-first-follow.sets.js → LL1-parser-first-follow-sets.js b/LL1-parser-first-follow.sets.js → LL1-parser-first-follow-sets.js
diff --git a/ll1-parser-first-follow.sets.js → LL1-parser-first-follow.sets.js b/ll1-parser-first-follow.sets.js → LL1-parser-first-follow.sets.js
diff --git a/ll1-parser-first-follow.sets.js b/ll1-parser-first-follow.sets.js
@@ -0,0 +1,362 @@
+/**
+ * LL(1) parser. Building parsing table, part 1: First and Follow sets.
+ *
+ * We described work of a top-down LL(1) parser in:
+ * https://gist.github.com/DmitrySoshnikov/29f7a9425cdab69ea68f
+ *
+ * There we used manually pre-built parsing table. In this diff we
+ * implement an automatic solution for generating a parsing table, and
+ * consider the first part of it: building First and Follow sets.
+ *
+ * by Dmitry Soshnikov <[email protected]>
+ * MIT Style License
+ */
+
+// Special "empty" symbol.
+var EPSILON = "ε";
+
+var firstSets = {};
+var followSets = {};
+
+/**
+ * Rules for First Sets
+ *
+ * - If X is a terminal then First(X) is just X!
+ * - If there is a Production X → ε then add ε to first(X)
+ * - If there is a Production X → Y1Y2..Yk then add first(Y1Y2..Yk) to first(X)
+ * - First(Y1Y2..Yk) is either
+ *     - First(Y1) (if First(Y1) doesn't contain ε)
+ *     - OR (if First(Y1) does contain ε) then First (Y1Y2..Yk) is everything
+ *       in First(Y1) <except for ε > as well as everything in First(Y2..Yk)
+ *     - If First(Y1) First(Y2)..First(Yk) all contain ε then add ε
+ *       to First(Y1Y2..Yk) as well.
+ */
+
+function buildFirstSets(grammar) {
+  firstSets = {};
+  buildSet(firstOf, firstSets);
+}
+
+function firstOf(symbol) {
+
+  // A set may already be built from some previous analysis
+  // of a RHS, so check whether it's already there and don't rebuild.
+  if (firstSets[symbol]) {
+    return firstSets[symbol];
+  }
+
+  // Else init and calculate.
+  var first = firstSets[symbol] = {};
+
+  // If it's a terminal, its first set is just itself.
+  if (isTerminal(symbol)) {
+    first[symbol] = true;
+    return firstSets[symbol];
+  }
+
+  var productionsForSymbol = getProductionsForSymbol(symbol);
+  for (var k in productionsForSymbol) {
+    var production = getRHS(productionsForSymbol[k]);
+
+    for (var i = 0; i < production.length; i++) {
+      var productionSymbol = production[i];
+
+      // Epsilon goes to the first set.
+      if (productionSymbol === EPSILON) {
+        first[EPSILON] = true;
+        break;
+      }
+
+      // Else, the first is a non-terminal,
+      // then first of it goes to first of our symbol
+      // (unless it's an epsilon).
+      var firstOfNonTerminal = firstOf(productionSymbol);
+
+      // If first non-terminal of the RHS production doesn't
+      // contain epsilon, then just merge its set with ours.
+      if (!firstOfNonTerminal[EPSILON]) {
+        merge(first, firstOfNonTerminal);
+        break;
+      }
+
+      // Else (we got epsilon in the first non-terminal),
+      //
+      //   - merge all except for epsilon
+      //   - eliminate this non-terminal and advance to the next symbol
+      //     (i.e. don't break this loop)
+      merge(first, firstOfNonTerminal, [EPSILON]);
+      // don't break, go to the next `productionSymbol`.
+    }
+  }
+
+  return first;
+}
+
+/**
+ * We have the following data structure for our grammars:
+ *
+ * var grammar = {
+ *   1: 'S -> F',
+ *   2: 'S -> (S + F)',
+ *   3: 'F -> a',
+ * };
+ *
+ * Given symbol `S`, the function returns `S -> F`,
+ * and `S -> (S + F)` productions.
+ */
+function getProductionsForSymbol(symbol) {
+  var productionsForSymbol = {};
+  for (var k in grammar) {
+    if (grammar[k][0] === symbol) {
+      productionsForSymbol[k] = grammar[k];
+    }
+  }
+  return productionsForSymbol;
+}
+
+/**
+ * Given production `S -> F`, returns `F`.
+ */
+function getLHS(production) {
+  return production.split('->')[0].replace(/\s+/g, '');
+}
+
+/**
+ * Given production `S -> F`, returns `F`.
+ */
+function getRHS(production) {
+  return production.split('->')[1].replace(/\s+/g, '');
+}
+
+/**
+ * Rules for Follow Sets
+ *
+ * - First put $ (the end of input marker) in Follow(S) (S is the start symbol)
+ * - If there is a production A → aBb, (where a can be a whole string)
+ *   then everything in FIRST(b) except for ε is placed in FOLLOW(B).
+ * - If there is a production A → aB, then everything in
+ *   FOLLOW(A) is in FOLLOW(B)
+ * - If there is a production A → aBb, where FIRST(b) contains ε,
+ *   then everything in FOLLOW(A) is in FOLLOW(B)
+ */
+
+function buildFollowSets(grammar) {
+  followSets = {};
+  buildSet(followOf, followSets);
+}
+
+function followOf(symbol) {
+
+  // If was already calculated from some previous run.
+  if (followSets[symbol]) {
+    return followSets[symbol];
+  }
+
+  // Else init and calculate.
+  var follow = followSets[symbol] = {};
+
+  // Start symbol always contain `$` in its follow set.
+  if (symbol === START_SYMBOL) {
+    follow['$'] = true;
+  }
+
+  // We need to analyze all productions where our
+  // symbol is used (i.e. where it appears on RHS).
+  var productionsWithSymbol = getProductionsWithSymbol(symbol);
+  for (var k in productionsWithSymbol) {
+    var production = productionsWithSymbol[k];
+    var RHS = getRHS(production);
+
+    // Get the follow symbol of our symbol.
+    var symbolIndex = RHS.indexOf(symbol);
+    var followIndex = symbolIndex + 1;
+
+    // We need to get the following symbol, which can be `$` or
+    // may contain epsilon in its first set. If it contains epsilon, then
+    // we should take the next following symbol: `A -> aBCD`: if `C` (the
+    // follow of `B`) can be epsilon, we should consider first of `D` as well
+    // as the follow of `B`.
+
+    while (true) {
+
+      if (followIndex === RHS.length) { // "$"
+        var LHS = getLHS(production);
+        if (LHS !== symbol) { // To avoid cases like: B -> aB
+          merge(follow, followOf(LHS));
+        }
+        break;
+      }
+
+      var followSymbol = RHS[followIndex];
+
+      // Follow of our symbol is anything in the first of the following symbol:
+      // followOf(symbol) is firstOf(followSymbol), except for epsilon.
+      var firstOfFollow = firstOf(followSymbol);
+
+      // If there is no epsilon, just merge.
+      if (!firstOfFollow[EPSILON]) {
+        merge(follow, firstOfFollow);
+        break;
+      }
+
+      merge(follow, firstOfFollow, [EPSILON]);
+      followIndex++;
+    }
+  }
+
+  return follow;
+}
+
+function buildSet(builder, set) {
+  for (var k in grammar) {
+    builder(grammar[k][0]);
+  }
+}
+
+/**
+ * Finds productions where a non-terminal is used. E.g., for the
+ * symbol `S` it finds production `(S + F)`, and for the symbol `F`
+ * it finds productions `F` and `(S + F)`.
+ */
+function getProductionsWithSymbol(symbol) {
+  var productionsWithSymbol = {};
+  for (var k in grammar) {
+    var production = grammar[k];
+    var RHS = getRHS(production);
+    if (RHS.indexOf(symbol) !== -1) {
+      productionsWithSymbol[k] = production;
+    }
+  }
+  return productionsWithSymbol;
+}
+
+function isTerminal(symbol) {
+  return !/[A-Z]/.test(symbol);
+}
+
+function merge(to, from, exclude) {
+  exclude || (exclude = []);
+  for (var k in from) {
+    if (exclude.indexOf(k) === -1) {
+      to[k] = from[k];
+    }
+  }
+}
+
+function printGrammar(grammar) {
+  console.log('Grammar:\n');
+  for (var k in grammar) {
+    console.log('  ', grammar[k]);
+  }
+  console.log('');
+}
+
+function printSet(name, set) {
+  console.log(name + ': \n');
+  for (var k in set) {
+    console.log('  ', k, ':', Object.keys(set[k]));
+  }
+  console.log('');
+}
+
+// Testing
+
+// --------------------------------------------------------------------------
+// Example 1 of a simple grammar, generates: a, or (a + a), etc.
+// --------------------------------------------------------------------------
+
+var grammar = {
+  1: 'S -> F',
+  2: 'S -> (S + F)',
+  3: 'F -> a',
+};
+
+var START_SYMBOL = 'S';
+
+printGrammar(grammar);
+
+buildFirstSets(grammar);
+printSet('First sets', firstSets);
+
+buildFollowSets(grammar);
+printSet('Follow sets', followSets);
+
+// Results:
+
+// Grammar:
+//
+//    S -> F
+//    S -> (S + F)
+//    F -> a
+//
+// First sets:
+//
+//    S : [ 'a', '(' ]
+//    F : [ 'a' ]
+//    a : [ 'a' ]
+//    ( : [ '(' ]
+//
+// Follow sets:
+//
+//    S : [ '$', '+' ]
+//    F : [ '$', '+', ')' ]
+
+// --------------------------------------------------------------------------
+// Example 2 of a "calculator" grammar (with removed left recursion, which
+// is replaced with a right recursion and epsilons), it generates language
+// for e.g. (a + a) * a.
+// --------------------------------------------------------------------------
+
+var grammar = {
+  1: 'E -> TX',
+  2: 'X -> +TX',
+  3: 'X -> ε',
+  4: 'T -> FY',
+  5: 'Y -> *FY',
+  6: 'Y -> ε',
+  7: 'F -> a',
+  8: 'F -> (E)',
+};
+
+var START_SYMBOL = 'E';
+
+printGrammar(grammar);
+
+buildFirstSets(grammar);
+printSet('First sets', firstSets);
+
+buildFollowSets(grammar);
+printSet('Follow sets', followSets);
+
+// Results:
+
+// Grammar:
+//
+//    E -> TX
+//    X -> +TX
+//    X -> ε
+//    T -> FY
+//    Y -> *FY
+//    Y -> ε
+//    F -> a
+//    F -> (E)
+//
+// First sets:
+//
+//    E : [ 'a', '(' ]
+//    T : [ 'a', '(' ]
+//    F : [ 'a', '(' ]
+//    a : [ 'a' ]
+//    ( : [ '(' ]
+//    X : [ '+', 'ε' ]
+//    + : [ '+' ]
+//    Y : [ '*', 'ε' ]
+//    * : [ '*' ]
+//
+// Follow sets:
+//
+//    E : [ '$', ')' ]
+//    X : [ '$', ')' ]
+//    T : [ '+', '$', ')' ]
+//    Y : [ '+', '$', ')' ]
+//    F : [ '*', '+', '$', ')' ]