olebedev · May 22, 2023 05:57 · Dec 15, 2022 · Dec 15, 2022
diff --git a/parse-yarn-lock.nix b/parse-yarn-lock.nix
@@ -0,0 +1,129 @@
+# Parse a yarn.lock file using pure Nix
+# yarn.lock v1 files are basically YAML with support for having multiple keys for a single value in a map and without array support.
+# Inspired by https://github.com/yarnpkg/yarn/blob/158d96dce95313d9a00218302631cd263877d164/src/lockfile/parse.js
+
+with builtins;
+let
+  # Pair every element with its position: [ a b ] -> [ { i = 0; e = a; } { i = 1; e = b; } ]
+  enumerate = xs: genList (pos: { i = pos; e = elemAt xs pos; }) (length xs);
+  mkToken = kind: payload: { type = kind; value = payload; }; # token record: { type, value }
+  parseLockfile = str: let
+    # The tokenizer regex, assembled from one alternative per token type.
+    # Capture groups, in order: 0 newline, 1 comment, 2 indent, 3 string (4 is its inner repetition group), 5 number, 6 boolean, 7 colon, 8 comma, 9 symbol.
+    newlineRe = "(\r?\n)";
+    commentRe = "#([^\n]+)";
+    # Used for any kind of whitespace and also indentation in an object
+    indentRe = "( +)";
+    # Note that this contains a group for repetition, so the next group is offset.
+    # This is a regex that matches JSON strings, which is the format used.
+    stringRe = "(\"([^\"\\\\]|\\\\[\\\"\\\\/bfnrt]|\\\\u[0-9a-f]{4})+\")";
+    numberRe = "([0-9]+)";
+    booleanRe = "(true|false)";
+    colonRe = "(:)";
+    commaRe = "(,)";
+    # A symbol is a string without quotes. The tail is `*`, not `+`, so one-character symbols (e.g. a package named "q") tokenize too.
+    symbolRe = "([a-zA-Z\\/.-][^: \n\r,]*)";
+    tokenizeRe = "${newlineRe}|${commentRe}|${indentRe}|${stringRe}|${numberRe}|${booleanRe}|${colonRe}|${commaRe}|${symbolRe}";
+
+    tokenize = text: split tokenizeRe text; # yields unmatched strings interleaved with capture-group lists
+    # Map one `split` piece to a token. Lists hold the capture groups: 0 newline, 1 comment, 2 indent, 3 string (4 is its inner group), 5 number, 6 boolean, 7 colon, 8 comma, 9 symbol.
+    convert = token: if isString token then abort "Invalid token ${token}" # text that no token regex matched
+    else if (elemAt token 0) != null then mkToken "newline" null
+    else if (elemAt token 1) != null then
+      mkToken "comment" (elemAt token 1)
+    else if (elemAt token 2) != null then
+      mkToken "indent" (stringLength (elemAt token 2))
+    else if (elemAt token 3) != null then
+      mkToken "string" (fromJSON (elemAt token 3))
+    else if (elemAt token 5) != null then
+      mkToken "number" (fromJSON (elemAt token 5))
+    else if (elemAt token 6) != null then
+      # Parenthesised on purpose: application binds tighter than `==`, so without the parentheses the token record itself is compared to "true".
+      mkToken "boolean" ((elemAt token 6) == "true")
+    else if (elemAt token 7) != null then mkToken "colon" null
+    else if (elemAt token 8) != null then mkToken "comma" null
+    # Bare symbols are reported as "string" tokens, taken verbatim
+    else if (elemAt token 9) != null then
+      mkToken "string" (elemAt token 9)
+    else abort "unreachable";
+    unprocessedTokens = map convert (filter (piece: piece != "") (tokenize str)); # empty strings between adjacent matches are dropped first
+
+    # Drop comments (after checking the lockfile version header) and every run of spaces that does not directly follow a newline
+    tokens = map (entry: entry.e) (filter ({ i, e }:
+      if e.type == "comment" then
+        # A "yarn lockfile vN" header with N other than 1 is rejected; comments themselves never pass the filter
+        if (match "[[:space:]]*yarn lockfile v[0-9]+[[:space:]]*" e.value) != null && (match "[[:space:]]*yarn lockfile v1[[:space:]]*" e.value) == null
+        then abort "Unsupported lockfile: ${e.value}"
+        else false
+      else
+        e.type != "indent" || (elemAt unprocessedTokens (i - 1)).type == "newline") (enumerate unprocessedTokens));
+
+    get = index: if index >= length tokens then { type = "eof"; } else elemAt tokens index; # reading past the end yields a synthetic eof token
+
+    # Collect the key at `index` plus every further key reached through a "comma, string" pair
+    takeKeys = index: let more = (get (index + 1)).type == "comma" && (get (index + 2)).type == "string"; in [(get index).value] ++ (if more then takeKeys (index + 2) else []);
+
+    # Parse one object (map) whose entries are indented by exactly `indent` spaces, starting at token index `start`.
+    # Returns `value` (the attrset built from the entries) and `index` (the key of the last iteration state, where the caller resumes).
+    parse = start: indent:
+    let
+      # genericClosure drives the iteration over the tokens, because doing it with
+      # plain recursion would be too slow for the Nix language.
+      # We can't use fold because we need to recurse into nested maps and skip over
+      # the tokens that were consumed.
+      result = genericClosure {
+        startSet = [ { key = start; values = []; } ]; # key = token index to inspect; values = name/value pairs produced by that step
+        operator = { key, ... }:
+        let
+          token = get key;
+          nextToken = get (key + 1);
+          done = []; # no successor state: iteration stops here
+          next = [{ key = key + 1; values = []; }]; # advance one token without producing entries
+        in
+          if token.type == "eof" then done
+          else if token.type == "newline" then
+            if indent == 0 then
+              next
+            else if nextToken.type != "indent" || nextToken.value != indent then
+              done # the following line is not indented at this object's level, so the object ends
+            else [{ key = key + 1; values = []; }] # same state as `next`
+          else if token.type == "indent" then
+            if token.value == indent then next else done
+          # A string (quoted or bare symbol) starts a key/value entry
+          else if token.type == "string" then
+            let
+              keys = takeKeys key;
+              skip = 1 + ((length keys) - 1) * 2; # tokens taken by the keys and the commas between them
+              nextToken = get (key + skip); # shadows the outer nextToken: first token after the key list
+            in
+              # If the key list is followed by a colon then the value is a nested object
+              if nextToken.type == "colon" then
+                let
+                  # Parse the nested object, which is expected two spaces deeper
+                  res = parse (key + skip + 1) (indent + 2);
+                  inherit (res) value index;
+                in
+                  [{
+                    key = index; # resume at the index the nested parse reported
+                    values = map (name: { inherit name value; }) keys; # every key gets the same nested value
+                  }]
+              # Otherwise the only valid values are scalars
+              else if (nextToken.type == "string" || nextToken.type == "number" || nextToken.type == "boolean") then
+                [{
+                  key = (key + skip + 1);
+                  values = map (name: { inherit name; value = nextToken.value; }) keys;
+                }]
+              else abort "Invalid token ${nextToken.type}"
+          else abort "Invalid token ${token.type}"
+        ;
+      };
+      results = concatLists (map (el: el.values) result); # flatten the per-step entries into one name/value list
+    in
+      {
+        value = listToAttrs results;
+        index = (elemAt result ((length result) - 1)).key; # key of the last element of result (NOTE(review): relies on genericClosure's result order — confirm)
+      };
+  in
+    (parse 0 0).value;
+in
+  parseLockfile (readFile ./yarn.lock)