pkoppstein · December 31, 2018 23:24 · Dec 31, 2018 · Dec 31, 2018 · Dec 28, 2018 · Dec 28, 2018
diff --git a/fromcsvfile.jq b/fromcsvfile.jq
@@ -1,5 +1,5 @@
 # Copyright (C) 2018 [email protected]
-# License: Creative Commons Attribution-NonCommerical-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
+# License: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
 # See http://creativecommons.org/licenses/by-nc/3.0/
 # Attribution shall include the copyright notice above.
 

diff --git a/fromcsvfile.jq b/fromcsvfile.jq
@@ -3,21 +3,22 @@
 # See http://creativecommons.org/licenses/by-nc/3.0/
 # Attribution shall include the copyright notice above.
 
-# fromcsvfile.jq version: 0.3 of 2018-12-27
+# fromcsvfile.jq version: 0.4 of 2018-12-30
 # Requires: jq with `inputs`
 # Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2
 
 # A PEG-inspired parser for reading CSV files without the need to "slurp" them.
 
 # The parser is intended to handle a wide variety of "edge cases".
+# Note that both \r\n and \r\r\n are interpreted as end-of-record.
 
 # Error-reporting showing the relevant record number and troublesome string fragment is provided on STDERR.
 
 # Main jq filters: 
 #  fromcsv/0     # JSON string input
 #  fromcsvfile/0 # read from STDIN
 
-# Example usage: jq -nRc 'include "fromcsvfile"; fromcsfile' MYFILE.csv
+# Example usage: jq -nRc 'include "fromcsvfile"; fromcsvfile' MYFILE.csv
 
 ######### PEG machinery
 
@@ -40,7 +41,7 @@ def at_eof: .remainder | (.=="" or . == "\n" or . == "\r\n");
 
 # end-of-record if end-of-record characters or else at eof
 def EOR:
-  consume("\n|\r\n")
+  consume("(?:\n|\r\n|\r\r\n)")  # grouped: consume prepends "^", which must anchor every alternative, not just the first
   // (if .remainder=="" then . else empty end) ;
 
 # Internal double-quotes must be doubled; 
@@ -57,7 +58,9 @@ def unquoted_field:
   ;  
 
 def quoted_field_continue:
-  .remainder += ("\n" + input)
+  def trim: sub("(\r\r|\r)$";"");
+  .remainder += ("\n" + input | trim)
+  # The next input line, joined by "\n" and with trailing CR(s) trimmed, is now in .remainder: retry the quoted field, else read another line.
   | (field_content_quoted | consume("\" *"))
     // quoted_field_continue
     ;
@@ -83,18 +86,20 @@ def fields:
 # i.e. field fields
 def record: field | (fields // .) ;
 
-## Loop for processing all the records
-def _fromcsv:
-  if at_eof then empty
-  else (EOR // .)
-  | record
-  | select(.result)
-  | .result,
-    (.result = null | .record+=1 |  _fromcsv)
-  end ;
-
 def fromcsv:
+  ## Loop for processing all the records
+  def _fromcsv:
+    if at_eof then empty
+    else (EOR // .)
+    | record
+    | select(.result)
+    | .result,
+      (.result = null | .record+=1 |  _fromcsv)
+    end ;
+
   {record:0, remainder: .}
   | _fromcsv ;
 
-def fromcsvfile: inputs | fromcsv;
+def fromcsvfile:
+  def trim: sub("(\r\r|\r)$";"");
+  inputs | trim | fromcsv;
diff --git a/fromcsvfile.jq b/fromcsvfile.jq
@@ -3,7 +3,7 @@
 # See http://creativecommons.org/licenses/by-nc/3.0/
 # Attribution shall include the copyright notice above.
 
-# fromcsv.jq version: 0.3 of 2018-12-27
+# fromcsvfile.jq version: 0.3 of 2018-12-27
 # Requires: jq with `inputs`
 # Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2
 

diff --git a/fromcsvfile.jq b/fromcsvfile.jq
@@ -1,6 +1,11 @@
-# Author: [email protected]
-# Version: 0.2 of 2018-12-26
+# Copyright (C) 2018 [email protected]
+# License: Creative Commons Attribution-NonCommerical-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
+# See http://creativecommons.org/licenses/by-nc/3.0/
+# Attribution shall include the copyright notice above.
+
+# fromcsv.jq version: 0.3 of 2018-12-27
 # Requires: jq with `inputs`
+# Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2
 
 # A PEG-inspired parser for reading CSV files without the need to "slurp" them.
 
@@ -12,7 +17,7 @@
 #  fromcsv/0     # JSON string input
 #  fromcsvfile/0 # read from STDIN
 
-# Example usage: jq -nR 'include "fromcsvfile"; fromcsfile' MYFILE.csv
+# Example usage: jq -nRc 'include "fromcsvfile"; fromcsfile' MYFILE.csv
 
 ######### PEG machinery
 
@@ -51,11 +56,17 @@ def unquoted_field:
   // parse("[^\",\n]*") # possibly empty
   ;  
 
+def quoted_field_continue:
+  .remainder += ("\n" + input)
+  | (field_content_quoted | consume("\" *"))
+    // quoted_field_continue
+    ;
+
 # Ignore blanks before and after the enclosing quotation marks # EXTENSION
 def quoted_field:
-  consume(" *\"") 
-  | field_content_quoted
-  | consume("\" *") ;
+  consume(" *\"")
+  | ( (field_content_quoted | consume("\" *"))
+      // quoted_field_continue ) ;
 
 def field: 
   quoted_field
@@ -86,15 +97,4 @@ def fromcsv:
   {record:0, remainder: .}
   | _fromcsv ;
 
-def fromcsvfile:
-  def finish:
-    def checkline: test( "^((, *\"(\"\"|[^\"]*)\" *)|,[^\",\n]*|( *\"(\"\"|[^\"]*)\" *)|[^\",\n]+)*$" );
-    if checkline then .
-    else (input // "") as $in
-    | if $in != "" then (. + "\n" + $in | finish)
-      else .
-      end
-    end;
-
-  inputs as $in 
-  | ($in|finish) | fromcsv;
+def fromcsvfile: inputs | fromcsv;
diff --git a/fromcsvfile.jq b/fromcsvfile.jq
@@ -1,5 +1,5 @@
 # Author: [email protected]
-# Version: 0.1 of 2018-12-23
+# Version: 0.2 of 2018-12-26
 # Requires: jq with `inputs`
 
 # A PEG-inspired parser for reading CSV files without the need to "slurp" them.
@@ -26,7 +26,7 @@ def parse($re):
   # on failure, match yields empty
   (.remainder | match("^" + $re)) as $match
   | .remainder |= .[$match.length :]
-  |.result += [$match.string]  ;
+  |.result += [$match.string | gsub("\"\"";"\"")] ;
 
 # Utility function as there is no EOF marker
 def at_eof: .remainder | (.=="" or . == "\n" or . == "\r\n");
@@ -88,7 +88,7 @@ def fromcsv:
 
 def fromcsvfile:
   def finish:
-    def checkline: match( "^(( *\"(\"\"|[^\"]*)\" *)|[^\",\n]*|[ ,]+)*$") // false;
+    def checkline: test( "^((, *\"(\"\"|[^\"]*)\" *)|,[^\",\n]*|( *\"(\"\"|[^\"]*)\" *)|[^\",\n]+)*$" );
     if checkline then .
     else (input // "") as $in
     | if $in != "" then (. + "\n" + $in | finish)

diff --git a/fromcsvfile.jq b/fromcsvfile.jq
@@ -0,0 +1,100 @@
+# Author: [email protected]
+# Version: 0.1 of 2018-12-23
+# Requires: jq with `inputs`
+
+# A PEG-inspired parser for reading CSV files without the need to "slurp" them.
+
+# The parser is intended to handle a wide variety of "edge cases".
+
+# Error-reporting showing the relevant record number and troublesome string fragment is provided on STDERR.
+
+# Main jq filters: 
+#  fromcsv/0     # JSON string input
+#  fromcsvfile/0 # read from STDIN
+
+# Example usage: jq -nR 'include "fromcsvfile"; fromcsfile' MYFILE.csv
+
+######### PEG machinery
+
+# consume a regular expression rooted at the start of .remainder
+def consume($re):
+  # on failure, match yields empty
+  (.remainder | match("^" + $re)) as $match
+  | .remainder |= .[$match.length :] ;
+
+def parse($re):
+  # on failure, match yields empty
+  (.remainder | match("^" + $re)) as $match
+  | .remainder |= .[$match.length :]
+  |.result += [$match.string]  ;
+
+# Utility function as there is no EOF marker
+def at_eof: .remainder | (.=="" or . == "\n" or . == "\r\n");
+
+############ Grammar for CSV
+
+# end-of-record if end-of-record characters or else at eof
+def EOR:
+  consume("\n|\r\n")
+  // (if .remainder=="" then . else empty end) ;
+
+# Internal double-quotes must be doubled; 
+# CRs and LFs are allowed, as are empty quoted fields.
+def field_content_quoted:
+  parse("((\"\")|([^\"]))*") ;
+
+# # EXTENSION: When reading an unquoted field, we ought to recognize CRLF as end-of-record, 
+# i.e. only accept LF if it is NOT preceded by CR.
+# Reject unescaped double-quote
+def unquoted_field:
+  (parse("[^\",\r\n]+") | (if .remainder|test("^\r\n") then consume("\r") else empty end))
+  // parse("[^\",\n]*") # possibly empty
+  ;  
+
+# Ignore blanks before and after the enclosing quotation marks # EXTENSION
+def quoted_field:
+  consume(" *\"") 
+  | field_content_quoted
+  | consume("\" *") ;
+
+def field: 
+  quoted_field
+  // unquoted_field
+  // if at_eof then empty else stderr end ;
+
+# ("," field)+
+def fields:
+  consume(",")
+  | field
+  | (fields // .) ;
+
+# field ("," field)*
+# i.e. field fields
+def record: field | (fields // .) ;
+
+## Loop for processing all the records
+def _fromcsv:
+  if at_eof then empty
+  else (EOR // .)
+  | record
+  | select(.result)
+  | .result,
+    (.result = null | .record+=1 |  _fromcsv)
+  end ;
+
+def fromcsv:
+  {record:0, remainder: .}
+  | _fromcsv ;
+
+def fromcsvfile:
+  def finish:
+    def checkline: match( "^(( *\"(\"\"|[^\"]*)\" *)|[^\",\n]*|[ ,]+)*$") // false;
+    if checkline then .
+    else (input // "") as $in
+    | if $in != "" then (. + "\n" + $in | finish)
+      else .
+      end
+    end;
+
+  inputs as $in 
+  | ($in|finish) | fromcsv;
No results found