# Copyright (C) 2018 peak@princeton.edu
# License: Creative Commons Attribution-NonCommerical-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
# See http://creativecommons.org/licenses/by-nc/3.0/
# Attribution shall include the copyright notice above.

# fromcsv.jq version: 0.4 of 2018-12-30
# Requires: jq with `inputs`
# Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2

# A PEG-inspired parser for reading CSV files without the need to "slurp" them.
# The parser is intended to handle a wide variety of "edge cases".
# Note that both \r\n and \r\r\n are interpreted as end-of-record.
# Error-reporting showing the relevant record number and troublesome string
# fragment is provided on STDERR.

# Main jq filters:
#   fromcsv/0      # JSON string input
#   fromcsvfile/0  # read from STDIN

# Example usage: jq -nRc 'include "fromcsvfile"; fromcsvfile' MYFILE.csv

# Parser state
# ------------
# Every grammar filter below maps a state object to a state object:
#   .remainder  the text that has not been parsed yet (string)
#   .result     the fields collected so far for the current record
#               (array; null/absent when no field has been collected)
#   .record     the number of records completed so far within the current
#               call to fromcsv (fromcsvfile calls fromcsv once per input
#               line, so the counter starts at 0 for each of those calls)
# A filter "fails" by producing no output (empty), which lets the
# alternative operator `//` try the next choice.

######### PEG machinery

# Consume a regular expression rooted at the start of .remainder.
# Emits the state with the matched prefix removed; emits nothing if $re
# does not match at the start of .remainder.
def consume($re):
  # on failure, match yields empty
  (.remainder | match("^" + $re)) as $match
  | .remainder |= .[$match.length :] ;

# Like consume/1, but also appends the matched text to .result as one field.
# Doubled double-quotes in the matched text are collapsed to a single one.
def parse($re):
  # on failure, match yields empty
  (.remainder | match("^" + $re)) as $match
  | .remainder |= .[$match.length :]
  | .result += [$match.string | gsub("\"\""; "\"")] ;

# Utility function as there is no EOF marker: true when nothing, or only a
# single trailing line terminator, is left to parse.
def at_eof:
  .remainder | (. == "" or . == "\n" or . == "\r\n") ;

############ Grammar for CSV

# End-of-record: succeeds on end-of-record characters, or else at eof.
def EOR:
  consume("\n|\r\n|\r\r\n")
  // (if .remainder == "" then . else empty end) ;

# Internal double-quotes must be doubled;
# CRs and LFs are allowed, as are empty quoted fields.
def field_content_quoted:
  parse("((\"\")|([^\"]))*") ;

# An unquoted, possibly empty, field.
#
# EXTENSION: When reading an unquoted field, we ought to recognize CRLF as
# end-of-record, i.e. only accept LF if it is NOT preceded by CR.
#
# An unescaped double-quote is rejected: if the field stops at a `"`, this
# filter fails so that `field` can report the problem instead of silently
# accepting a truncated (possibly empty) field without making progress.
def unquoted_field:
  # Field immediately followed by CRLF: drop the CR and leave the LF for EOR.
  # The `*` (rather than `+`) lets an EMPTY field before CRLF be recognised
  # too, so that it is not read as a field containing a lone CR.
  ( parse("[^\",\r\n]*")
    | select(.remainder | test("^\r\n"))
    | consume("\r") )
  # Otherwise take everything up to the next quote, comma or LF
  # (possibly nothing), and fail if what stopped us was a quote.
  // ( parse("[^\",\n]*")
       | select(.remainder | startswith("\"") | not) ) ;

# The closing quote was not found in .remainder: append the next input line
# (re-inserting the LF that line-wise reading removed, and dropping a
# trailing CR or CRCR) and try again.
# `input` raises an error when no input is left; inside `field` that error
# is absorbed by `//`, so an unterminated quoted field ends up being
# reported by `field`.
def quoted_field_continue:
  def trim: sub("(\r\r|\r)$"; "");
  .remainder += (("\n" + input) | trim)
  # | (.record|debug) as $debug
  | (field_content_quoted | consume("\" *")) // quoted_field_continue ;

# A quoted field.
# EXTENSION: blanks before and after the enclosing quotation marks are ignored.
def quoted_field:
  consume(" *\"")
  | ( (field_content_quoted | consume("\" *"))
      // quoted_field_continue ) ;

# One field: quoted if possible, else unquoted.
# If neither applies and we are not at eof, the current state (record
# number and unparsed fragment) is written to STDERR and passed through
# UNCHANGED. Passing it through (rather than failing) is deliberate: the
# record loop in fromcsv then stops at the offending fragment instead of
# retrying it.
def field:
  quoted_field
  // unquoted_field
  // (if at_eof then empty else stderr end) ;

# ("," field)+
def fields:
  consume(",") | field | (fields // .) ;

# field ("," field)*
# i.e. field fields
def record:
  field | (fields // .) ;

# Input: a JSON string holding CSV text.
# Output: a stream of arrays of strings, one array per record.
# If a field cannot be parsed, the state is reported on STDERR (see `field`),
# any fields already read for that record are still emitted, and parsing of
# this string stops.
def fromcsv:
  ## Loop for processing all the records
  def _fromcsv:
    if at_eof then empty
    else
      (EOR // .)          # skip the terminator left over from the previous record
      | record
      | select(.result)   # no field was read: stop
      | .result,
        (.result = null | .record += 1 | _fromcsv)
    end ;

  {record: 0, remainder: .}
  | _fromcsv ;

# Read CSV from the raw input lines (invoke jq with -n and -R):
# each line has a trailing CR or CRCR removed and is handed to fromcsv;
# quoted fields that span several lines pull the following lines via `input`.
def fromcsvfile:
  def trim: sub("(\r\r|\r)$"; "");
  inputs | trim | fromcsv ;