Skip to content

Instantly share code, notes, and snippets.

@pkoppstein
Last active December 31, 2018 23:24
Show Gist options
  • Select an option

  • Save pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2 to your computer and use it in GitHub Desktop.

Select an option

Save pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2 to your computer and use it in GitHub Desktop.

Revisions

  1. pkoppstein revised this gist Dec 31, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion fromcsvfile.jq
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Copyright (C) 2018 [email protected]
    # License: Creative Commons Attribution-NonCommerical-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
    # License: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
    # See http://creativecommons.org/licenses/by-nc/3.0/
    # Attribution shall include the copyright notice above.

  2. pkoppstein revised this gist Dec 31, 2018. 1 changed file with 20 additions and 15 deletions.
    35 changes: 20 additions & 15 deletions fromcsvfile.jq
    Original file line number Diff line number Diff line change
    @@ -3,21 +3,22 @@
    # See http://creativecommons.org/licenses/by-nc/3.0/
    # Attribution shall include the copyright notice above.

    # fromcsvfile.jq version: 0.3 of 2018-12-27
    # fromcsv.jq version: 0.4 of 2018-12-30
    # Requires: jq with `inputs`
    # Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2

    # A PEG-inspired parser for reading CSV files without the need to "slurp" them.

    # The parser is intended to handle a wide variety of "edge cases".
    # Note that both \r\n and \r\r\n are interpreted as end-of-record.

    # Error-reporting showing the relevant record number and troublesome string fragment is provided on STDERR.

    # Main jq filters:
    # fromcsv/0 # JSON string input
    # fromcsvfile/0 # read from STDIN

    # Example usage: jq -nRc 'include "fromcsvfile"; fromcsfile' MYFILE.csv
    # Example usage: jq -nRc 'include "fromcsvfile"; fromcsvfile' MYFILE.csv

    ######### PEG machinery

    @@ -40,7 +41,7 @@ def at_eof: .remainder | (.=="" or . == "\n" or . == "\r\n");

    # end-of-record if end-of-record characters or else at eof
    def EOR:
    consume("\n|\r\n")
    consume("\n|\r\n|\r\r\n")
    // (if .remainder=="" then . else empty end) ;

    # Internal double-quotes must be doubled;
    @@ -57,7 +58,9 @@ def unquoted_field:
    ;

    def quoted_field_continue:
    .remainder += ("\n" + input)
    def trim: sub("(\r\r|\r)$";"");
    .remainder += ("\n" + input | trim)
    # | (.record|debug) as $debug
    | (field_content_quoted | consume("\" *"))
    // quoted_field_continue
    ;
    @@ -83,18 +86,20 @@ def fields:
    # i.e. field fields
    def record: field | (fields // .) ;

    ## Loop for processing all the records
    def _fromcsv:
    if at_eof then empty
    else (EOR // .)
    | record
    | select(.result)
    | .result,
    (.result = null | .record+=1 | _fromcsv)
    end ;

    def fromcsv:
    ## Loop for processing all the records
    def _fromcsv:
    if at_eof then empty
    else (EOR // .)
    | record
    | select(.result)
    | .result,
    (.result = null | .record+=1 | _fromcsv)
    end ;

    {record:0, remainder: .}
    | _fromcsv ;

    def fromcsvfile: inputs | fromcsv;
    def fromcsvfile:
    def trim: sub("(\r\r|\r)$";"");
    inputs | trim | fromcsv;
  3. pkoppstein revised this gist Dec 28, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion fromcsvfile.jq
    Original file line number Diff line number Diff line change
    @@ -3,7 +3,7 @@
    # See http://creativecommons.org/licenses/by-nc/3.0/
    # Attribution shall include the copyright notice above.

    # fromcsv.jq version: 0.3 of 2018-12-27
    # fromcsvfile.jq version: 0.3 of 2018-12-27
    # Requires: jq with `inputs`
    # Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2

  4. pkoppstein revised this gist Dec 28, 2018. 1 changed file with 18 additions and 18 deletions.
    36 changes: 18 additions & 18 deletions fromcsvfile.jq
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,11 @@
    # Author: [email protected]
    # Version: 0.2 of 2018-12-26
    # Copyright (C) 2018 [email protected]
    # License: Creative Commons Attribution-NonCommerical-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
    # See http://creativecommons.org/licenses/by-nc/3.0/
    # Attribution shall include the copyright notice above.

    # fromcsv.jq version: 0.3 of 2018-12-27
    # Requires: jq with `inputs`
    # Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2

    # A PEG-inspired parser for reading CSV files without the need to "slurp" them.

    @@ -12,7 +17,7 @@
    # fromcsv/0 # JSON string input
    # fromcsvfile/0 # read from STDIN

    # Example usage: jq -nR 'include "fromcsvfile"; fromcsfile' MYFILE.csv
    # Example usage: jq -nRc 'include "fromcsvfile"; fromcsfile' MYFILE.csv

    ######### PEG machinery

    @@ -51,11 +56,17 @@ def unquoted_field:
    // parse("[^\",\n]*") # possibly empty
    ;

    def quoted_field_continue:
    .remainder += ("\n" + input)
    | (field_content_quoted | consume("\" *"))
    // quoted_field_continue
    ;

    # Ignore blanks before and after the enclosing quotation marks # EXTENSION
    def quoted_field:
    consume(" *\"")
    | field_content_quoted
    | consume("\" *") ;
    consume(" *\"")
    | ( (field_content_quoted | consume("\" *"))
    // quoted_field_continue ) ;

    def field:
    quoted_field
    @@ -86,15 +97,4 @@ def fromcsv:
    {record:0, remainder: .}
    | _fromcsv ;

    def fromcsvfile:
    def finish:
    def checkline: test( "^((, *\"(\"\"|[^\"]*)\" *)|,[^\",\n]*|( *\"(\"\"|[^\"]*)\" *)|[^\",\n]+)*$" );
    if checkline then .
    else (input // "") as $in
    | if $in != "" then (. + "\n" + $in | finish)
    else .
    end
    end;

    inputs as $in
    | ($in|finish) | fromcsv;
    def fromcsvfile: inputs | fromcsv;
  5. pkoppstein revised this gist Dec 26, 2018. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions fromcsvfile.jq
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Author: [email protected]
    # Version: 0.1 of 2018-12-23
    # Version: 0.2 of 2018-12-26
    # Requires: jq with `inputs`

    # A PEG-inspired parser for reading CSV files without the need to "slurp" them.
    @@ -26,7 +26,7 @@ def parse($re):
    # on failure, match yields empty
    (.remainder | match("^" + $re)) as $match
    | .remainder |= .[$match.length :]
    |.result += [$match.string] ;
    |.result += [$match.string | gsub("\"\"";"\"")] ;

    # Utility function as there is no EOF marker
    def at_eof: .remainder | (.=="" or . == "\n" or . == "\r\n");
    @@ -88,7 +88,7 @@ def fromcsv:

    def fromcsvfile:
    def finish:
    def checkline: match( "^(( *\"(\"\"|[^\"]*)\" *)|[^\",\n]*|[ ,]+)*$") // false;
    def checkline: test( "^((, *\"(\"\"|[^\"]*)\" *)|,[^\",\n]*|( *\"(\"\"|[^\"]*)\" *)|[^\",\n]+)*$" );
    if checkline then .
    else (input // "") as $in
    | if $in != "" then (. + "\n" + $in | finish)
  6. pkoppstein created this gist Dec 24, 2018.
    100 changes: 100 additions & 0 deletions fromcsvfile.jq
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,100 @@
    # Author: [email protected]
    # Version: 0.1 of 2018-12-23
    # Requires: jq with `inputs`

    # A PEG-inspired parser for reading CSV files without the need to "slurp" them.

    # The parser is intended to handle a wide variety of "edge cases".

    # Error-reporting showing the relevant record number and troublesome string fragment is provided on STDERR.

    # Main jq filters:
    # fromcsv/0 # JSON string input
    # fromcsvfile/0 # read from STDIN

    # Example usage: jq -nR 'include "fromcsvfile"; fromcsvfile' MYFILE.csv

    ######### PEG machinery

    # consume($re): remove a match of the regular expression $re from the
    # front of .remainder.
    #   Input:  a parser state object whose .remainder is a string.
    #   Output: the same state with the matched prefix dropped from .remainder;
    #           emits nothing (empty) when $re does not match at the start.
    # $re is wrapped in a non-capturing group so that the "^" anchor governs
    # every top-level alternative.  Without the group, "^" + "\n|\r\n" reads as
    # (^\n)|(\r\n), which can match a "\r\n" located anywhere in .remainder and
    # then wrongly strips that many characters from the front.
    def consume($re):
      # on failure, match yields empty
      (.remainder | match("^(?:" + $re + ")")) as $match
      | .remainder |= .[$match.length :] ;

    # parse($re): remove a match of the regular expression $re from the front
    # of .remainder and append the matched text to .result.
    #   Input:  a parser state object whose .remainder is a string.
    #   Output: the state with the prefix removed and one more entry in .result;
    #           emits nothing (empty) when $re does not match at the start.
    # $re is wrapped in a non-capturing group so that "^" applies to every
    # top-level alternative of $re, not just the first one.
    # Each doubled double-quote in the matched text is collapsed to a single
    # double-quote before it is stored.
    def parse($re):
      # on failure, match yields empty
      (.remainder | match("^(?:" + $re + ")")) as $match
      | .remainder |= .[$match.length :]
      | .result += [$match.string | gsub("\"\""; "\"")] ;

    # There is no EOF marker in the parser state, so "end of input" is
    # recognised by what is left in .remainder: true when it is exactly
    # "", "\n" or "\r\n", false otherwise.
    def at_eof:
      .remainder as $rest
      | any("", "\n", "\r\n"; . == $rest) ;

    ############ Grammar for CSV

    # End of record: succeeds by consuming end-of-record characters, or,
    # failing that, passes the state through unchanged when .remainder is
    # the empty string.  In every other case nothing is emitted.
    def EOR:
      def exhausted: select(.remainder == "") ;
      consume("\n|\r\n")
      // exhausted ;

    # Text between the enclosing quotation marks of a quoted field:
    # any run (possibly empty) of doubled double-quotes and characters other
    # than a double-quote, so CR and LF are accepted inside the field.
    def field_content_quoted:
      "((\"\")|([^\"]))*" as $quoted_body
      | parse($quoted_body) ;

    # EXTENSION: an unquoted field recognises CRLF as end-of-record.
    # First choice: a non-empty run free of double-quote, comma, CR and LF
    # that is immediately followed by CRLF; the CR is consumed and the LF is
    # left in .remainder.
    # Fallback: a possibly empty run free of double-quote, comma and LF.
    # An unescaped double-quote is never accepted as part of the field.
    def unquoted_field:
      def before_crlf: select(.remainder | test("^\r\n")) ;
      (parse("[^\",\r\n]+") | before_crlf | consume("\r"))
      // parse("[^\",\n]*") # possibly empty
      ;

    # EXTENSION: blanks before the opening quotation mark and after the
    # closing one are consumed and do not become part of the field.
    def quoted_field:
      " *\"" as $open_quote
      | "\" *" as $close_quote
      | consume($open_quote)
      | field_content_quoted
      | consume($close_quote) ;

    # A field is a quoted field or, failing that, an unquoted field.
    # If neither alternative produces a state, then unless at_eof holds the
    # current state is piped through jq's `stderr` builtin; at eof nothing
    # is emitted.
    def field:
      def report_unless_eof: select(at_eof | not) | stderr ;
      quoted_field
      // unquoted_field
      // report_unless_eof ;

    # Grammar rule: ("," field)+
    # One comma-prefixed field, followed by as many further ones as match.
    def fields:
      # keep going while more comma-prefixed fields match, else stop here
      def rest: fields // . ;
      consume(",") | field | rest ;

    # Grammar rule: field ("," field)*   -- that is, `field` then optional `fields`.
    def record:
      field
      | . as $after_first
      | (fields // $after_first) ;

    ## Record loop: starting from a parser state, emit the .result array of
    ## each record in turn.  Stops when at_eof holds or when a record leaves
    ## .result unset.
    def _fromcsv:
      # clear the collected fields and advance the record counter
      def next_state: .result = null | .record += 1 ;
      select(at_eof | not)
      | (EOR // .)
      | record
      | select(.result)
      | (.result, (next_state | _fromcsv)) ;

    # fromcsv: the input is the CSV text as a JSON string; it becomes the
    # initial parser state (record counter 0, whole text in .remainder),
    # which is then handed to _fromcsv.
    def fromcsv:
      . as $text
      | {record: 0, remainder: $text}
      | _fromcsv ;

    # fromcsvfile: apply fromcsv to each remaining input, first gluing
    # together inputs that belong to one record.
    # An input that fails `checkline` is joined, with "\n", to the following
    # input, repeatedly, until the accumulated text passes `checkline` or no
    # further input is available.
    def fromcsvfile:
      def finish:
        def checkline: match( "^(( *\"(\"\"|[^\"]*)\" *)|[^\",\n]*|[ ,]+)*$") // false;
        if checkline then .
        # null, not "", marks "no further input": an empty next line is real
        # content (a blank line inside a quoted field) and must still be
        # appended rather than ending the continuation.
        else (input // null) as $in
        | if $in == null then .
          else (. + "\n" + $in | finish)
          end
        end;

      inputs as $in
      | ($in|finish) | fromcsv;