
@bennadel
Created March 3, 2017 12:49

data.ndjson
    {"id":1,"name":"O Brother, Where Art Thou?"}
    {"id":2,"name":"Home for the Holidays"}
    {"id":3,"name":"The Firm"}
    {"id":4,"name":"Broadcast News"}
    {"id":5,"name":"Raising Arizona"}
test.js
// Require the core node modules.
var chalk = require( "chalk" );
var fileSystem = require( "fs" );
var ndjson = require( "ndjson" );


// ----------------------------------------------------------------------------------- //
// ----------------------------------------------------------------------------------- //


// Imagine that we are performing some sort of data migration and we have to move data
// from one database to flat files; then transport those flat files elsewhere; then,
// import those flat files into a different database.
var records = [
    { id: 1, name: "O Brother, Where Art Thou?" },
    { id: 2, name: "Home for the Holidays" },
    { id: 3, name: "The Firm" },
    { id: 4, name: "Broadcast News" },
    { id: 5, name: "Raising Arizona" }
    // .... hundreds of thousands of records ....
];

// Traditionally, we might store ONE JSON document PER FILE. However, this has some
// serious implications once we move out of the local development environment and into
// production. As the JSON documents grow in size, we run the risk of running out of
// memory (during the serialization and parsing process). To get around this, we can
// use a slightly different storage format in which our data file is not ONE JSON
// document PER FILE, but rather ONE JSON document PER LINE. This is known as "ndjson"
// or "Newline-Delimited JSON". To use this format, we're going to create an ndjson
// Transform stream (aka "through" stream) that takes each JavaScript object and
// writes it as a newline-delimited String to the output stream (which will be a
// file-output stream in our case).
// --
// NOTE: We're using .ndjson - NOT .json - for this storage format.
var transformStream = ndjson.stringify();

// Pipe the ndjson serialized output to the file-system.
var outputStream = transformStream.pipe( fileSystem.createWriteStream( __dirname + "/data.ndjson" ) );

// Iterate over the records and write EACH ONE to the TRANSFORM stream individually.
// Each one of these records will become a line in the output file.
records.forEach(
    function iterator( record ) {

        transformStream.write( record );

    }
);

// Once we've written each record in the record-set, we have to end the stream so that
// the TRANSFORM stream knows to flush and close the file output stream.
transformStream.end();

// Once ndjson has flushed all data to the output stream, let's indicate done.
outputStream.on(
    "finish",
    function handleFinish() {

        console.log( chalk.green( "ndjson serialization complete!" ) );
        console.log( "- - - - - - - - - - - - - - - - - - - - - - -" );

    }
);


// ----------------------------------------------------------------------------------- //
// ----------------------------------------------------------------------------------- //


// Since the stream actions are event-driven (and asynchronous), we have to wait until
// our output stream has been closed before we can try reading it back in.
outputStream.on(
    "finish",
    function handleFinish() {

        // When we read the file back into memory, ndjson will stream, buffer, and split
        // the content based on the newline character. It will then parse each newline-
        // delimited value as a JSON object and emit it from the TRANSFORM stream.
        var inputStream = fileSystem.createReadStream( __dirname + "/data.ndjson" );
        var transformStream = inputStream.pipe( ndjson.parse() );

        transformStream
            // Each "data" event will emit one item from our original record-set.
            .on(
                "data",
                function handleRecord( data ) {

                    console.log( chalk.red( "Record (event):" ), data );

                }
            )

            // Once ndjson has parsed all the input, let's indicate done.
            .on(
                "end",
                function handleEnd() {

                    console.log( "- - - - - - - - - - - - - - - - - - - - - - -" );
                    console.log( chalk.green( "ndjson parsing complete!" ) );

                }
            );

    }
);
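
To run the demo, install the two npm dependencies and execute the script; it writes data.ndjson next to the script, then reads it back, logging the serialization message followed by each of the five records:

npm install chalk ndjson
node test.js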
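
One caveat for the "hundreds of thousands of records" scenario mentioned in the comments: the forEach() loop ignores the boolean returned by transformStream.write(), so the entire record-set gets buffered in memory before the file-system can drain it. A sketch of a backpressure-aware alternative - the writeAll() helper here is hypothetical, not part of the ndjson module:

// Write records to the stream, pausing whenever the internal buffer fills up
// and resuming on the "drain" event. NOTE: writeAll() is a hypothetical helper.
function writeAll( stream, records ) {

    var index = 0;

    function writeNext() {

        while ( index < records.length ) {

            // .write() returns false once the stream's high-water mark is reached.
            if ( ! stream.write( records[ index++ ] ) ) {

                stream.once( "drain", writeNext );
                return;

            }

        }

        // All records written - end the stream so it flushes and closes.
        stream.end();

    }

    writeNext();

}

// Usage: writeAll( transformStream, records );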
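
Also worth knowing: .pipe() does not forward errors between streams, so a failure in the read stream never reaches a handler attached to the transform stream. In newer versions of Node (10+), the read-side plumbing is often written with stream.pipeline(), which routes an error from any stage into a single callback. A rough sketch of that variation, reusing the fileSystem and ndjson requires from test.js:

var stream = require( "stream" );

// An object-mode sink for the parsed records (stands in for real import logic).
var recordSink = new stream.Writable({
    objectMode: true,
    write: function( record, encoding, callback ) {

        console.log( "Record (pipeline):", record );
        callback();

    }
});

stream.pipeline(
    fileSystem.createReadStream( __dirname + "/data.ndjson" ),
    ndjson.parse(),
    recordSink,
    function handleDone( error ) {

        if ( error ) {
            console.error( "Pipeline failed:", error );
        } else {
            console.log( "ndjson parsing complete!" );
        }

    }
);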