Skip to content

Instantly share code, notes, and snippets.

@tim-hub
Forked from madjo/converter.py
Created January 4, 2024 04:39
Show Gist options
  • Select an option

  • Save tim-hub/2ffbadc4bc19d9e54ab1169cd81785fc to your computer and use it in GitHub Desktop.

Select an option

Save tim-hub/2ffbadc4bc19d9e54ab1169cd81785fc to your computer and use it in GitHub Desktop.

Revisions

  1. @madjo madjo created this gist Oct 28, 2019.
    52 changes: 52 additions & 0 deletions converter.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,52 @@
    #! /usr/bin/env python
    import pandas as pd
    import argparse
    import os
    import pyarrow.orc as orc

    # usage: ./converter.py --help

    # Converter names accepted on the command line (passed to argparse as `choices`).
    CONVERTERS = ['parquet2json', 'orc2json', 'orc2parquet']

    def parquet2json(filename):
        """Convert ``<filename>.parquet`` into ``<filename>.json``.

        ``filename`` is the path *without* its extension; the ``.parquet`` and
        ``.json`` suffixes are appended here. The JSON is written with
        ``orient='records'``.
        """
        source_path = f'{filename}.parquet'
        target_path = f'{filename}.json'
        frame = pd.read_parquet(source_path)
        frame.to_json(target_path, orient='records')


    def orc2json(filename):
        """Convert ``<filename>.orc`` into ``<filename>.json``.

        ``filename`` is the path *without* its extension. The ORC file is read
        in full, turned into a pandas DataFrame, and written as JSON with
        ``orient='records'``.
        """
        frame = orc.ORCFile(f'{filename}.orc').read().to_pandas()
        frame.to_json(f'{filename}.json', orient='records')


    def orc2parquet(filename):
        """Convert ``<filename>.orc`` into a gzip-compressed ``<filename>.parquet``.

        ``filename`` is the path *without* its extension. The ORC file is read
        in full and turned into a pandas DataFrame before being written out.
        """
        frame = orc.ORCFile(f'{filename}.orc').read().to_pandas()
        frame.to_parquet(f'{filename}.parquet', compression='gzip')


    def convert(args):
        """Run the converter selected by ``args.converter`` on ``args.sourcefile``.

        Args:
            args: Parsed arguments exposing ``converter`` (a converter name such
                as ``"orc2json"``) and ``sourcefile`` (path to the input file).

        Raises:
            ValueError: If ``args.converter`` is not a known converter name.
        """
        dispatch = {
            "parquet2json": parquet2json,
            "orc2json": orc2json,
            "orc2parquet": orc2parquet,
        }

        # Look the converter up explicitly so an unknown name fails with a
        # clear message instead of "'NoneType' object is not callable".
        try:
            converter_func = dispatch[args.converter]
        except KeyError:
            raise ValueError(
                f"unknown converter {args.converter!r}; "
                f"expected one of {sorted(dispatch)}"
            ) from None

        # Each converter re-appends its own fixed extension (.parquet / .orc),
        # so only the extension-less base path is handed over. NOTE: a source
        # file whose extension differs from that fixed one will not be found.
        base_path = os.path.splitext(args.sourcefile)[0]
        converter_func(base_path)


    def main(argv=None):
        """Parse command-line arguments and run the requested conversion.

        Args:
            argv: Optional list of argument strings. When ``None`` (the
                default), argparse reads the arguments from ``sys.argv``.
        """
        parser = argparse.ArgumentParser(
            description="Convert parquet to json | orc to json | orc to parquet."
        )
        parser.add_argument("converter", choices=CONVERTERS,
                            help="conversion to perform")
        parser.add_argument("sourcefile",
                            help="path to the input file")
        parser.set_defaults(func=convert)
        args = parser.parse_args(argv)
        args.func(args)


    # Run the CLI only when this file is executed as a script, not when imported.
    if __name__ == "__main__":
    main()