Skip to content

Instantly share code, notes, and snippets.

@bfahr
Created May 20, 2019 22:16
Show Gist options
  • Save bfahr/7cc77ce3daad48ddc55e28d8da722db1 to your computer and use it in GitHub Desktop.
Save bfahr/7cc77ce3daad48ddc55e28d8da722db1 to your computer and use it in GitHub Desktop.

Revisions

  1. bfahr created this gist May 20, 2019.
    37 changes: 37 additions & 0 deletions arrow.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,37 @@
    import logging

    import pandas as pd
    import pyarrow.parquet as pq
    import pyarrow.orc as orc
    import pyarrow as pa


    def read_parquet(filepath):
        """Load a single parquet file into a pandas DataFrame.

        :param filepath: path to the .parquet file to read
        :return: pandas.DataFrame with the file's contents
        """
        logging.debug('Reading parquet file: %s', filepath)
        table = pq.read_table(filepath)
        return table.to_pandas()


    def read_parquet_partitioned(root_path):
        """Read a partitioned parquet dataset rooted at *root_path*.

        Partition columns encoded in the directory layout are restored as
        regular DataFrame columns by pyarrow.

        :param root_path: root directory of the partitioned dataset
        :return: pandas.DataFrame with all partitions combined
        """
        # Fixed: original line ended with a stray C-style semicolon.
        logging.debug('Reading parquet partitioned from path: %s', root_path)
        dataset = pq.ParquetDataset(root_path)
        table = dataset.read()
        return table.to_pandas()


    def read_orc(filepath):
        """Load an ORC file into a pandas DataFrame.

        :param filepath: path to the .orc file to read
        :return: pandas.DataFrame with the file's contents
        """
        logging.debug('Reading orc file: %s', filepath)
        orc_file = orc.ORCFile(filepath)
        table = orc_file.read()
        # Convert with Arrow's native to_pandas(), consistent with the
        # parquet readers above. The previous pd.DataFrame(table.to_pydict())
        # round-trip boxed every value as a Python object, which is slower
        # and can degrade column dtypes.
        return table.to_pandas()


    def write_parquet(df, filepath):
        """Persist *df* to a single snappy-compressed parquet file.

        The DataFrame index is intentionally dropped (preserve_index=False),
        so only the columns are written.

        :param df: pandas.DataFrame to write
        :param filepath: destination path for the .parquet file
        """
        logging.debug('Writing parquet dataframe: %s', filepath)
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        pq.write_table(arrow_table, filepath, compression='snappy')


    def write_parquet_partitioned(df, root_path, partition_cols):
        """Write *df* as a snappy-compressed parquet dataset split on *partition_cols*.

        One directory level is created per partition column under *root_path*.
        The DataFrame index is dropped (preserve_index=False).

        :param df: pandas.DataFrame to write
        :param root_path: root directory for the partitioned dataset
        :param partition_cols: column names to partition the dataset by
        """
        logging.debug(
            'Writing partitioned parquet dataframe: %s, With columns: %s',
            root_path,
            partition_cols,
        )
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        pq.write_to_dataset(
            arrow_table,
            root_path=root_path,
            partition_cols=partition_cols,
            compression='snappy',
        )