Created
May 20, 2019 22:16
-
-
Save bfahr/7cc77ce3daad48ddc55e28d8da722db1 to your computer and use it in GitHub Desktop.
Revisions
-
bfahr created this gist
May 20, 2019. There are no files selected for viewing.
"""Thin helpers for reading/writing pandas DataFrames as Parquet and ORC via pyarrow."""
import logging

import pandas as pd
import pyarrow as pa
import pyarrow.orc as orc
import pyarrow.parquet as pq


def read_parquet(filepath):
    """Read a single Parquet file at *filepath* into a pandas DataFrame."""
    logging.debug('Reading parquet file: %s', filepath)
    return pq.read_table(filepath).to_pandas()


def read_parquet_partitioned(root_path):
    """Read a partitioned Parquet dataset rooted at *root_path* into a DataFrame.

    Partition columns are materialized as regular DataFrame columns.
    """
    logging.debug('Reading parquet partitioned from path: %s', root_path)
    dataset = pq.ParquetDataset(root_path)
    table = dataset.read()
    return table.to_pandas()


def read_orc(filepath):
    """Read an ORC file at *filepath* into a pandas DataFrame."""
    logging.debug('Reading orc file: %s', filepath)
    orc_file = orc.ORCFile(filepath)
    table = orc_file.read()
    # Use to_pandas() directly, consistent with the parquet readers; the previous
    # pd.DataFrame(table.to_pydict()) round trip through Python objects dropped
    # Arrow type fidelity (e.g. timestamps arriving as object dtype).
    return table.to_pandas()


def write_parquet(df, filepath):
    """Write *df* to a single snappy-compressed Parquet file at *filepath*.

    The DataFrame index is not stored (preserve_index=False).
    """
    logging.debug('Writing parquet dataframe: %s', filepath)
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(table, filepath, compression='snappy')


def write_parquet_partitioned(df, root_path, partition_cols):
    """Write *df* under *root_path* as a Parquet dataset partitioned by *partition_cols*.

    The DataFrame index is not stored (preserve_index=False); files are
    snappy-compressed.
    """
    logging.debug('Writing partitioned parquet dataframe: %s, With columns: %s',
                  root_path, partition_cols)
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_to_dataset(table, root_path=root_path,
                        partition_cols=partition_cols, compression='snappy')