Skip to content

Instantly share code, notes, and snippets.

@bfahr
Created May 20, 2019 22:16
Show Gist options
  • Save bfahr/7cc77ce3daad48ddc55e28d8da722db1 to your computer and use it in GitHub Desktop.
Save bfahr/7cc77ce3daad48ddc55e28d8da722db1 to your computer and use it in GitHub Desktop.

Revisions

  1. bfahr created this gist May 20, 2019.
    37 changes: 37 additions & 0 deletions arrow.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,37 @@
    import logging

    import pandas as pd
    import pyarrow.parquet as pq
    import pyarrow.orc as orc
    import pyarrow as pa


    def read_parquet(filepath):
        """Load a single parquet file into a pandas DataFrame.

        :param filepath: path to the .parquet file to read
        :return: pandas.DataFrame with the file's contents
        """
        logging.debug('Reading parquet file: %s', filepath)
        table = pq.read_table(filepath)
        return table.to_pandas()


    def read_parquet_partitioned(root_path):
        """Read a partitioned parquet dataset rooted at *root_path*.

        Partition columns encoded in the directory layout are restored as
        regular DataFrame columns by pyarrow.

        :param root_path: root directory of the partitioned dataset
        :return: pandas.DataFrame with all partitions combined
        """
        # Fixed: original line ended with a stray C-style semicolon.
        logging.debug('Reading parquet partitioned from path: %s', root_path)
        dataset = pq.ParquetDataset(root_path)
        table = dataset.read()
        return table.to_pandas()


    def read_orc(filepath):
        """Load an ORC file into a pandas DataFrame.

        :param filepath: path to the .orc file to read
        :return: pandas.DataFrame with the file's contents
        """
        logging.debug('Reading orc file: %s', filepath)
        orc_file = orc.ORCFile(filepath)
        table = orc_file.read()
        # Convert with Arrow's native to_pandas(), consistent with the
        # parquet readers above. The previous pd.DataFrame(table.to_pydict())
        # round-trip boxed every value as a Python object, which is slower
        # and can degrade column dtypes.
        return table.to_pandas()


    def write_parquet(df, filepath):
        """Persist *df* to a single snappy-compressed parquet file.

        The DataFrame index is intentionally dropped (preserve_index=False),
        so only the columns are written.

        :param df: pandas.DataFrame to write
        :param filepath: destination path for the .parquet file
        """
        logging.debug('Writing parquet dataframe: %s', filepath)
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        pq.write_table(arrow_table, filepath, compression='snappy')


    def write_parquet_partitioned(df, root_path, partition_cols):
        """Write *df* as a snappy-compressed parquet dataset split on *partition_cols*.

        One directory level is created per partition column under *root_path*.
        The DataFrame index is dropped (preserve_index=False).

        :param df: pandas.DataFrame to write
        :param root_path: root directory for the partitioned dataset
        :param partition_cols: column names to partition the dataset by
        """
        logging.debug(
            'Writing partitioned parquet dataframe: %s, With columns: %s',
            root_path,
            partition_cols,
        )
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        pq.write_to_dataset(
            arrow_table,
            root_path=root_path,
            partition_cols=partition_cols,
            compression='snappy',
        )