Skip to content

Instantly share code, notes, and snippets.

@ylow
ylow / content_defined_parquet_writer.py
Created September 30, 2024 01:15
Content Defined Parquet Writing Prototype
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# Module-level row-count limits: 512 (minimum) and 2048 (maximum).
# NOTE(review): presumably the lower and upper bounds on how many rows go into
# each chunk emitted by write_parquet_content_defined -- the function body is
# not fully visible here, so confirm against its use of these names.
min_row_count: int = 512
max_row_count: int = 2048
def write_parquet_content_defined(df: pd.DataFrame, key_column: str, output_file: str):
# Initialize the Parquet writer object
writer = None
batch_accumulator = []