Skip to content

Instantly share code, notes, and snippets.

@arthemus
Created January 7, 2025 19:24
Show Gist options
  • Save arthemus/ab3fb7be19f3c6e9d352c760a18b581b to your computer and use it in GitHub Desktop.
Save arthemus/ab3fb7be19f3c6e9d352c760a18b581b to your computer and use it in GitHub Desktop.
save_parquet
import getpass
# Login name of the account running this code, as reported by getpass.getuser().
USER = getpass.getuser()
# Echo the resolved name so it is visible which user the path below is built for.
print("User =", USER)
# Per-user base path; save_parquet() appends its folder_name to this value.
DOWNLOAD_BASE_PATH = f"/user/{USER}/notebooks/downloads"
def _run_command(args):
    """Run one external command, echo its output, and raise if it fails.

    Args:
        args: Program name followed by its arguments, as a list of strings.
            No shell is involved, so arguments are passed through verbatim
            (spaces and glob characters are not interpreted locally).

    Raises:
        FileNotFoundError: If the program cannot be found.
        RuntimeError: If the program exits with a non-zero status.
    """
    import subprocess  # local import: the surrounding cell only imports getpass

    completed = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    # Echo what the tool printed so the caller still sees e.g. the `-ls` listing.
    if completed.stdout:
        print(completed.stdout, end="")
    if completed.returncode != 0:
        raise RuntimeError(
            f"Command failed with exit code {completed.returncode}: "
            + " ".join(args)
        )


def save_parquet(df: "DataFrame", folder_name: str):
    """Write ``df`` as parquet on HDFS and leave a single ``.tar.gz`` of it there.

    Steps, in order:
      1. Empty ``DOWNLOAD_BASE_PATH/folder_name`` on HDFS.
      2. Write ``df`` there with ``df.write.mode('overwrite').parquet(...)``.
      3. Copy the folder to a local staging directory and pack it into
         ``<folder_name>.tar.gz``.
      4. Upload the archive into the same HDFS folder.
      5. Remove the ``*.parquet`` files from the HDFS folder and list it.

    Every external step is checked; the parquet files on HDFS are only removed
    after the archive has been uploaded successfully.

    Args:
        df: Object exposing ``df.write.mode(...).parquet(path)`` (presumably a
            Spark DataFrame; the annotation is a string so the function can be
            defined even when ``DataFrame`` is not imported).
        folder_name: Name of the folder under ``DOWNLOAD_BASE_PATH``; also used
            as the archive name and as the top-level directory inside it.

    Raises:
        ValueError: If ``folder_name`` is empty (which would otherwise target
            the whole ``DOWNLOAD_BASE_PATH`` for deletion).
        FileNotFoundError: If the ``hdfs`` program cannot be found.
        RuntimeError: If any ``hdfs`` command exits with a non-zero status.
    """
    import os
    import tarfile
    import tempfile

    # Guard first: with an empty name the cleanup below would wipe every
    # download folder of the user instead of a single one.
    if not folder_name or not folder_name.strip("/ "):
        raise ValueError("folder_name must be a non-empty folder name")

    folder_to_download = f"{DOWNLOAD_BASE_PATH}/{folder_name}"
    archive_name = f"{os.path.basename(folder_name.rstrip('/'))}.tar.gz"
    print(f"Working at {folder_to_download}...")

    # The glob is passed literally; the hdfs shell expands it on the HDFS side.
    _run_command(
        ["hdfs", "dfs", "-rm", "-r", "-f", "-skipTrash", f"{folder_to_download}/*"]
    )
    df.write.mode('overwrite').parquet(folder_to_download)

    # Stage inside the current working directory (where the local copy has
    # always been made) but in a fresh temporary directory, so leftovers from
    # an earlier run cannot collide and cleanup happens even on failure.
    with tempfile.TemporaryDirectory(dir=os.getcwd()) as staging_dir:
        local_copy = os.path.join(staging_dir, "parquet")
        local_archive = os.path.join(staging_dir, archive_name)

        _run_command(
            ["hdfs", "dfs", "-copyToLocal", folder_to_download, local_copy]
        )

        with tarfile.open(local_archive, "w:gz") as archive:
            for entry in sorted(os.listdir(local_copy)):
                # Skip hidden top-level entries: the former `folder/*` shell
                # glob never matched them, so the archive content is unchanged.
                if entry.startswith("."):
                    continue
                archive.add(
                    os.path.join(local_copy, entry),
                    arcname=f"{folder_name}/{entry}",
                )

        _run_command(
            ["hdfs", "dfs", "-copyFromLocal", local_archive, folder_to_download]
        )

    # Only reached when the upload succeeded, so the data is never lost.
    _run_command(
        [
            "hdfs", "dfs", "-rm", "-r", "-f", "-skipTrash",
            f"{folder_to_download}/*.parquet",
        ]
    )
    _run_command(["hdfs", "dfs", "-ls", folder_to_download])
    print(f"{folder_name}.tar.gz file available to download at {folder_to_download}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment