Skip to content

Instantly share code, notes, and snippets.

@cas--
Created April 3, 2025 14:36
Show Gist options
  • Select an option

  • Save cas--/314dde7ed51e7b3e9183eef4ee76bc02 to your computer and use it in GitHub Desktop.

Select an option

Save cas--/314dde7ed51e7b3e9183eef4ee76bc02 to your computer and use it in GitHub Desktop.

Revisions

  1. cas-- created this gist Apr 3, 2025.
    98 changes: 98 additions & 0 deletions s3_tar_extract.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,98 @@
    #! /usr/bin/env -S uv run
    # /// script
    # requires-python = ">=3.12"
    # dependencies = [
    # "boto3",
    # "click",
    # "zstandard",
    # ]
    # ///
    """
    Stream and extract a tar file from S3 using minimal memory and disk.
    This is a Python alternative to using AWS cli:
    aws s3 cp s3://example-bucket/file.tar.gz - | tar -xz -C $(mktemp -d)
    aws s3 cp s3://example-bucket/file.tar.zst - | tar --zstd -x -C $(mktemp -d)
    Requires: [uv](https://docs.astral.sh/uv/getting-started/installation/)
    Usage:
    ./s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
    uv run s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
    """

    import click
    import boto3
    import tarfile

    import zstandard

    import os


    def s3_streaming_body(s3_path):
        """Return a file-like streaming body object from an S3 object.

        Args:
            s3_path (str): Object location in the form ``s3://bucket/key``.
                The leading ``s3://`` scheme is optional.

        Returns:
            The ``Body`` entry of the boto3 ``get_object`` response.

        Raises:
            ValueError: If ``s3_path`` has no bucket name or no object key.
        """
        # Strip only a leading scheme: str.replace() would also delete any
        # "s3://" that happens to occur later inside the object key.
        bucket_name, _, object_key = s3_path.removeprefix("s3://").partition("/")

        # Validate before creating a client so malformed input fails fast with
        # a readable message instead of a tuple-unpacking error.
        if not bucket_name or not object_key:
            raise ValueError(
                f"Expected an S3 path like s3://bucket/key, got {s3_path!r}"
            )

        s3_client = boto3.client("s3")
        s3_response = s3_client.get_object(Bucket=bucket_name, Key=object_key)

        return s3_response["Body"]


    def extract_tar_gz(s3_path, extract_path):
        """Stream a gzip-compressed tar file from S3 and extract it.

        Args:
            s3_path (str): S3 location of the tar.gz file, handed to
                ``s3_streaming_body`` to open the object.
            extract_path (str): Local directory to extract contents to.
        """
        streaming_body = s3_streaming_body(s3_path)

        try:
            # "r|gz" treats the input as a forward-only stream of gzip data,
            # so tarfile never tries to seek on the S3 body.
            with tarfile.open(fileobj=streaming_body, mode="r|gz") as tar:
                tar.extractall(path=extract_path, filter="tar")
        finally:
            # tarfile does not close a caller-supplied fileobj; close the body
            # explicitly, even when extraction fails.
            streaming_body.close()


    def extract_tar_zstd(s3_path, extract_path):
        """Stream a zstd-compressed tar file from S3 and extract it.

        Args:
            s3_path (str): S3 location of the zstd-compressed tar file, handed
                to ``s3_streaming_body`` to open the object.
            extract_path (str): Local directory to extract contents to.
        """
        streaming_body = s3_streaming_body(s3_path)

        try:
            dctx = zstandard.ZstdDecompressor()

            # Decompress lazily as tarfile reads. read_across_frames=True keeps
            # reading past the first zstd frame, so archives written as several
            # concatenated frames are extracted in full rather than truncated.
            with (
                dctx.stream_reader(streaming_body, read_across_frames=True) as reader,
                tarfile.open(fileobj=reader, mode="r|") as tar,
            ):
                tar.extractall(path=extract_path, filter="tar")
        finally:
            # Close the S3 body even if decompression or extraction fails.
            streaming_body.close()


    @click.command()
    @click.argument("s3_path")
    @click.argument("extract_path")
    def extract(s3_path, extract_path):
        """Stream and extract tar file from S3 using minimal memory and disk."""
        # NOTE: the docstring above doubles as the click --help text, so the
        # details live in comments instead.
        #
        # Choose the extractor before touching the filesystem: an unsupported
        # suffix is then rejected without leaving behind an empty directory.
        if s3_path.endswith(".tar.gz"):
            extractor = extract_tar_gz
        elif s3_path.endswith(".tar.zst"):
            extractor = extract_tar_zstd
        else:
            raise ValueError("Tar file must end with .tar.gz or .tar.zst")

        os.makedirs(extract_path, exist_ok=True)

        click.echo(f"Extracting {s3_path} to {extract_path}...")
        extractor(s3_path, extract_path)
        click.echo("Done extracting.")


    # Invoke the click command only when this file is executed as a script.
    if __name__ == "__main__":
    extract()