cas-- · April 3, 2025 14:36 · Apr 3, 2025
diff --git a/s3_tar_extract.py b/s3_tar_extract.py
@@ -0,0 +1,98 @@
+#! /usr/bin/env -S uv run
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "boto3",
+#     "click",
+#     "zstandard",
+# ]
+# ///
+"""
+Stream and extract a tar file from S3 using minimal memory and disk.
+
+This is a Python alternative to using the AWS CLI:
+
+    aws s3 cp s3://example-bucket/file.tar.gz - | tar -xz -C $(mktemp -d)
+    aws s3 cp s3://example-bucket/file.tar.zst - | tar --zstd -x -C $(mktemp -d)
+
+Requires: [uv](https://docs.astral.sh/uv/getting-started/installation/)
+
+Usage:
+
+    ./s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
+    uv run s3_tar_extract.py s3://example-bucket/file.tar.zst $(mktemp -d)
+"""
+
+import click
+import boto3
+import tarfile
+
+import zstandard
+
+import os
+
+
+def s3_streaming_body(s3_path):
+    """Return a file-like streaming body object from an S3 object"""
+    s3_client = boto3.client("s3")
+
+    bucket_name, object_key = s3_path.replace("s3://", "").split("/", 1)
+
+    s3_response = s3_client.get_object(Bucket=bucket_name, Key=object_key)
+
+    return s3_response["Body"]
+
+
+def extract_tar_gz(s3_path, extract_path):
+    """
+    Stream a tar.gz file from S3 and extract it.
+
+    Args:
+        bucket_name (str): S3 bucket name
+        object_key (str): S3 object key of the tar.gz file
+        extract_path (str): Local directory to extract contents to
+    """
+    streaming_body = s3_streaming_body(s3_path)
+
+    with tarfile.open(fileobj=streaming_body, mode="r|gz") as tar:
+        tar.extractall(path=extract_path, filter="tar")
+
+
+def extract_tar_zstd(s3_path, extract_path):
+    """Stream a zstd-compressed tar file from S3 and extract it.
+
+    Args:
+        bucket_name (str): S3 bucket name
+        object_key (str): S3 object key of the zstd-compressed file
+        extract_path (str): Local directory to extract contents to
+    """
+    streaming_body = s3_streaming_body(s3_path)
+
+    # Create a stream reader that decompresses as it reads
+    dctx = zstandard.ZstdDecompressor()
+    reader = dctx.stream_reader(streaming_body)
+
+    with tarfile.open(fileobj=reader, mode="r|") as tar:
+        tar.extractall(path=extract_path, filter="tar")
+
+
+@click.command()
+@click.argument("s3_path")
+@click.argument("extract_path")
+def extract(s3_path, extract_path):
+    """Stream and extract tar file from S3 using minimal memory and disk."""
+    os.makedirs(extract_path, exist_ok=True)
+
+    click.echo(f"Extracting {s3_path} to {extract_path}...")
+    if s3_path.endswith(".tar.gz"):
+        extract_tar_gz(s3_path, extract_path)
+    elif s3_path.endswith(".tar.zst"):
+        extract_tar_zstd(s3_path, extract_path)
+    else:
+        raise ValueError("Tar file must end with .tar.gz or .tar.zst")
+
+    click.echo("Done extracting.")
+
+
+# Script entry point: invoke the click command, which takes s3_path and
+# extract_path as its two positional arguments.
+if __name__ == "__main__":
+    extract()
No results found