"""Sync a local folder with S3."""
import hashlib
import mimetypes
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Union

import boto3

class S3Sync:
    def __init__(self, bucket: str, aws_region: str = "us-east-1"):
        self.bucket = bucket
        self.s3 = boto3.client("s3", region_name=aws_region)

    def _get_local_files(self, directory: str) -> Dict[str, Path]:
        # Map each file's bucket-relative key to its local Path.
        return {
            path.relative_to(directory).as_posix(): path
            for path in Path(directory).rglob("*")
            if path.is_file()
        }

    def _get_s3_files(self, prefix: str = "") -> Dict[str, str]:
        # Map each existing S3 key to its ETag (surrounding quotes stripped).
        files = {}
        paginator = self.s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
            if "Contents" in page:
                files.update(
                    {obj["Key"]: obj["ETag"].strip('"') for obj in page["Contents"]}
                )
        return files

    def _calculate_md5(self, file_path: Path) -> str:
        # Hash the file in chunks so large files don't have to fit in memory.
        md5_hash = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                md5_hash.update(chunk)
        return md5_hash.hexdigest()

    def _upload_file(self, args: tuple) -> str:
        local_path, s3_key = args
        content_type = mimetypes.guess_type(local_path)[0] or "application/octet-stream"
        self.s3.upload_file(
            str(local_path),
            self.bucket,
            s3_key,
            ExtraArgs={"ContentType": content_type},
        )
        return s3_key

    def sync_directory(
        self, local_dir: Union[str, Path], prefix: str = "", max_workers: int = 10
    ) -> List[str]:
        """
        Sync a local directory to S3, uploading new or modified files.

        Args:
            local_dir: Local directory path to sync
            prefix: S3 key prefix (folder path in bucket)
            max_workers: Number of concurrent upload threads

        Returns:
            List of uploaded S3 keys
        """
        local_files = self._get_local_files(str(local_dir))
        s3_files = self._get_s3_files(prefix)
        to_upload = []
        for rel_path, local_path in local_files.items():
            s3_key = f"{prefix}/{rel_path}" if prefix else rel_path
            local_hash = self._calculate_md5(local_path)
            # Note: an S3 ETag equals the object's MD5 only for non-multipart
            # uploads; objects large enough to be uploaded in multiple parts
            # will never match and will always be re-uploaded.
            if s3_key not in s3_files or local_hash != s3_files[s3_key]:
                to_upload.append((str(local_path), s3_key))
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            uploaded = list(executor.map(self._upload_file, to_upload))
        return uploaded
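

# A minimal usage sketch, not part of the original gist: the bucket name,
# prefix, and local directory below are placeholder values. Credentials are
# resolved by boto3's normal chain (environment, config files, instance role).
if __name__ == "__main__":
    syncer = S3Sync(bucket="my-example-bucket", aws_region="us-east-1")
    uploaded_keys = syncer.sync_directory("./site", prefix="static", max_workers=10)
    print(f"Uploaded {len(uploaded_keys)} file(s):")
    for key in uploaded_keys:
        print(f"  s3://{syncer.bucket}/{key}")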