Skip to content

Instantly share code, notes, and snippets.

@mvdoc
Last active May 26, 2024 18:34
Show Gist options
  • Save mvdoc/c46e050bda45d3cb5b36ed40c77f2c24 to your computer and use it in GitHub Desktop.
Save mvdoc/c46e050bda45d3cb5b36ed40c77f2c24 to your computer and use it in GitHub Desktop.

Revisions

  1. mvdoc revised this gist May 26, 2024. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion get_size_localcopy_annex.py
    Original file line number Diff line number Diff line change
    @@ -7,7 +7,7 @@

    def get_files_with_one_copy():
    try:
    result = subprocess.run(['git-annex', 'find', '--copies=1', '--in=here'], capture_output=True, text=True, check=True)
    result = subprocess.run(['git-annex', 'find', '--copies=1', '--and', '--not', '--copies=2', '--and', '--in=here'], capture_output=True, text=True, check=True)
    files = result.stdout.splitlines()
    return files
    except subprocess.CalledProcessError as e:
    @@ -16,6 +16,9 @@ def get_files_with_one_copy():

    def get_file_size(file):
    try:
    # result = subprocess.run(['git-annex', 'info', file, '--json', '--bytes', '--fast'], capture_output=True, text=True, check=True)
    # info = json.loads(result.stdout)
    # return int(info['size'])
    result = subprocess.run(['du', '-bL', file], capture_output=True, text=True, check=True)
    return int(result.stdout.split()[0])
    except subprocess.CalledProcessError as e:
  2. mvdoc created this gist May 26, 2024.
    42 changes: 42 additions & 0 deletions get_size_localcopy_annex.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,42 @@
    # This script computes the total size of the git-annex files whose only copy is the local one.
    # It is useful for estimating how much storage would be needed if all of those files were archived.
    import subprocess
    from tqdm import tqdm
    import json
    import os

    def get_files_with_one_copy():
        """Return the annexed files whose only known copy is in this repository.

        Runs ``git-annex find`` in the current working directory.

        Returns:
            list[str]: One entry per line printed by ``git-annex find``. An
            empty list is returned (after printing the error) when the
            command exits with a non-zero status.
        """
        # ``--copies=N`` matches files that have N copies *or more*, so
        # ``--copies=1`` alone also selects files that are replicated
        # elsewhere.  Excluding ``--copies=2`` narrows the match to files
        # with exactly one copy, and ``--in=here`` requires that copy to be
        # the local one.
        command = [
            'git-annex', 'find',
            '--copies=1', '--and', '--not', '--copies=2',
            '--and', '--in=here',
        ]
        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Error finding files: {e}")
            return []
        return result.stdout.splitlines()

    def get_file_size(file):
        """Return the size in bytes of the content that ``file`` points to.

        Symbolic links are followed, so for a symlink the size of its target
        is reported.  Intended for regular files (or links to them).

        Args:
            file: Path of the file to measure.

        Returns:
            int: The size in bytes, or 0 (after printing the error) when the
            path cannot be inspected, e.g. it is missing or is a dangling
            symlink.
        """
        # os.stat() follows symlinks and reports the apparent size, which is
        # the figure ``du -bL`` printed for a file.  Asking the OS directly
        # avoids spawning one process per file, cannot misread a name that
        # starts with "-" as a command-line option, and leaves no output to
        # parse.
        try:
            return os.stat(file).st_size
        except (OSError, ValueError) as e:
            # ValueError covers paths the OS cannot accept at all, such as
            # ones containing an embedded NUL byte.
            print(f"Error getting info for {file}: {e}")
            return 0

    def main():
        """Report the combined size of every file returned by get_files_with_one_copy()."""
        files = get_files_with_one_copy()
        n_files = len(files)

        # Add up the per-file sizes while tqdm shows progress over the list.
        total_size = sum(
            get_file_size(path) for path in tqdm(files, desc="Processing files")
        )

        # Let the external ``numfmt`` tool turn the byte count into an IEC
        # string (KiB, MiB, ...); its exit status is deliberately not checked.
        numfmt_cmd = ['numfmt', '--to=iec-i', '--suffix=B', str(total_size)]
        completed = subprocess.run(numfmt_cmd, capture_output=True, text=True)
        human_readable_size = completed.stdout.strip()

        print(f"Total size of {n_files} files with only one copy: {human_readable_size}")


    # Run the report only when this file is executed as a script.
    if __name__ == "__main__":
        main()