@bijoyboban7
Forked from philschmid/get_memory_size.py
Created January 16, 2025 14:29

Revisions

  1. @philschmid revised this gist Jan 16, 2025. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions get_memory_size.py
    @@ -3,6 +3,9 @@
     import argparse
     import sys

    +# Example:
    +# python get_gpu_memory.py Qwen/Qwen2.5-7B-Instruct
    +
     # Dictionary mapping dtype strings to their byte sizes
     bytes_per_dtype: Dict[str, float] = {
         "int4": 0.5,
  2. @philschmid created this gist Jan 16, 2025.
    106 changes: 106 additions & 0 deletions get_memory_size.py
    @@ -0,0 +1,106 @@
    from typing import Dict, Union
    from huggingface_hub import get_safetensors_metadata
    import argparse
    import sys

    # Dictionary mapping dtype strings to their byte sizes
    bytes_per_dtype: Dict[str, float] = {
        "int4": 0.5,
        "int8": 1,
        "float8": 1,
        "float16": 2,
        "float32": 4,
    }


    def calculate_gpu_memory(parameters: float, bytes_per_param: float) -> float:
        """Calculates the GPU memory required for serving a Large Language Model (LLM).

        This function estimates the GPU memory needed using the formula:

            M = (P * 4B) / (32 / Q) * 1.18

        where:
        - M is the GPU memory in Gigabytes
        - P is the number of parameters in billions (e.g., 7 for a 7B model)
        - 4B represents 4 bytes per parameter
        - 32 represents the bits in 4 bytes
        - Q is the quantization bits per parameter (e.g., 16, 8, or 4)
        - 1.18 represents ~18% overhead for additional GPU memory requirements

        The expression simplifies to P * bytes_per_param * 1.18.

        Args:
            parameters: Number of model parameters in billions
            bytes_per_param: Number of bytes per parameter based on dtype

        Returns:
            Estimated GPU memory required in Gigabytes

        Examples:
            >>> calculate_gpu_memory(7, bytes_per_dtype["float16"])
            16.52
            >>> calculate_gpu_memory(13, bytes_per_dtype["int8"])
            15.34
        """
        memory = round((parameters * 4) / (32 / (bytes_per_param * 8)) * 1.18, 2)
        return memory
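    # Quick arithmetic check of the formula above, which reduces to
    # parameters * bytes_per_param * 1.18. For a 7B model:
    #   float16: 7 * 2   * 1.18 = 16.52 GB
    #   int8:    7 * 1   * 1.18 =  8.26 GB
    #   int4:    7 * 0.5 * 1.18 =  4.13 GB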


    def get_model_size(model_id: str, dtype: str = "float16") -> Union[float, None]:
        """Get the estimated GPU memory requirement for a Hugging Face model.

        Args:
            model_id: Hugging Face model ID (e.g., "facebook/opt-350m")
            dtype: Data type for model loading ("float16", "int8", etc.)

        Returns:
            Estimated GPU memory in GB, or None if estimation fails

        Examples:
            >>> get_model_size("facebook/opt-350m")
            0.82
            >>> get_model_size("meta-llama/Llama-2-7b-hf", dtype="int8")
            6.86
        """
        try:
            if dtype not in bytes_per_dtype:
                raise ValueError(
                    f"Unsupported dtype: {dtype}. Supported types: {list(bytes_per_dtype.keys())}"
                )

            metadata = get_safetensors_metadata(model_id)
            if not metadata or not metadata.parameter_count:
                raise ValueError(f"Could not fetch metadata for model: {model_id}")

            # parameter_count maps each stored dtype to its parameter count;
            # sum across dtypes so multi-dtype checkpoints are counted fully.
            model_parameters = sum(metadata.parameter_count.values())
            model_parameters = int(model_parameters) / 1_000_000_000  # Convert to billions
            return calculate_gpu_memory(model_parameters, bytes_per_dtype[dtype])

        except Exception as e:
            print(f"Error estimating model size: {e}", file=sys.stderr)
            return None
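    # For example (the figure depends on the safetensors metadata at call
    # time; Qwen2.5-7B-Instruct reports roughly 7.6B parameters):
    #   get_model_size("Qwen/Qwen2.5-7B-Instruct")  # ~18 GB in float16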


    def main():
        """Command-line interface for GPU memory estimation."""
        parser = argparse.ArgumentParser(
            description="Estimate GPU memory requirements for Hugging Face models"
        )
        parser.add_argument(
            "model_id", help="Hugging Face model ID (e.g., Qwen/Qwen2.5-7B-Instruct)"
        )
        parser.add_argument(
            "--dtype",
            default="float16",
            choices=bytes_per_dtype.keys(),
            help="Data type for model loading",
        )

        args = parser.parse_args()
        size = get_model_size(args.model_id, args.dtype)
        if size is None:
            sys.exit(1)  # error was already printed to stderr by get_model_size

        print(
            f"Estimated GPU memory requirement for {args.model_id}: {size:.2f} GB ({args.dtype})"
        )


    if __name__ == "__main__":
        main()
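
A quick way to sanity-check the arithmetic without the Hub call is to invoke calculate_gpu_memory directly. The parameter count below is an assumption (Qwen2.5-7B-Instruct reports roughly 7.6 billion parameters), not a value the script hardcodes:

    # Assumed parameter count; the script normally reads the real value
    # from the model's safetensors metadata.
    print(calculate_gpu_memory(7.6, bytes_per_dtype["float16"]))  # 17.94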