import argparse
import sys
from typing import Dict, Union

from huggingface_hub import get_safetensors_metadata

# Dictionary mapping dtype strings to their byte sizes per parameter
bytes_per_dtype: Dict[str, float] = {
    "int4": 0.5,
    "int8": 1,
    "float8": 1,
    "float16": 2,
    "float32": 4,
}
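
# Note: calculate_gpu_memory below converts these byte sizes back to bit
# widths, e.g. float16 -> 2 * 8 = 16 bits and int4 -> 0.5 * 8 = 4 bits.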


def calculate_gpu_memory(parameters: float, bytes_per_parameter: float) -> float:
    """Calculates the GPU memory required for serving a Large Language Model (LLM).

    This function estimates the GPU memory needed using the formula:

        M = (P * 4B) / (32 / Q) * 1.18

    where:
    - M is the GPU memory in Gigabytes
    - P is the number of parameters in billions (e.g., 7 for a 7B model)
    - 4B represents 4 bytes per parameter
    - 32 represents the number of bits in 4 bytes
    - Q is the quantization bit width (e.g., 16, 8, or 4 bits)
    - 1.18 represents ~18% overhead for additional GPU memory requirements

    Args:
        parameters: Number of model parameters in billions
        bytes_per_parameter: Number of bytes per parameter based on dtype

    Returns:
        Estimated GPU memory required in Gigabytes

    Examples:
        >>> calculate_gpu_memory(7, bytes_per_dtype["float16"])
        16.52
        >>> calculate_gpu_memory(13, bytes_per_dtype["int8"])
        15.34
    """
    # (P * 4) / (32 / Q) reduces to P * bytes_per_parameter; 1.18 adds the overhead
    memory = round((parameters * 4) / (32 / (bytes_per_parameter * 8)) * 1.18, 2)
    return memory
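
# Worked example (illustrative only, not part of the CLI): a 70B model
# quantized to int4 needs roughly (70 * 4) / (32 / 4) * 1.18 = 35 * 1.18
# = 41.3 GB:
#
#     >>> calculate_gpu_memory(70, bytes_per_dtype["int4"])
#     41.3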


def get_model_size(model_id: str, dtype: str = "float16") -> Union[float, None]:
    """Get the estimated GPU memory requirement for a Hugging Face model.

    Args:
        model_id: Hugging Face model ID (e.g., "facebook/opt-350m")
        dtype: Data type for model loading ("float16", "int8", etc.)

    Returns:
        Estimated GPU memory in GB, or None if estimation fails

    Examples:
        >>> get_model_size("facebook/opt-350m")  # ~331M parameters
        0.78
        >>> get_model_size("meta-llama/Llama-2-7b-hf", dtype="int8")  # ~6.74B parameters
        7.95
    """
    try:
        if dtype not in bytes_per_dtype:
            raise ValueError(
                f"Unsupported dtype: {dtype}. Supported types: {list(bytes_per_dtype.keys())}"
            )

        metadata = get_safetensors_metadata(model_id)
        if not metadata or not metadata.parameter_count:
            raise ValueError(f"Could not fetch metadata for model: {model_id}")

        # Sum across dtypes so mixed-precision checkpoints are counted in full
        model_parameters = sum(metadata.parameter_count.values())
        model_parameters = int(model_parameters) / 1_000_000_000  # Convert to billions
        return calculate_gpu_memory(model_parameters, bytes_per_dtype[dtype])

    except Exception as e:
        print(f"Error estimating model size: {e}", file=sys.stderr)
        return None
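

# A convenience sketch (an addition, not part of the original script): estimate
# the same model under every supported dtype in one call, reusing get_model_size
# exactly as defined above.
def compare_dtypes(model_id: str) -> Dict[str, Union[float, None]]:
    """Return the estimated GPU memory for model_id under each supported dtype."""
    return {dtype: get_model_size(model_id, dtype) for dtype in bytes_per_dtype}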


def main():
    """Command-line interface for GPU memory estimation."""
    parser = argparse.ArgumentParser(
        description="Estimate GPU memory requirements for Hugging Face models"
    )
    parser.add_argument(
        "model_id", help="Hugging Face model ID (e.g., Qwen/Qwen2.5-7B-Instruct)"
    )
    parser.add_argument(
        "--dtype",
        default="float16",
        choices=bytes_per_dtype.keys(),
        help="Data type for model loading",
    )

    args = parser.parse_args()
    size = get_model_size(args.model_id, args.dtype)
    if size is None:
        # get_model_size has already reported the failure on stderr
        sys.exit(1)

    print(
        f"Estimated GPU memory requirement for {args.model_id}: {size:.2f} GB ({args.dtype})"
    )


if __name__ == "__main__":
    main()
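
# Example invocation, assuming this script is saved as estimate_gpu_memory.py
# (the reported size depends on the checkpoint's safetensors metadata):
#
#     python estimate_gpu_memory.py Qwen/Qwen2.5-7B-Instruct --dtype int8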