Skip to content

Instantly share code, notes, and snippets.

@Chillee
Last active April 3, 2025 11:26
Show Gist options
  • Save Chillee/42e4635c59760a74cb3b4ba7ea5ad9f8 to your computer and use it in GitHub Desktop.
Save Chillee/42e4635c59760a74cb3b4ba7ea5ad9f8 to your computer and use it in GitHub Desktop.

Revisions

  1. Chillee revised this gist Jul 31, 2024. No changes.
  2. Chillee revised this gist Jun 21, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions mm_weird.py
    Original file line number Diff line number Diff line change
    @@ -25,7 +25,7 @@ def one_bit_random(*shape, dtype=torch.bfloat16):
    return x

    def sparse(*shape, dtype=torch.bfloat16):
    x = torch.randn(*args, **kwargs)
    x = torch.randn(*shape, dtype=dtype)
    x = torch.where(x < 0, 0, x)
    return x

    @@ -53,4 +53,4 @@ def median(x):
    return x[len(x)//2]

    for name, _ in original_setups:
    print(f"{name}: {median(results[name])/1e12}")
    print(f"{name}: {median(results[name])/1e12}")
  3. Chillee revised this gist Apr 29, 2024. 1 changed file with 100 additions and 0 deletions.
    100 changes: 100 additions & 0 deletions sustainable_clock_speed.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,100 @@
    import torch
    torch.set_default_device('cuda')
    from triton.testing import do_bench
    from collections import defaultdict
    from functools import partial
    import random
    import subprocess
    random.seed(0)

    def set_gpu_limits(ref_sm_clock=1810, power_limit=330, gpu_index=0):
        """Lock the GPU clock and set the power limit of one GPU via ``nvidia-smi``.

        Two ``sudo nvidia-smi`` commands are issued in order: first
        ``--lock-gpu-clocks`` with ``ref_sm_clock`` as both bounds of the
        range, then ``-pl`` with ``power_limit``.

        Args:
            ref_sm_clock: Clock value used for both the lower and upper bound
                of ``--lock-gpu-clocks`` (MHz per the nvidia-smi manual).
            power_limit: Value handed to ``-pl`` (watts per the nvidia-smi
                manual).
            gpu_index: Device index passed to ``nvidia-smi -i``. Defaults to
                0, the value that used to be hard-coded.

        Raises:
            subprocess.CalledProcessError: If either command exits non-zero.
        """
        # Shared prefix selecting the target device for both invocations.
        base_cmd = ["sudo", "nvidia-smi", "-i", str(gpu_index)]
        subprocess.check_output(
            base_cmd + [f"--lock-gpu-clocks={ref_sm_clock},{ref_sm_clock}"]
        )
        subprocess.check_output(base_cmd + [f"-pl={power_limit}"])
    def get_flops(A, B):
        """Benchmark ``torch.mm(A, B)`` and return the achieved FLOP per second.

        The operation count is ``2 * A.shape[0] * A.shape[1] * B.shape[1]``.
        The duration reported by ``do_bench`` is treated as milliseconds, so
        ``1e3 / duration`` is the number of matmuls per second.
        """
        elapsed_ms = do_bench(lambda: torch.mm(A, B))
        rows, inner = A.shape[0], A.shape[1]
        cols = B.shape[1]
        flop_count = rows * inner * cols * 2
        runs_per_second = 1e3 / elapsed_ms
        return runs_per_second * flop_count

    # Matmul problem size: all three dimensions use the same extent.
    M = N = K = 8192
    def get_tensors(f):
        """Create the two matmul operands with the tensor factory ``f``.

        ``f`` is invoked as ``f(rows, cols, dtype=torch.bfloat16)``, first for
        the left operand (``M x K``) and then for an ``N x K`` tensor whose
        transpose becomes the right operand.

        Returns:
            A tuple ``(A, B)`` where ``B`` is the ``.t()`` of the second tensor.
        """
        operand_dtype = torch.bfloat16
        lhs = f(M, K, dtype=operand_dtype)
        rhs_source = f(N, K, dtype=operand_dtype)
        return lhs, rhs_source.t()

    def one_bit_random(*shape, dtype=torch.bfloat16):
        """Return a tensor whose entries come from a single bit of random samples.

        Normal samples are drawn, their storage is viewed as ``int16``, the
        bit selected by the mask ``0b1000`` is kept, and the resulting integers
        (0 or 8) are converted back to ``dtype``.

        NOTE(review): the ``int16`` view presumably expects a 16-bit ``dtype``
        such as the default bfloat16 -- confirm before passing other dtypes.
        """
        samples = torch.randn(*shape, dtype=dtype)
        raw_bits = samples.view(torch.int16)
        selected_bit = raw_bits & 0b1000
        return selected_bit.to(dtype=dtype)

    def sparse(*shape, dtype=torch.bfloat16):
        """Return normal samples with most entries replaced by zero.

        A second, uniform draw of the same shape decides each entry's fate:
        the sample survives only where that draw is not greater than 0.1,
        and every other position becomes 0.
        """
        dense = torch.randn(*shape, dtype=dtype)
        keep = torch.rand_like(dense) <= 0.1
        return torch.where(keep, dense, 0)

    def checkerboard(*shape, dtype=torch.bfloat16):
        """Return normal samples zeroed in a checkerboard pattern.

        Entry ``(i, j)`` keeps its ``torch.randn`` sample when ``i - j`` is
        even and is set to 0 otherwise.

        Args:
            *shape: Tensor dimensions; the pattern is built from ``shape[0]``
                (rows) and ``shape[1]`` (columns), so two dimensions are
                expected.
            dtype: Dtype of the sampled tensor.

        Returns:
            A tensor of the requested shape and dtype.
        """
        x = torch.randn(*shape, dtype=dtype)
        # Row indices vary down the first axis and column indices along the
        # second, so the mask has the same (rows, cols) layout as ``x`` and
        # also works for non-square shapes.
        row_idx = torch.arange(shape[0]).view(-1, 1)
        col_idx = torch.arange(shape[1]).view(1, -1)
        keep = (row_idx - col_idx) % 2 == 0
        return torch.where(keep, x, 0)

    def ternary(*shape, dtype=torch.bfloat16):
        """Return a tensor of random integers drawn from {-1, 0, 1}.

        Args:
            *shape: Dimensions of the tensor.
            dtype: Dtype of the returned tensor. It is now honoured instead of
                always producing bfloat16; the default is unchanged.

        Returns:
            A tensor of the given shape whose values are -1, 0 or 1.
        """
        # ``high`` is exclusive, so the sampled integers are -1, 0 and 1.
        return torch.randint(low=-1, high=2, size=shape, dtype=dtype)

    # Registry of (label, tensor factory) pairs to benchmark. Each factory is
    # called by get_tensors as f(rows, cols, dtype=torch.bfloat16).
    # Only "randn" is enabled; get_results reads results['randn'], so that entry
    # must stay active. The commented-out entries are alternative input
    # distributions that can be re-enabled.
    original_setups = [
    # ("zeros", torch.zeros),
    ("randn", torch.randn),
    # ("checkerboard", checkerboard),
    # ("sparse", sparse),
    # ("rand", torch.rand),
    # ("ternary", ternary),
    # ("one bit", one_bit_random),
    # ("all_pi", lambda *shape, dtype: torch.full(shape, fill_value=3.1415926535897932384626, dtype=dtype)),
    # ("twos", lambda *shape, dtype: torch.full(shape, fill_value=2, dtype=dtype)),
    ]
    def get_results(clocks, power):
        """Measure median randn-matmul throughput at a given clock and power cap.

        The GPU limits are applied with ``set_gpu_limits(clocks, power)``, then
        every entry of ``original_setups`` is benchmarked in 10 rounds, with
        the order reshuffled before each round.

        Returns:
            The median of the samples collected under the ``'randn'`` label.
        """
        def median(samples):
            # Middle element for odd counts, mean of the two middle elements
            # for even counts.
            ordered = sorted(samples)
            upper = len(ordered) // 2
            if len(ordered) % 2 != 0:
                return ordered[upper]
            lower = (len(ordered) - 1) // 2
            return (ordered[upper] + ordered[lower]) / 2

        set_gpu_limits(clocks, power)

        num_rounds = 10
        shuffled = list(original_setups)
        flops_by_setup = defaultdict(list)
        for _ in range(num_rounds):
            random.shuffle(shuffled)
            for label, factory in shuffled:
                operands = get_tensors(factory)
                flops_by_setup[label].append(get_flops(*operands))

        # Only the 'randn' setup is reported to the caller.
        return median(flops_by_setup['randn'])

    # Sweep the power cap from 500 down to 150 in steps of 50. For each cap,
    # take the throughput at the maximum clock as the baseline, then lower the
    # locked clock in steps of 100 until throughput drops below 90% of that
    # baseline. The clock at which that happens is printed and becomes the
    # starting clock for the next (lower) power cap.
    start_clocks = 1980 # H100
    for power in range(500, 100, -50):
        max_clocks = 1980 # H100
        start_flops = get_results(max_clocks, power)
        threshold = start_flops * 0.9
        for clocks in range(start_clocks, 200, -100):
            cur_flops = get_results(clocks, power)
            if cur_flops < threshold:
                print("Done: ", power, clocks)
                start_clocks = clocks
                break
  4. Chillee revised this gist Apr 28, 2024. No changes.
  5. Chillee created this gist Apr 28, 2024.
    56 changes: 56 additions & 0 deletions mm_weird.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,56 @@
    import torch
    torch.set_default_device('cuda')
    from triton.testing import do_bench
    from collections import defaultdict
    from functools import partial
    import random
    random.seed(0)

    def get_flops(A, B):
        """Time ``torch.mm(A, B)`` with ``do_bench`` and return FLOP per second.

        One matmul is counted as ``A.shape[0] * A.shape[1] * B.shape[1] * 2``
        operations; the ``do_bench`` result is treated as milliseconds.
        """
        def run_matmul():
            return torch.mm(A, B)

        millis = do_bench(run_matmul)
        total_flops = A.shape[0] * A.shape[1] * B.shape[1] * 2
        per_second = 1e3 / millis
        return per_second * total_flops

    # Matmul problem dimensions (rows, columns, inner dimension).
    M, N, K = 8192, 8192, 8192
    def get_tensors(f):
        """Return the ``(A, B)`` operand pair produced by tensor factory ``f``.

        ``A`` is ``f(M, K, dtype=torch.bfloat16)``; ``B`` is the transpose of
        ``f(N, K, dtype=torch.bfloat16)``. The factory is called for ``A``
        first.
        """
        make = partial(f, dtype=torch.bfloat16)
        left = make(M, K)
        right = make(N, K).t()
        return left, right

    def one_bit_random(*shape, dtype=torch.bfloat16):
        """Build a tensor from one bit of each random normal sample.

        The samples' storage is viewed as ``int16`` and masked with ``0b1000``,
        leaving integers that are 0 or 8; these are then converted to ``dtype``.

        NOTE(review): viewing as ``int16`` presumably requires a 16-bit
        ``dtype`` -- confirm before using anything but the default.
        """
        bit_mask = 0b1000
        as_int16 = torch.randn(*shape, dtype=dtype).view(torch.int16)
        return (as_int16 & bit_mask).to(dtype=dtype)

    def sparse(*shape, dtype=torch.bfloat16):
        """Return normal samples with every negative entry replaced by zero.

        Args:
            *shape: Dimensions of the tensor.
            dtype: Dtype of the sampled tensor.

        Returns:
            A tensor of the given shape and dtype with no negative values.
        """
        # Use the function's own parameters: the previous body referenced
        # undefined ``args``/``kwargs`` and raised NameError on every call.
        x = torch.randn(*shape, dtype=dtype)
        return torch.where(x < 0, 0, x)

    # Registry of (label, tensor factory) pairs to benchmark. Each factory is
    # called by get_tensors as f(rows, cols, dtype=torch.bfloat16); the label is
    # the key under which samples are stored and later printed.
    original_setups = [
    ("randn", torch.randn),
    # Constant tensor filled with 2.
    ("twos", lambda *shape, dtype: torch.full(shape, fill_value=2, dtype=dtype)),
    ("sparse", sparse),
    ("one bit", one_bit_random),
    ("rand", torch.rand),
    ("zeros", torch.zeros),
    ]
    # Collect ITERS throughput samples per setup. The setup order is reshuffled
    # before every round (presumably so ordering effects average out), and
    # samples are grouped by setup label.
    ITERS = 10
    setups = list(original_setups)
    results = defaultdict(list)
    for _ in range(ITERS):
        random.shuffle(setups)
        for name, f in setups:
            results[name].append(get_flops(*get_tensors(f)))

    def median(x):
        """Return the median of ``x``.

        The values are sorted first. An odd count yields the middle element;
        an even count yields the mean of the two middle elements.
        """
        ordered = sorted(x)
        upper = len(ordered) // 2
        lower = (len(ordered) - 1) // 2
        if lower == upper:
            # Odd count: both indices point at the single middle element.
            return ordered[upper]
        return (ordered[upper] + ordered[lower]) / 2

    # Print the median sample of every setup, in declaration order, divided by
    # 1e12.
    for name, _ in original_setups:
        print(f"{name}: {median(results[name])/1e12}")