# speechmatics.com/company/articles-and-news/timing-operations-in-pytorch import time import torch # 400000000B/1000000 = 400 MB a = torch.randn(10000, 10000, device="cuda") torch.softmax(a, dim=1) torch.cuda.synchronize() def flush_cache(): a.zero_() times = [] for i in range(1000): t0 = time.perf_counter() torch.softmax(a, dim=1) t1 = time.perf_counter() times.append(t1 - t0) print(f"perf_counter no sync Time: {1000*sum(times):.4f} us") torch.softmax(a, dim=1) torch.cuda.synchronize() times = [] for i in range(1000): flush_cache() torch.cuda.synchronize() t0 = time.perf_counter() torch.softmax(a, dim=1) torch.cuda.synchronize() t1 = time.perf_counter() times.append(t1 - t0) print(f"perf_counter Time: {1000*sum(times):.4f} us") torch.softmax(a, dim=1) torch.cuda.synchronize() a.zero_() times = [] for i in range(1000): flush_cache() torch.cuda.synchronize() t0 = time.perf_counter_ns() torch.softmax(a, dim=1) torch.cuda.synchronize() t1 = time.perf_counter_ns() times.append(t1 - t0) print(f"perf_counter_ns Time: {sum(times)/1000/1000:.4f} us") torch.softmax(a, dim=1) torch.cuda.synchronize() times = [] start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) for i in range(1000): flush_cache() start.record() torch.softmax(a, dim=1) end.record() torch.cuda.synchronize() times.append(start.elapsed_time(end)) print(f"cuda.Event Time: {sum(times):.4f} us") torch.softmax(a, dim=1) torch.cuda.synchronize() a.zero_() starts = [torch.cuda.Event(enable_timing=True) for _ in range(1000)] ends = [torch.cuda.Event(enable_timing=True) for _ in range(1000)] for i in range(1000): flush_cache() torch.cuda._sleep(1_000_000) starts[i].record() torch.softmax(a, dim=1) ends[i].record() torch.cuda.synchronize() times = [starts[i].elapsed_time(ends[i]) for i in range(1000)] print(f"cuda.Event list Time: {sum(times):.4f} us") # without flush_cache and without torch.cuda._sleep # perf_counter no sync Time: 4.2106 us # perf_counter Time: 950.8353 us # perf_counter_ns Time: 950.6415 us # cuda.Event Time: 948.8796 us # cuda.Event list Time: 945.8083 us # with flush_cache and torch.cuda._sleep # perf_counter no sync Time: 4.2853 us # perf_counter Time: 958.5552 us # perf_counter_ns Time: 958.4630 us # cuda.Event Time: 953.4228 us # cuda.Event list Time: 952.6513 us