Skip to content

Instantly share code, notes, and snippets.

@cli99
Last active September 20, 2024 06:08
Show Gist options
  • Save cli99/ceaacf96c811f9189fc88aa258abd2a5 to your computer and use it in GitHub Desktop.

Revisions

  1. cli99 revised this gist Sep 20, 2024. 1 changed file with 8 additions and 1 deletion.
    9 changes: 8 additions & 1 deletion test_timing.py
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,7 @@
    import torch

    # 400000000B/1000000 = 400 MB
    a = torch.randn(10000, 10000, device="cuda")
    a = torch.randn(1000, 1000, device="cuda")

    torch.softmax(a, dim=1)
    torch.cuda.synchronize()
    @@ -103,3 +103,10 @@ def flush_cache():
    # perf_counter_ns Time: 958.4630 us
    # cuda.Event Time: 953.4228 us
    # cuda.Event list Time: 952.6513 us

    # a = torch.randn(1000, 1000, device="cuda") with flush_cache and torch.cuda._sleep
    # perf_counter no sync Time: 4.5707 us
    # perf_counter Time: 11.7443 us
    # perf_counter_ns Time: 11.7657 us
    # cuda.Event Time: 13.3076 us
    # cuda.Event list Time: 5.8498 us
  2. cli99 revised this gist Sep 20, 2024. 1 changed file with 23 additions and 1 deletion.
    24 changes: 23 additions & 1 deletion test_timing.py
    Original file line number Diff line number Diff line change
    @@ -4,11 +4,17 @@

    import torch

    # 400000000B/1000000 = 400 MB
    a = torch.randn(10000, 10000, device="cuda")

    torch.softmax(a, dim=1)
    torch.cuda.synchronize()


    def flush_cache():
    a.zero_()


    times = []
    for i in range(1000):
    t0 = time.perf_counter()
    @@ -23,6 +29,8 @@

    times = []
    for i in range(1000):
    flush_cache()
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    torch.softmax(a, dim=1)
    torch.cuda.synchronize()
    @@ -33,9 +41,12 @@

    torch.softmax(a, dim=1)
    torch.cuda.synchronize()
    a.zero_()

    times = []
    for i in range(1000):
    flush_cache()
    torch.cuda.synchronize()
    t0 = time.perf_counter_ns()
    torch.softmax(a, dim=1)
    torch.cuda.synchronize()
    @@ -51,6 +62,7 @@
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for i in range(1000):
    flush_cache()
    start.record()
    torch.softmax(a, dim=1)
    end.record()
    @@ -61,10 +73,14 @@


    torch.softmax(a, dim=1)
    torch.cuda.synchronize()
    a.zero_()

    starts = [torch.cuda.Event(enable_timing=True) for _ in range(1000)]
    ends = [torch.cuda.Event(enable_timing=True) for _ in range(1000)]
    for i in range(1000):
    flush_cache()
    torch.cuda._sleep(1_000_000)
    starts[i].record()
    torch.softmax(a, dim=1)
    ends[i].record()
    @@ -74,10 +90,16 @@

    print(f"cuda.Event list Time: {sum(times):.4f} us")


    # without flush_cache and without torch.cuda._sleep
    # perf_counter no sync Time: 4.2106 us
    # perf_counter Time: 950.8353 us
    # perf_counter_ns Time: 950.6415 us
    # cuda.Event Time: 948.8796 us
    # cuda.Event list Time: 945.8083 us

    # with flush_cache and torch.cuda._sleep
    # perf_counter no sync Time: 4.2853 us
    # perf_counter Time: 958.5552 us
    # perf_counter_ns Time: 958.4630 us
    # cuda.Event Time: 953.4228 us
    # cuda.Event list Time: 952.6513 us
  3. cli99 revised this gist Sep 20, 2024. 1 changed file with 6 additions and 4 deletions.
    10 changes: 6 additions & 4 deletions test_timing.py
    Original file line number Diff line number Diff line change
    @@ -75,7 +75,9 @@
    print(f"cuda.Event list Time: {sum(times):.4f} us")


    # perf_counter Time: 11.4154 us
    # perf_counter_ns Time: 11.2727 us
    # cuda.Event list Time: 13.3122 us
    # cuda.Event Time: 12.3625 us
    # perf_counter no sync Time: 4.2106 us
    # perf_counter Time: 950.8353 us
    # perf_counter_ns Time: 950.6415 us
    # cuda.Event Time: 948.8796 us
    # cuda.Event list Time: 945.8083 us

  4. cli99 revised this gist Sep 20, 2024. 1 changed file with 18 additions and 1 deletion.
    19 changes: 18 additions & 1 deletion test_timing.py
    Original file line number Diff line number Diff line change
    @@ -1,10 +1,25 @@
    # speechmatics.com/company/articles-and-news/timing-operations-in-pytorch

    import time

    import torch

    a = torch.randn(1000, 1000, device="cuda")
    a = torch.randn(10000, 10000, device="cuda")

    torch.softmax(a, dim=1)
    torch.cuda.synchronize()

    times = []
    for i in range(1000):
    t0 = time.perf_counter()
    torch.softmax(a, dim=1)
    t1 = time.perf_counter()
    times.append(t1 - t0)

    print(f"perf_counter no sync Time: {1000*sum(times):.4f} us")

    torch.softmax(a, dim=1)
    torch.cuda.synchronize()

    times = []
    for i in range(1000):
    @@ -16,6 +31,7 @@

    print(f"perf_counter Time: {1000*sum(times):.4f} us")

    torch.softmax(a, dim=1)
    torch.cuda.synchronize()

    times = []
    @@ -29,6 +45,7 @@
    print(f"perf_counter_ns Time: {sum(times)/1000/1000:.4f} us")

    torch.softmax(a, dim=1)
    torch.cuda.synchronize()

    times = []
    start = torch.cuda.Event(enable_timing=True)
  5. cli99 created this gist Sep 20, 2024.
    64 changes: 64 additions & 0 deletions test_timing.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,64 @@
    import time

    import torch

    a = torch.randn(1000, 1000, device="cuda")

    torch.softmax(a, dim=1)

    times = []
    for i in range(1000):
    t0 = time.perf_counter()
    torch.softmax(a, dim=1)
    torch.cuda.synchronize()
    t1 = time.perf_counter()
    times.append(t1 - t0)

    print(f"perf_counter Time: {1000*sum(times):.4f} us")

    torch.cuda.synchronize()

    times = []
    for i in range(1000):
    t0 = time.perf_counter_ns()
    torch.softmax(a, dim=1)
    torch.cuda.synchronize()
    t1 = time.perf_counter_ns()
    times.append(t1 - t0)

    print(f"perf_counter_ns Time: {sum(times)/1000/1000:.4f} us")

    torch.softmax(a, dim=1)

    times = []
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for i in range(1000):
    start.record()
    torch.softmax(a, dim=1)
    end.record()
    torch.cuda.synchronize()
    times.append(start.elapsed_time(end))

    print(f"cuda.Event Time: {sum(times):.4f} us")


    torch.softmax(a, dim=1)

    starts = [torch.cuda.Event(enable_timing=True) for _ in range(1000)]
    ends = [torch.cuda.Event(enable_timing=True) for _ in range(1000)]
    for i in range(1000):
    starts[i].record()
    torch.softmax(a, dim=1)
    ends[i].record()

    torch.cuda.synchronize()
    times = [starts[i].elapsed_time(ends[i]) for i in range(1000)]

    print(f"cuda.Event list Time: {sum(times):.4f} us")


    # perf_counter Time: 11.4154 us
    # perf_counter_ns Time: 11.2727 us
    # cuda.Event list Time: 13.3122 us
    # cuda.Event Time: 12.3625 us