neoblizz · March 26, 2022 02:01 · Mar 26, 2022 · Mar 26, 2022 · Mar 26, 2022 · Mar 26, 2022
diff --git a/SoL-SpMM-GEMM.md b/SoL-SpMM-GEMM.md
@@ -61,4 +61,6 @@ Number of Reads/Writes += size of type (nzr) * nzr           // row offsets
 										 
 m, n, k = 1024
 sparsity = 90%, nnz = (m * k) * 10%
+dense = 0.2741 (ms)
+sparse = 0.0274 (ms)
 ```
diff --git a/SoL-SpMM-GEMM.md b/SoL-SpMM-GEMM.md
@@ -1,11 +1,16 @@
-# Hardware Speed-of-light analysis (volta)
+# Hardware Speed-of-Light Analysis
+The following numbers are based on NVIDIA's Volta microarchitecture. To perform a similar analysis for a newer architecture, I recommend changing the numbers below based on `device_query` CUDA sample or [wikipedia](https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications) page.
+
 ```
 CUDA Cores = SM * Cores per SM (SM = 80, Cores/SM = 64)
 Maximum Clock Rate = Clock Rate (KHz) * 1e-6 (GHz)
-Maximum Throughput (type == floats, doubles or half) = CUDA Cores * Maximum Clock Rate * Type Ratio (device properties) (GFLOP/s)
+Maximum Throughput (type == floats, doubles or half) =
+    CUDA Cores * Maximum Clock Rate * Type Ratio (device properties) (GFLOP/s)
 
-Maximum Memory Bandwidth = Memory Clock Rate (KHz) * 1e-6 * Pump Rate * (Bus Width / 8) (GB/s)
-Maximum Memory Bandwidth = Frequency * Pump Rate * (Bus Width / 8) (GB/s)
+Maximum Memory Bandwidth = 
+    Memory Clock Rate (KHz) * 1e-6 * Pump Rate * (Bus Width / 8) (GB/s)
+Maximum Memory Bandwidth = 
+    Frequency * Pump Rate * (Bus Width / 8) (GB/s)
 
 Bus Width = 4096 bit
 Pump Rate = 2
@@ -14,15 +19,17 @@ Maximum Memory Bandwidth = 898 GB/s ~= 900 GB/s
 ```
 
 # Dense Matrix Multiplication
-## Problem Dependent Analysis
-A = {m, k}
-B = {k, n}
-C = {m, n}
+### Problem Dependent Analysis
+
+- A = {m × k}
+- B = {k × n}
+- C = {m × n}
+
 ```
 Number of Operations = 2 (multadd operations) * m * n * k * 1e-9 (GFLOP)
 Number of Reads/Writes = (m * k) + (k * n) + (m * n) * size of type (64/8 doubles or 32/8 floats) * 1e-9 (GByte)
 ```
-## Problem & Hardware Dependent
+### Problem & Hardware Dependent
 ```
 Speed of Light Elapsed Time (s) Compute Bound = 
 	Number of Operations (GFLOP) / Maximum Throughput (GFLOP/s)
@@ -37,11 +44,11 @@ Speed of Light Throuput (GFLOP/s) =
 ```
 
 # Sparse Matrix Multiplication
-## Problem Dependent Analysis (CSR)
-A = {m, k}
-A = {nnz, nzr}
-B = {k, n}
-C = {m, n}
+### Problem Dependent Analysis (CSR)
+
+- A = {m × k} represented as {nnz, nzr}
+- B = {k × n}
+- C = {m × n}
 
 ```
 Number of Operations =  2 (multadd) * nnz * n * 1e-9 (GFLOP)

diff --git a/SoL-SpMM-GEMM.md b/SoL-SpMM-GEMM.md
@@ -1,4 +1,5 @@
 # Hardware Speed-of-light analysis (volta)
+```
 CUDA Cores = SM * Cores per SM (SM = 80, Cores/SM = 64)
 Maximum Clock Rate = Clock Rate (KHz) * 1e-6 (GHz)
 Maximum Throughput (type == floats, doubles or half) = CUDA Cores * Maximum Clock Rate * Type Ratio (device properties) (GFLOP/s)
@@ -10,17 +11,19 @@ Bus Width = 4096 bit
 Pump Rate = 2
 Memory Clock Rate (KHz) = 877 MHz
 Maximum Memory Bandwidth = 898 GB/s ~= 900 GB/s
+```
 
 # Dense Matrix Multiplication
 ## Problem Dependent Analysis
 A = {m, k}
 B = {k, n}
 C = {m, n}
-
+```
 Number of Operations = 2 (multadd operations) * m * n * k * 1e-9 (GFLOP)
 Number of Reads/Writes = (m * k) + (k * n) + (m * n) * size of type (64/8 doubles or 32/8 floats) * 1e-9 (GByte)
-
+```
 ## Problem & Hardware Dependent
+```
 Speed of Light Elapsed Time (s) Compute Bound = 
 	Number of Operations (GFLOP) / Maximum Throughput (GFLOP/s)
 
@@ -31,21 +34,24 @@ Speed of Light Elapsed Time (s) = max(Compute, Memory)
 
 Speed of Light Throuput (GFLOP/s) = 
 	Number of Operations (GFLOP) / Speed of Light Elapsed Time (s)
-
+```
+
 # Sparse Matrix Multiplication
 ## Problem Dependent Analysis (CSR)
 A = {m, k}
 A = {nnz, nzr}
 B = {k, n}
 C = {m, n}
 
-Number of Operations =  2 (mult-add) * nnz * n * 1e-9 (GFLOP)
-Number of Reads/Writes += size of type (nzr) * nzr					 // row offsets
-										   += size of type(k) * nnz						   // column indices
-										   += size of type (values) * nnz			   // values of A
-										   += size of type (values) * (k * n)		 // values of B
-										   += size of type (values) * (m * n) 	 // values of C
-									     *= 1e-9 (GB)
+```
+Number of Operations =  2 (multadd) * nnz * n * 1e-9 (GFLOP)
+Number of Reads/Writes += size of type (nzr) * nzr           // row offsets
+                       += size of type(k) * nnz              // column indices
+                       += size of type (values) * nnz        // values of A
+                       += size of type (values) * (k * n)    // values of B
+                       += size of type (values) * (m * n)    // values of C
+                       *= 1e-9 (GB)
 										 
 m, n, k = 1024
-sparsity = 90%, nnz = (m * k) * 10%
+sparsity = 90%, nnz = (m * k) * 10%
+```
diff --git a/SoL-SpMM-GEMM.md b/SoL-SpMM-GEMM.md
@@ -0,0 +1,51 @@
+# Hardware Speed-of-light analysis (volta)
+CUDA Cores = SM * Cores per SM (SM = 80, Cores/SM = 64)
+Maximum Clock Rate = Clock Rate (KHz) * 1e-6 (GHz)
+Maximum Throughput (type == floats, doubles or half) = CUDA Cores * Maximum Clock Rate * Type Ratio (device properties) (GFLOP/s)
+
+Maximum Memory Bandwidth = Memory Clock Rate (KHz) * 1e-6 * Pump Rate * (Bus Width / 8) (GB/s)
+Maximum Memory Bandwidth = Frequency * Pump Rate * (Bus Width / 8) (GB/s)
+
+Bus Width = 4096 bit
+Pump Rate = 2
+Memory Clock Rate (KHz) = 877 MHz
+Maximum Memory Bandwidth = 898 GB/s ~= 900 GB/s
+
+# Dense Matrix Multiplication
+## Problem Dependent Analysis
+A = {m, k}
+B = {k, n}
+C = {m, n}
+
+Number of Operations = 2 (multadd operations) * m * n * k * 1e-9 (GFLOP)
+Number of Reads/Writes = (m * k) + (k * n) + (m * n) * size of type (64/8 doubles or 32/8 floats) * 1e-9 (GByte)
+
+## Problem & Hardware Dependent
+Speed of Light Elapsed Time (s) Compute Bound = 
+	Number of Operations (GFLOP) / Maximum Throughput (GFLOP/s)
+
+Speed of Light Elapsed Time (s) Memory Bound = 
+	Number of Reads/Writes (GB) /  Maximum Memory Bandwidth (GB/s)
+
+Speed of Light Elapsed Time (s) = max(Compute, Memory)
+
+Speed of Light Throuput (GFLOP/s) = 
+	Number of Operations (GFLOP) / Speed of Light Elapsed Time (s)
+
+# Sparse Matrix Multiplication
+## Problem Dependent Analysis (CSR)
+A = {m, k}
+A = {nnz, nzr}
+B = {k, n}
+C = {m, n}
+
+Number of Operations =  2 (mult-add) * nnz * n * 1e-9 (GFLOP)
+Number of Reads/Writes += size of type (nzr) * nzr					 // row offsets
+										   += size of type(k) * nnz						   // column indices
+										   += size of type (values) * nnz			   // values of A
+										   += size of type (values) * (k * n)		 // values of B
+										   += size of type (values) * (m * n) 	 // values of C
+									     *= 1e-9 (GB)
+
+m, n, k = 1024
+sparsity = 90%, nnz = (m * k) * 10%