@ZenanH
Last active May 8, 2024 16:37
Register Pressure
using KernelAbstractions
using CUDA
using BenchmarkTools

# uGIMP basis function: returns the weight N and its derivative dN for a
# particle-node distance Δx, cell spacing h, and particle size lp.
@inline function uGIMPbasis(Δx::T2, h::T2, lp::T2) where T2
    T1 = T2 == Float32 ? Int32 : Int64   # integer type matching the float precision
    absΔx = abs(Δx); signΔx = sign(Δx)
    c1 = absΔx<(T2(0.5)*lp)
    c2 = (T2(0.5)*lp)≤absΔx<(h-T2(0.5)*lp)
    c3 = (h-T2(0.5)*lp)≤absΔx<(h+T2(0.5)*lp)
    Ni1 = T2(1.0)-((T2(4)*(Δx^T1(2))+lp^T1(2))/(T2(4)*h*lp))
    Ni2 = T2(1.0)-(absΔx/h)
    Ni3 = ((h+T2(0.5)*lp-absΔx)^T1(2))/(T2(2.0)*h*lp)
    dN1 = -((T2(8.0)*Δx)/(T2(4)*h*lp))
    dN2 = signΔx*(-T2(1.0)/h)
    dN3 = -signΔx*((h+T2(0.5)*lp-absΔx)/(h*lp))
    # branch-free blend of the three cases
    N  = c1*Ni1+c2*Ni2+c3*Ni3
    dN = c1*dN1+c2*dN2+c3*dN3
    return T2(N), T2(dN)
end

# compute and store the basis functions of all 64 neighbouring nodes per particle
@kernel inbounds = true function test1!(mp_p2n, mp_Ni, mp_∂Nx, mp_∂Ny, mp_∂Nz, mp_pos, grid_pos)
    ix = @index(Global)
    for iy in Int32(1):Int(64)
        p2n = mp_p2n[ix, iy]
        Nx, dNx = uGIMPbasis(mp_pos[ix, 1]-grid_pos[p2n, 1], Float64(2.0), Float64(1.0))
        Ny, dNy = uGIMPbasis(mp_pos[ix, 2]-grid_pos[p2n, 2], Float64(3.0), Float64(2.0))
        Nz, dNz = uGIMPbasis(mp_pos[ix, 3]-grid_pos[p2n, 3], Float64(4.0), Float64(3.0))
        mp_Ni[ix, iy] = Nx * Ny * Nz
        mp_∂Nx[ix, iy] = dNx * Ny * Nz
        mp_∂Ny[ix, iy] = dNy * Nx * Nz
        mp_∂Nz[ix, iy] = dNz * Nx * Ny
    end
end

# test data: 512000 particles, 64 neighbouring nodes each, 710733 grid nodes
mp_p2n   = cu(rand(1:1:710733, 512000, 64) .|> Int64)
mp_Ni    = CUDA.zeros(Float64, 512000, 64)
mp_∂Nx   = CUDA.zeros(Float64, 512000, 64)
mp_∂Ny   = CUDA.zeros(Float64, 512000, 64)
mp_∂Nz   = CUDA.zeros(Float64, 512000, 64)
mp_pos   = CUDA.rand(Float64, 512000, 3)
grid_pos = CUDA.rand(Float64, 710733, 3)

# warm-up launch, then benchmark (time in ms)
test1!(CUDABackend())(ndrange=512000, mp_p2n, mp_Ni, mp_∂Nx, mp_∂Ny, mp_∂Nz, mp_pos, grid_pos)
time = 1e3 * @belapsed begin
    test1!($CUDABackend())(ndrange=$512000, $mp_p2n, $mp_Ni, $mp_∂Nx, $mp_∂Ny, $mp_∂Nz, $mp_pos, $grid_pos)
    CUDA.synchronize()
end

ZenanH commented Mar 6, 2024

On the RTX 3090, this code takes around 20 ms. However, if we add one line like this:

@kernel inbounds = true function test1!(mp_p2n, mp_Ni, mp_∂Nx, mp_∂Ny, mp_∂Nz, mp_pos, grid_pos)
    ix = @index(Global)
    for iy in Int32(1):Int(64)
        p2n = mp_p2n[ix, iy]
        Nx, dNx = uGIMPbasis(mp_pos[ix, 1]-grid_pos[p2n, 1], Float64(2.0), Float64(1.0))
        Ny, dNy = uGIMPbasis(mp_pos[ix, 2]-grid_pos[p2n, 2], Float64(3.0), Float64(2.0))
        Nz, dNz = uGIMPbasis(mp_pos[ix, 3]-grid_pos[p2n, 3], Float64(4.0), Float64(3.0))
        Nx = dNx = Ny = dNy = Nz = dNz = Float64(2)
        mp_Ni[ix, iy] = Nx * Ny * Nz
        mp_∂Nx[ix, iy] = dNx * Ny * Nz
        mp_∂Ny[ix, iy] = dNy * Nx * Nz
        mp_∂Nz[ix, iy] = dNz * Nx * Ny
    end
end

then it only takes 1.24 ms


ZenanH commented Mar 6, 2024

It happens with both FP32 and FP64.


ZenanH commented Mar 6, 2024

On the RTX 3090:
[figure: 3d_collapse_benchmark]


ZenanH commented Mar 6, 2024

V100 is better:
[figure: profile_vis]


ZenanH commented Mar 12, 2024

😅 I can't tell from code_llvm whether the compiler skips the function invocations: the left one has Nx = dNx = Ny = dNy = Nz = dNz = Float64(2), the right one doesn't.

[figure: 2024-03-12_165529 (code_llvm comparison)]
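(As a side note, a sketch of how the device code could be dumped directly from Julia, assuming CUDA.jl's @device_code_* reflection macros intercept the KernelAbstractions launch:)

using CUDA

# dump the device-side LLVM IR for every kernel compiled inside the block
# (CUDA.@device_code_ptx works the same way for the PTX)
CUDA.@device_code_llvm begin
    test1!(CUDABackend())(mp_p2n, mp_Ni, mp_∂Nx, mp_∂Ny, mp_∂Nz, mp_pos, grid_pos; ndrange=512000)
end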


But if I save the results into an array (test) like this:

@kernel inbounds = true function test1!(mp_p2n, mp_Ni, mp_∂Nx, mp_∂Ny, mp_∂Nz, mp_pos, grid_pos, test)
    ix = @index(Global)
    for iy in Int32(1):Int(64)
        p2n = mp_p2n[ix, iy]
        Nx, dNx = uGIMPbasis(mp_pos[ix, 1]-grid_pos[p2n, 1], Float64(2.0), Float64(1.0))
        Ny, dNy = uGIMPbasis(mp_pos[ix, 2]-grid_pos[p2n, 2], Float64(3.0), Float64(2.0))
        Nz, dNz = uGIMPbasis(mp_pos[ix, 3]-grid_pos[p2n, 3], Float64(4.0), Float64(3.0))

        # save the results
        test[ix] = Nx * Ny * Nz + dNx + dNy + dNz
        

        Nx = dNx = Ny = dNy = Nz = dNz = Float64(2)
        mp_Ni[ix, iy] = Nx * Ny * Nz
        mp_∂Nx[ix, iy] = dNx * Ny * Nz
        mp_∂Ny[ix, iy] = dNy * Nx * Nz
        mp_∂Nz[ix, iy] = dNz * Nx * Ny
    end
end

Then, it takes 20 ms.

That means, as you said, it simply didn't call the function uGIMPbasis().

With the dataset I used above, the kernel uses around 83 registers.
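(For reference, a minimal sketch of how the per-thread register count and an occupancy-based launch suggestion can be read out with plain CUDA.jl, using a toy kernel rather than the KernelAbstractions one above:)

using CUDA

# toy kernel, only to show the introspection calls
function saxpy!(y, x, a)
    i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    if i <= length(y)
        @inbounds y[i] = a * x[i] + y[i]
    end
    return nothing
end

y = CUDA.rand(Float32, 1024); x = CUDA.rand(Float32, 1024)
k = @cuda launch=false saxpy!(y, x, 1.0f0)
CUDA.registers(k)                  # registers per thread
CUDA.launch_configuration(k.fun)   # suggested block/grid size given register pressure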


ZenanH commented Mar 12, 2024

I changed uGIMPbasis() as follows:

@inline Base.@propagate_inbounds function uGIMPbasis1(Δx::T2, h::T2, lp::T2) where T2
    T1 = T2 == Float32 ? Int32 : Int64   # integer type matching the float precision
    absΔx = abs(Δx); signΔx = sign(Δx)
    if absΔx<(T2(0.5)*lp)
        N = T2(1.0)-((T2(4)*(Δx^T1(2))+lp^T1(2))/(T2(4)*h*lp))
        dN = -((T2(8.0)*Δx)/(T2(4)*h*lp))
    elseif (T2(0.5)*lp)≤absΔx<(h-T2(0.5)*lp)
        N = T2(1.0)-(absΔx/h)
        dN = signΔx*(-T2(1.0)/h)
    elseif (h-T2(0.5)*lp)≤absΔx<(h+T2(0.5)*lp)
        N = ((h+T2(0.5)*lp-absΔx)^T1(2))/(T2(2.0)*h*lp)
        dN = -signΔx*((h+T2(0.5)*lp-absΔx)/(h*lp))
    else
        N = T2(0.0)
        dN = T2(0.0)
    end
    return T2(N), T2(dN)
end

Now it takes 10 ms, half the time of the previous version, and it uses 74 registers.

I will keep trying to reduce the number of registers😎. @luraess


luraess commented Mar 12, 2024

I guess some of the time now goes into the branches. But cool that the timing improves!


ZenanH commented Apr 29, 2024

Hi @utkinis 👋! Thanks for the discussion during EGU.

If I understand correctly, two things are worth doing to improve the efficiency of my kernel function (which computes and saves the basis functions):

  1. transpose the arrays Ni, ∂Nx, ∂Ny, ∂Nz
  2. unroll the for loop

For point 1 ❓

I did a simple test below; however, it is slower than the original version. I tried the same method in my code, and it is also slower there.
By the way, I also tried a 2D block for the CUDA launch configuration; it is much slower than a 1D block.

using KernelAbstractions
using CUDA
using BenchmarkTools
using Printf

# without transpose
@kernel inbounds=true function kernel1!(a, b, c)
    ix = @index(Global)
    for iy in 1:64
        a[ix, iy] = b[ix, iy] + c[ix, iy] - a[ix, iy] ^ 2
        b[ix, iy] = a[ix, iy] + c[ix, iy] - b[ix, iy] ^ 2
        c[ix, iy] = a[ix, iy] + b[ix, iy] - c[ix, iy] ^ 2
    end
end

@kernel inbounds=true function kernel2!(a, b, c)
    ix = @index(Global)
    for iy in 1:64
        a[iy, ix] = b[iy, ix] + c[iy, ix] - a[iy, ix] ^ 2
        b[iy, ix] = a[iy, ix] + c[iy, ix] - b[iy, ix] ^ 2
        c[iy, ix] = a[iy, ix] + b[iy, ix] - c[iy, ix] ^ 2
    end
end

n = 512000
a = CUDA.rand(Float64, n, 64)
b = CUDA.rand(Float64, n, 64)
c = CUDA.rand(Float64, n, 64)
d = CUDA.rand(Float64, 64, n)
e = CUDA.rand(Float64, 64, n)
f = CUDA.rand(Float64, 64, n)
dev = CUDABackend()

rst1 = @belapsed begin 
    kernel1!($dev)(ndrange=$n, $a, $b, $c)
    KernelAbstractions.synchronize($dev)
end

rst2 = @belapsed begin 
    kernel2!($dev)(ndrange=$n, $d, $e, $f)
    KernelAbstractions.synchronize($dev)
end

@info """\t
rst1: $(@sprintf("%.2e ms", rst1*1e3))
rst2: $(@sprintf("%.2e ms", rst2*1e3))
"""

output:

┌ Info: 
│ rst1: 2.03e+00 ms
└ rst2: 2.93e+01 ms

For point 2

Unrolling the for loop with Base.Cartesian.@nexprs did reduce the computing time, by around 20%. 🎉
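(A minimal sketch of what that unrolling could look like; my own illustration, assuming the same fixed 64 neighbours and the hypothetical name test1_unrolled!, so the actual kernel may differ:)

@kernel inbounds = true function test1_unrolled!(mp_p2n, mp_Ni, mp_∂Nx, mp_∂Ny, mp_∂Nz, mp_pos, grid_pos)
    ix = @index(Global)
    # @nexprs pastes the body 64 times with iy replaced by the literals 1..64,
    # so no loop counter has to be kept around
    Base.Cartesian.@nexprs 64 iy -> begin
        p2n = mp_p2n[ix, iy]
        Nx, dNx = uGIMPbasis(mp_pos[ix, 1]-grid_pos[p2n, 1], Float64(2.0), Float64(1.0))
        Ny, dNy = uGIMPbasis(mp_pos[ix, 2]-grid_pos[p2n, 2], Float64(3.0), Float64(2.0))
        Nz, dNz = uGIMPbasis(mp_pos[ix, 3]-grid_pos[p2n, 3], Float64(4.0), Float64(3.0))
        mp_Ni[ix, iy] = Nx * Ny * Nz
        mp_∂Nx[ix, iy] = dNx * Ny * Nz
        mp_∂Ny[ix, iy] = dNy * Nx * Nz
        mp_∂Nz[ix, iy] = dNz * Nx * Ny
    end
end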


utkinis commented May 6, 2024

Hi @ZenanH! Thanks for the follow-up, cool that you could make progress with the implementation and get a speedup with loop unrolling =)

As for point 1, I guess it makes sense to figure out exactly what happens here. I've made a version of your example with some extra kernels:

using KernelAbstractions
using CUDA
using BenchmarkTools
using Printf

# without transpose
@kernel inbounds = true function kernel1!(a, b, c)
    ix = @index(Global, Linear)
    for iy in 1:64
        a[ix, iy] = b[ix, iy] + c[ix, iy] - a[ix, iy]^2
        b[ix, iy] = a[ix, iy] + c[ix, iy] - b[ix, iy]^2
        c[ix, iy] = a[ix, iy] + b[ix, iy] - c[ix, iy]^2
    end
end

@kernel inbounds = true function kernel2!(a, b, c)
    ix = @index(Global, Linear)
    for iy in 1:64
        a[iy, ix] = b[iy, ix] + c[iy, ix] - a[iy, ix]^2
        b[iy, ix] = a[iy, ix] + c[iy, ix] - b[iy, ix]^2
        c[iy, ix] = a[iy, ix] + b[iy, ix] - c[iy, ix]^2
    end
end

# with extra parallelism
# without transpose
@kernel inbounds = true function kernel3!(a, b, c)
    ix = @index(Group, Linear)
    iy = @index(Local, Linear)
    a[ix, iy] = b[ix, iy] + c[ix, iy] - a[ix, iy]^2
    b[ix, iy] = a[ix, iy] + c[ix, iy] - b[ix, iy]^2
    c[ix, iy] = a[ix, iy] + b[ix, iy] - c[ix, iy]^2
end

@kernel inbounds = true function kernel4!(a, b, c)
    ix = @index(Group, Linear)
    iy = @index(Local, Linear)
    a[iy, ix] = b[iy, ix] + c[iy, ix] - a[iy, ix]^2
    b[iy, ix] = a[iy, ix] + c[iy, ix] - b[iy, ix]^2
    c[iy, ix] = a[iy, ix] + b[iy, ix] - c[iy, ix]^2
end

# grid-stride version (or in this case, rather block-stride)
@kernel inbounds = true function kernel5!(a, b, c)
    ix = @index(Group, Linear)
    tx = @index(Local, Linear)
    stride = prod(@groupsize())
    for iy in tx:stride:64
        a[iy, ix] = b[iy, ix] + c[iy, ix] - a[iy, ix]^2
        b[iy, ix] = a[iy, ix] + c[iy, ix] - b[iy, ix]^2
        c[iy, ix] = a[iy, ix] + b[iy, ix] - c[iy, ix]^2
    end
end

n = 512000
a = CUDA.rand(Float64, n, 64)
b = CUDA.rand(Float64, n, 64)
c = CUDA.rand(Float64, n, 64)
d = CUDA.rand(Float64, 64, n)
e = CUDA.rand(Float64, 64, n)
f = CUDA.rand(Float64, 64, n)
dev = CUDABackend()

rst1 = @belapsed begin
    kernel1!($dev)($a, $b, $c; ndrange=$n)
    KernelAbstractions.synchronize($dev)
end

rst2 = @belapsed begin
    kernel2!($dev)($d, $e, $f; ndrange=$n)
    KernelAbstractions.synchronize($dev)
end

# group size is equal to number of elements in the first dimension
wg = 64

# worksize is total number of elements in the array
ws = wg * n

rst3 = @belapsed begin
    kernel3!($dev, $wg)($a, $b, $c; ndrange=$ws)
    KernelAbstractions.synchronize($dev)
end

rst4 = @belapsed begin
    kernel4!($dev, $wg)($d, $e, $f; ndrange=$ws)
    KernelAbstractions.synchronize($dev)
end

# groupsize is defined by ourselves (a bit more flexibility)
wg = 32

# worksize is defined by block size and number of elements
ws = wg * n

rst5 = @belapsed begin
    kernel5!($dev, $wg)($d, $e, $f; ndrange=$ws)
    KernelAbstractions.synchronize($dev)
end

@info """\t
rst1 (   loop,  no transpose): $(@sprintf("%.2e ms", rst1*1e3))
rst2 (   loop,     transpose): $(@sprintf("%.2e ms", rst2*1e3))
rst3 (no loop,  no transpose): $(@sprintf("%.2e ms", rst3*1e3))
rst4 (no loop,     transpose): $(@sprintf("%.2e ms", rst4*1e3))
rst5 (grid-stride, transpose): $(@sprintf("%.2e ms", rst5*1e3))
"""

which gives the following output on an NVIDIA A100 GPU:

┌ Info: 
│ rst1 (   loop,  no transpose): 1.22e+00 ms
│ rst2 (   loop,     transpose): 1.64e+01 ms
│ rst3 (no loop,  no transpose): 1.33e+01 ms
│ rst4 (no loop,     transpose): 1.18e+00 ms
└ rst5 (grid-stride, transpose): 1.18e+00 ms

Here's what happens in my opinion (ideally, one needs to profile the code with Nsight, which I didn't do, so take it with a grain of salt):

For optimal performance on GPUs it is crucial that global memory accesses are coalesced (see here for more info). To achieve that, we need to make sure that consecutive threads (or work items in AMD/KernelAbstractions jargon) access consecutive array indices (there's more to it, but I'm not really an expert in GPU optimisation and not the best person to explain these things). What happens in the transposed version of your code, i.e. kernel2!, is that, e.g., on the first iteration of the inner loop, when iy = 1, thread 1 accesses a[1,1], thread 2 accesses a[1,2], etc. Since Julia is column-major, that means that neighbouring threads access array elements that are 64 elements apart from each other. This is clearly not optimal for GPU access.
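(A tiny sketch of the column-major arithmetic behind that claim; linidx is just a helper for this illustration:)

# element (i, j) of an m-by-n column-major array sits at linear offset (j - 1) * m + i
linidx(i, j, m) = (j - 1) * m + i

# transposed layout (64 x n): thread 1 reads a[1, 1], thread 2 reads a[1, 2]
linidx(1, 1, 64), linidx(1, 2, 64)           # (1, 65): 64 elements apart, uncoalesced
# original layout (n x 64): thread 1 reads a[1, 1], thread 2 reads a[2, 1]
linidx(1, 1, 512000), linidx(2, 1, 512000)   # (1, 2): adjacent, coalesced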

Why, then, is the first version (kernel1!) faster? I think KernelAbstractions picks a block size (same as group size) that is proportional to the inner loop size, i.e., 64. Usually the preferred block size is proportional to the warp size, which is 32 on all NVIDIA devices. If the picked block size is 64, then thread 1 accesses a[1,1] and thread 2 accesses a[2,1], which is the optimal access pattern. I didn't think of it when we had this discussion at EGU, sorry =)

Now, one solution would be to always fix the number of threads per block to 64 and use kernel1!, which is perfectly fine in your case. However, if the size of the inner loop is not 64 but something else, you might experience sudden performance issues on some architectures. Another possibility is to manually control the group size and enforce the correct memory access pattern. In kernel3! and kernel4! I do exactly this by launching n groups of 64 threads each. Then I use the local thread index instead of the inner loop. As you can see from the benchmark, we now have exactly the opposite situation, with the transposed matrices being faster, and a bit faster than the old version as well.
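(For reference, a sketch of the first option, fixing the group size when instantiating the KernelAbstractions kernel:)

# fix the workgroup size to 64 threads when instantiating the kernel
kernel1_fixed = kernel1!(CUDABackend(), 64)
kernel1_fixed(a, b, c; ndrange=n)
KernelAbstractions.synchronize(CUDABackend())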

Finally, to decouple the work size from the inner loop size, we can use something similar to the "grid-stride loop" (see here for more info). In kernel5! I apply this only to the first dimension of the array; however, it can be used for both dimensions, and then if for some reason you have more or fewer neighbour weights than 64, you can change that independently of the block size.
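(A sketch of that idea, with the number of weights passed in as a hypothetical nw argument instead of the hard-coded 64:)

@kernel inbounds = true function kernel6!(a, b, c, nw)
    ix = @index(Group, Linear)
    tx = @index(Local, Linear)
    stride = prod(@groupsize())
    # block-stride loop over the first (contiguous) dimension of length nw
    for iy in tx:stride:nw
        a[iy, ix] = b[iy, ix] + c[iy, ix] - a[iy, ix]^2
        b[iy, ix] = a[iy, ix] + c[iy, ix] - b[iy, ix]^2
        c[iy, ix] = a[iy, ix] + b[iy, ix] - c[iy, ix]^2
    end
end

# e.g. 32 threads per group, one group per column:
# kernel6!(dev, 32)(d, e, f, 64; ndrange=32*n)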
