diff options
Diffstat (limited to 'examples/gpu/cuda_vector_add.zc')
| -rw-r--r-- | examples/gpu/cuda_vector_add.zc | 73 |
1 files changed, 73 insertions, 0 deletions
// Source: examples/gpu/cuda_vector_add.zc (new file, blob de75a74)
//
// Element-wise vector addition on the GPU: c[i] = a[i] + b[i].
// The host fills two input vectors, copies them to the device, launches
// add_kernel over them, copies the result back and verifies it on the host.

// Compile with: zc run cuda_vector_add.zc --cuda

//> cflags: -arch=sm_75

import "std/cuda.zc"
import "std/mem.zc"

// Kernel: each thread handles at most one element.
//
//   a, b : input vectors (main passes device buffers from cuda_alloc)
//   c    : output vector, c[i] = a[i] + b[i]
//   n    : number of elements in each vector
//
// NOTE(review): thread_id() comes from std/cuda.zc. main launches several
// blocks, so this presumably returns the global (grid-wide) thread index
// rather than the index within a block -- confirm against std/cuda.zc.
@global
fn add_kernel(a: float*, b: float*, c: float*, n: int) {
    var i = thread_id();
    // Guard: threads whose index is >= n must not touch memory.
    if i < n {
        c[i] = a[i] + b[i];
    }
}

fn main() {
    const N = 1024;

    "=> Zen C CUDA Vector Addition";
    "-> Vector size: {N} elements";

    // Host-side buffers: two inputs and one output.
    var h_a = alloc_n<float>(N);
    var h_b = alloc_n<float>(N);
    var h_c = alloc_n<float>(N);
    defer free(h_a);
    defer free(h_b);
    defer free(h_c);

    // Inputs are small whole numbers (a[i] = i, b[i] = 2i), so every sum
    // is at most 3069 and exactly representable as a float.
    for i in 0..N {
        h_a[i] = (float)i;
        h_b[i] = (float)(i * 2);
    }

    "-> Allocating device memory...";
    var d_a = cuda_alloc<float>(N);
    var d_b = cuda_alloc<float>(N);
    var d_c = cuda_alloc<float>(N);
    defer cuda_free(d_a);
    defer cuda_free(d_b);
    defer cuda_free(d_c);

    // Only the inputs need uploading; d_c is written by the kernel.
    cuda_copy_to_device(d_a, h_a, N * sizeof(float));
    cuda_copy_to_device(d_b, h_b, N * sizeof(float));

    const BLOCK_SIZE = 256;
    // Ceiling division so that num_blocks * BLOCK_SIZE >= N.
    var num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    "-> Launching: {num_blocks} blocks x {BLOCK_SIZE} threads";

    launch add_kernel(d_a, d_b, d_c, N) with {
        grid: num_blocks,
        block: BLOCK_SIZE
    };

    cuda_sync();

    cuda_copy_to_host(h_c, d_c, N * sizeof(float));

    "-> Verifying...";
    // Compare every element against the host-computed sum. Checking only a
    // leading sample would leave the output of all but the first few
    // threads unverified. Exact equality is fine here because the expected
    // sums are small whole numbers (see the initialisation loop above).
    var ok: int = 1;
    var mismatches: int = 0;
    for i in 0..N {
        var expected = h_a[i] + h_b[i];
        if h_c[i] != expected {
            // Report only the first few offenders so that a completely
            // wrong result does not emit one line per element.
            if mismatches < 10 {
                !"-> Mismatch at {i}";
            }
            mismatches = mismatches + 1;
            ok = 0;
        }
    }

    if mismatches != 0 {
        !"-> FAILED: {mismatches} of {N} elements differ";
    }

    if ok {
        "-> All checks passed!";
    }
}
