// Compile with: zc run cuda_vector_add.zc --cuda

//> cflags: -arch=sm_75

import "std/cuda.zc"
import "std/mem.zc"

// Element-wise vector addition kernel: writes c[i] = a[i] + b[i].
//
// a, b: input vectors; c: output vector; n: number of valid elements.
// The pointers are presumably device pointers (main passes buffers obtained
// from cuda_alloc).
// NOTE(review): thread_id() comes from std/cuda.zc; it is assumed to return a
// flat, grid-wide thread index - confirm against the library.
@global
fn add_kernel(a: float*, b: float*, c: float*, n: int) {
    let i = thread_id();
    // The launch rounds the thread count up to whole blocks, so any thread
    // whose index is at or beyond n must not touch the buffers.
    if i < n {
        c[i] = a[i] + b[i];
    }
}

// Demo driver: fills two host vectors, adds them on the GPU, copies the
// result back and compares every element against a host-side sum.
fn main() {
    const N = 1024;

    "=> Zen C CUDA Vector Addition";
    "-> Vector size: {N} elements";

    // Host buffers of N floats each; released when main returns.
    let h_a = alloc_n<float>(N);
    let h_b = alloc_n<float>(N);
    let h_c = alloc_n<float>(N);
    defer free(h_a);
    defer free(h_b);
    defer free(h_c);

    // Inputs: a[i] = i, b[i] = 2*i.
    for i in 0..N {
        h_a[i] = (float)i;
        h_b[i] = (float)(i * 2);
    }

    "-> Allocating device memory...";

    // Device buffers of N floats each; released when main returns.
    let d_a = cuda_alloc<float>(N);
    let d_b = cuda_alloc<float>(N);
    let d_c = cuda_alloc<float>(N);
    defer cuda_free(d_a);
    defer cuda_free(d_b);
    defer cuda_free(d_c);

    // Copy sizes are given in bytes.
    cuda_copy_to_device(d_a, h_a, N * sizeof(float));
    cuda_copy_to_device(d_b, h_b, N * sizeof(float));

    const BLOCK_SIZE = 256;
    // Ceiling division: enough blocks to cover all N elements even when N is
    // not a multiple of BLOCK_SIZE.
    let num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    "-> Launching: {num_blocks} blocks x {BLOCK_SIZE} threads";

    launch add_kernel(d_a, d_b, d_c, N) with {
        grid: num_blocks,
        block: BLOCK_SIZE
    };

    // Wait for the kernel before reading its output back.
    cuda_sync();

    cuda_copy_to_host(h_c, d_c, N * sizeof(float));

    "-> Verifying...";

    // Check every element, not just a prefix: the first few indices all fall
    // in the first block, so a prefix check would miss errors in later blocks.
    let ok: int = 1;
    for i in 0..N {
        let expected = h_a[i] + h_b[i];
        if h_c[i] != expected {
            !"-> Mismatch at {i}";
            ok = 0;
        }
    }

    if ok {
        "-> All checks passed!";
    } else {
        !"-> Verification FAILED";
    }
}