summary | refs | log | tree | commit | diff
path: root/examples/gpu
diff options
context:
space:
mode:
Diffstat (limited to 'examples/gpu')
-rw-r--r-- examples/gpu/cuda_vector_add.zc 73
1 files changed, 73 insertions, 0 deletions
diff --git a/examples/gpu/cuda_vector_add.zc b/examples/gpu/cuda_vector_add.zc
new file mode 100644
index 0000000..de75a74
--- /dev/null
+++ b/examples/gpu/cuda_vector_add.zc
@@ -0,0 +1,73 @@
+
+// Compile with: zc run cuda_vector_add.zc --cuda
+
+//> cflags: -arch=sm_75
+
+import "std/cuda.zc"
+import "std/mem.zc"
+
+// Element-wise vector addition kernel: stores a[k] + b[k] into c[k], where k
+// is the index thread_id() yields for the calling thread.
+//   a, b : input arrays, each read at index k
+//   c    : output array, written at index k
+//   n    : element count; a thread whose index is >= n writes nothing
+// NOTE(review): presumably thread_id() (std/cuda.zc) is the thread's global
+// index across the whole launch -- confirm against the library.
+@global
+fn add_kernel(a: float*, b: float*, c: float*, n: int) {
+    var idx = thread_id();
+    // Threads whose index falls outside the arrays have nothing to do.
+    if idx >= n {
+        return;
+    }
+    c[idx] = a[idx] + b[idx];
+}
+
+// Host-side driver for the vector-add example. It fills two host arrays,
+// copies them to device buffers, launches add_kernel over them, copies the
+// result back, and checks every element against a host-computed sum.
+fn main() {
+    const N = 1024;
+
+    "=> Zen C CUDA Vector Addition";
+    "-> Vector size: {N} elements";
+
+    // Host buffers; each allocation is paired with a deferred free.
+    var h_a = alloc_n<float>(N);
+    var h_b = alloc_n<float>(N);
+    var h_c = alloc_n<float>(N);
+    defer free(h_a);
+    defer free(h_b);
+    defer free(h_c);
+
+    // Inputs: a[i] = i and b[i] = 2*i.
+    for i in 0..N {
+        h_a[i] = (float)i;
+        h_b[i] = (float)(i * 2);
+    }
+
+    "-> Allocating device memory...";
+    // Device buffers; each allocation is paired with a deferred cuda_free.
+    var d_a = cuda_alloc<float>(N);
+    var d_b = cuda_alloc<float>(N);
+    var d_c = cuda_alloc<float>(N);
+    defer cuda_free(d_a);
+    defer cuda_free(d_b);
+    defer cuda_free(d_c);
+
+    // Upload both inputs; the size argument is given in bytes.
+    cuda_copy_to_device(d_a, h_a, N * sizeof(float));
+    cuda_copy_to_device(d_b, h_b, N * sizeof(float));
+
+    // Round the block count up so that num_blocks * BLOCK_SIZE >= N.
+    const BLOCK_SIZE = 256;
+    var num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    "-> Launching: {num_blocks} blocks x {BLOCK_SIZE} threads";
+
+    launch add_kernel(d_a, d_b, d_c, N) with {
+        grid: num_blocks,
+        block: BLOCK_SIZE
+    };
+
+    // NOTE(review): presumably waits for the kernel to finish before the
+    // result is copied back -- confirm cuda_sync() semantics in std/cuda.zc.
+    cuda_sync();
+
+    cuda_copy_to_host(h_c, d_c, N * sizeof(float));
+
+    "-> Verifying...";
+    // Check all N elements rather than a short prefix: the first few indices
+    // all belong to the first block, so a prefix check cannot notice wrong
+    // output from the remaining blocks.
+    // Individual reports are capped so a completely wrong result does not
+    // produce N lines of output; the total count is still reported below.
+    const MAX_REPORTED = 10;
+    var mismatches: int = 0;
+    for i in 0..N {
+        var expected = h_a[i] + h_b[i];
+        if h_c[i] != expected {
+            if mismatches < MAX_REPORTED {
+                !"-> Mismatch at {i}";
+            }
+            mismatches = mismatches + 1;
+        }
+    }
+
+    if mismatches == 0 {
+        "-> All checks passed!";
+    } else {
+        !"-> Verification failed: {mismatches} of {N} elements differ";
+    }
+}