// Compile with: zc run cuda_vector_add.zc --cuda

//> cflags: -arch=sm_75

import "std/cuda.zc"
import "std/mem.zc"

// Element-wise vector addition kernel: each thread handles at most one
// index and stores a[idx] + b[idx] into c[idx].
//
//   a, b  input arrays, read at this thread's index
//   c     output array, written at this thread's index
//   n     element count; a thread whose index is >= n does nothing
//
// NOTE(review): assumes thread_id() (from std/cuda.zc) yields the flat
// global thread index across the whole 1-D launch - confirm in the library.
@global
fn add_kernel(a: float*, b: float*, c: float*, n: int) {
    let idx = thread_id();

    // The launch may contain more threads than elements; the extras must
    // not touch memory past the end of the arrays.
    if idx >= n {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

// Host driver for the vector-addition example.
//
// Fills two host arrays, copies them to the device, launches add_kernel
// over N elements, copies the result back, and compares every element
// against a[i] + b[i] computed on the host.
fn main() {
    const N = 1024;

    "=> Zen C CUDA Vector Addition";
    "-> Vector size: {N} elements";

    // Host-side buffers; each is handed to free() via defer.
    let h_a = alloc_n<float>(N);
    let h_b = alloc_n<float>(N);
    let h_c = alloc_n<float>(N);
    defer free(h_a);
    defer free(h_b);
    defer free(h_c);

    // Inputs are small integers, so every value and every sum is exactly
    // representable in a float. That is what makes the exact `!=`
    // comparison in the verification loop below valid.
    for i in 0..N {
        h_a[i] = (float)i;
        h_b[i] = (float)(i * 2);
    }

    "-> Allocating device memory...";
    // Device-side buffers; each is handed to cuda_free() via defer.
    let d_a = cuda_alloc<float>(N);
    let d_b = cuda_alloc<float>(N);
    let d_c = cuda_alloc<float>(N);
    defer cuda_free(d_a);
    defer cuda_free(d_b);
    defer cuda_free(d_c);

    // Copy sizes are in bytes, not elements.
    cuda_copy_to_device(d_a, h_a, N * sizeof(float));
    cuda_copy_to_device(d_b, h_b, N * sizeof(float));

    const BLOCK_SIZE = 256;
    // Ceiling division: enough blocks to cover N even when N is not a
    // multiple of BLOCK_SIZE. The kernel's `n` guard handles the surplus.
    let num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    "-> Launching: {num_blocks} blocks x {BLOCK_SIZE} threads";

    launch add_kernel(d_a, d_b, d_c, N) with {
        grid: num_blocks,
        block: BLOCK_SIZE
    };

    // Synchronize before reading the results back.
    cuda_sync();

    cuda_copy_to_host(h_c, d_c, N * sizeof(float));

    "-> Verifying...";
    // Check every element, not just a leading sample: launch-geometry and
    // bounds mistakes show up in the later blocks and at the tail.
    let mismatches: int = 0;
    for i in 0..N {
        let expected = h_a[i] + h_b[i];
        if h_c[i] != expected {
            // Report only the first few offenders so a completely broken
            // run does not flood the output.
            if mismatches < 10 {
                !"-> Mismatch at {i}";
            }
            mismatches = mismatches + 1;
        }
    }

    if mismatches == 0 {
        "-> All checks passed!";
    } else {
        !"-> Verification failed: {mismatches} of {N} elements differ";
    }
}