1 files changed, 403 insertions, 0 deletions
diff --git a/examples/gpu/cuda-benchmark.zc b/examples/gpu/cuda-benchmark.zc
new file mode 100644
index 0000000..d426e10
--- /dev/null
+++ b/examples/gpu/cuda-benchmark.zc
@@ -0,0 +1,403 @@
+// ========================================
+// GPU Benchmark - Zen-C CUDA
+// ========================================
+
+import "std/cuda.zc"
+import "std/mem.zc"
+
+// ========================================
+// Kernel 1: Matrix Multiplication
+// ========================================
+
+// Computes C = A * B for square, row-major N x N float matrices.
+//
+// Parameters:
+//   A, B - input matrices, N * N elements each (only read)
+//   C    - output matrix, N * N elements (every element is overwritten)
+//   N    - matrix dimension; N * N plus the launch's total x-thread count
+//          must fit in an int
+//
+// Launch: any grid/block configuration. Each thread starts at its global x
+// index and advances by the total number of x threads until all N * N
+// outputs have been written, so correctness does not depend on the launch
+// size. Only the x components of the launch are used for indexing; a
+// scalar (1D) grid/block launch therefore covers the whole matrix.
+@global
+fn matrix_multiply_kernel(A: float*, B: float*, C: float*, N: int) {
+    var total = N * N;
+    var step = grid_dim_x() * block_dim_x();
+    var idx = block_id_x() * block_dim_x() + thread_id_x();
+
+    while idx < total {
+        // Neighbouring threads get neighbouring columns of the same row, so
+        // their reads of B and writes of C touch adjacent addresses.
+        var row = idx / N;
+        var col = idx % N;
+
+        var sum = 0.0f;
+        for k in 0..N {
+            sum = sum + A[row * N + k] * B[k * N + col];
+        }
+        C[idx] = sum;
+
+        idx = idx + step;
+    }
+}
+
+// ========================================
+// Kernel 2: Monte Carlo Pi Estimation
+// ========================================
+
+// Counts, per thread, how many pseudo-random points of the unit square fall
+// inside the quarter circle x*x + y*y <= 1.
+//
+// Parameters:
+//   results     - one float per launched thread; results[idx] receives that
+//                 thread's hit count. No bounds check is done, so the buffer
+//                 must hold grid_dim_x() * block_dim_x() elements.
+//   num_samples - total number of points to draw across the whole launch
+//   seed        - base seed; each thread starts its generator at seed + idx
+//
+// The per-thread counts add up to a hit count over exactly num_samples
+// points: the remainder of num_samples / total_threads is handed out one
+// extra sample at a time to the lowest-indexed threads instead of being
+// dropped, so a caller may divide the summed hits by num_samples.
+//
+// The count is stored as a float, which is exact only up to 2^24 hits per
+// thread.
+//
+// NOTE(review): every thread steps the same linear congruential generator
+// from a different start state, so the per-thread streams are not
+// statistically independent. Fine for keeping the GPU busy; confirm before
+// relying on the estimate's accuracy.
+@global
+fn monte_carlo_pi_kernel(results: float*, num_samples: u64, seed: u64) {
+    var idx = block_id_x() * block_dim_x() + thread_id_x();
+    var total_threads = grid_dim_x() * block_dim_x();
+
+    var tid: u64 = (u64)idx;
+    var thread_count: u64 = (u64)total_threads;
+
+    // Even share, plus one extra sample for the first `leftover` threads.
+    var samples_per_thread: u64 = num_samples / thread_count;
+    var leftover: u64 = num_samples % thread_count;
+    if tid < leftover {
+        samples_per_thread = samples_per_thread + 1;
+    }
+
+    // Simple random number generator: state = (a * state + b) mod m
+    var a: u64 = 1103515245u64;
+    var b: u64 = 12345u64;
+    var m: u64 = 2147483648u64;
+    var rand_state: u64 = seed + tid;
+
+    var local_count: u64 = 0;
+
+    for i in 0..samples_per_thread {
+        // Generate random x and y in [0, 1]
+        rand_state = (a * rand_state + b) % m;
+        var x = (float)rand_state / 2147483648.0f;
+
+        rand_state = (a * rand_state + b) % m;
+        var y = (float)rand_state / 2147483648.0f;
+
+        // Check if point is inside quarter circle
+        if x * x + y * y <= 1.0f {
+            local_count = local_count + 1;
+        }
+    }
+    results[idx] = (float)local_count;
+}
+
+// ========================================
+// Kernel 3: N-Body Simulation
+// ========================================
+
+// Advances one body by one time step: accumulates the softened
+// inverse-square attraction from every other body, then updates this body's
+// velocity and position in place.
+//
+// Parameters:
+//   x, y, z    - body positions, N elements each (read for all bodies,
+//                written for this thread's body)
+//   vx, vy, vz - body velocities, N elements each (read and written for this
+//                thread's body only)
+//   N          - number of bodies; threads with an index >= N do nothing
+//   dt         - time step
+//
+// Assumes thread_id() yields a distinct index per launched thread covering
+// 0..N - TODO confirm against std/cuda.zc.
+//
+// NOTE(review): positions are updated in place while other threads of the
+// same launch may still be reading them in their inner loop, and no
+// synchronization between those accesses is visible here. A body can
+// therefore see a mix of old and new positions; confirm this is acceptable
+// for the benchmark.
+@global
+fn nbody_kernel(x: float*, y: float*, z: float*,
+                vx: float*, vy: float*, vz: float*,
+                N: int, dt: float) {
+    var i = thread_id();
+
+    // Surplus threads of the last block have no body to update.
+    if i >= N {
+        return;
+    }
+
+    // Keeps the distance strictly positive so the division below is finite.
+    var softening: float = 0.0000000001f;
+
+    var self_x = x[i];
+    var self_y = y[i];
+    var self_z = z[i];
+
+    var acc_x = 0.0f;
+    var acc_y = 0.0f;
+    var acc_z = 0.0f;
+
+    // Sum the pull of every other body on body i.
+    for other in 0..N {
+        if i != other {
+            var dx = x[other] - self_x;
+            var dy = y[other] - self_y;
+            var dz = z[other] - self_z;
+
+            var sq_x = dx * dx;
+            var sq_y = dy * dy;
+            var sq_z = dz * dz;
+            var dist_sqr = sq_x + sq_y + sq_z + softening;
+            var dist = sqrtf(dist_sqr);
+            // 1 / r^3: inverse-square magnitude times the 1 / r that
+            // normalizes the (dx, dy, dz) direction vector.
+            var force = 1.0f / (dist_sqr * dist);
+
+            acc_x = acc_x + (force * dx);
+            acc_y = acc_y + (force * dy);
+            acc_z = acc_z + (force * dz);
+        }
+    }
+
+    // Velocities first ...
+    vx[i] = vx[i] + acc_x * dt;
+    vy[i] = vy[i] + acc_y * dt;
+    vz[i] = vz[i] + acc_z * dt;
+
+    // ... then positions, using the velocities just written.
+    x[i] = x[i] + vx[i] * dt;
+    y[i] = y[i] + vy[i] * dt;
+    z[i] = z[i] + vz[i] * dt;
+}
+
+// ========================================
+// Kernel 4: Mandelbrot Set (Complex)
+// ========================================
+
+// Writes, for each pixel of a width x height image, the number of
+// z = z^2 + c iterations completed before |z|^2 exceeded 4, capped at
+// max_iter.
+//
+// Parameters:
+//   output   - width * height ints, row-major; one element per pixel
+//   width    - image width in pixels
+//   height   - image height in pixels
+//   max_iter - iteration cap
+//
+// Launch: the pixel coordinate is taken from both the x and y launch
+// indices; threads falling outside the image do nothing.
+@global
+fn mandelbrot_kernel(output: int*, width: int, height: int, max_iter: int) {
+    var px = block_id_x() * block_dim_x() + thread_id_x();
+    var py = block_id_y() * block_dim_y() + thread_id_y();
+
+    // Threads past the right or bottom edge have no pixel.
+    if px >= width || py >= height {
+        return;
+    }
+
+    // c = (cx, cy): the pixel mapped onto [-2.5, 1.0] x [-1.0, 1.0]
+    var cx = ((float)px / (float)width) * 3.5f - 2.5f;
+    var cy = ((float)py / (float)height) * 2.0f - 1.0f;
+
+    // z = (zx, zy) starts at the origin
+    var zx = 0.0f;
+    var zy = 0.0f;
+    var count = 0;
+
+    for i in 0..max_iter {
+        var zx_sq = zx * zx;
+        var zy_sq = zy * zy;
+        // Escaped: |z|^2 > 4
+        if (zx_sq + zy_sq) > 4.0f {
+            break;
+        }
+        // z^2 + c = (zx^2 - zy^2 + cx, 2*zx*zy + cy)
+        var next_zx = zx_sq - zy_sq + cx;
+        var cross = 2.0f * zx * zy;
+        zy = cross + cy;
+        zx = next_zx;
+        count = count + 1;
+    }
+
+    output[py * width + px] = count;
+}
+
+// ========================================
+// Helper: Print GPU Info
+// ========================================
+
+// Prints the number of CUDA devices reported by cuda_device_count().
+// Takes no parameters and returns nothing; the count is only printed, not
+// checked, so callers continue even when it is zero.
+fn print_gpu_info() {
+
+    var device_count = cuda_device_count();
+    "Found {device_count} CUDA device(s)\n";
+}
+
+// ========================================
+// Benchmark 1: Matrix Multiplication
+// ========================================
+
+// Runs the matrix multiplication benchmark on N x N matrices.
+//
+// Fills two host matrices with a deterministic pattern, copies them to the
+// device, launches matrix_multiply_kernel ten times (synchronizing after
+// each launch), prints the elapsed time, then copies the result back and
+// prints its first and last elements.
+//
+// Parameters:
+//   N - matrix dimension; a non-positive value skips the benchmark.
+fn benchmark_matrix_multiply(N: int) {
+    "Benchmark 1: Matrix Multiplication ({N}x{N}) ";
+
+    // With no elements, the sample reads of h_C at the end would be out of
+    // bounds, so bail out before allocating anything.
+    if N <= 0 {
+        "-> Skipped: matrix dimension must be positive";
+        return;
+    }
+
+    var size = N * N;
+
+    // Allocate host memory
+    "-> Allocating host memory...";
+    var h_A = alloc_n<float>(size);
+    var h_B = alloc_n<float>(size);
+    var h_C = alloc_n<float>(size);
+    defer free(h_A);
+    defer free(h_B);
+    defer free(h_C);
+
+    // Initialize matrices with values in [0, 1) that repeat every 100 elements
+    "-> Initializing {N}x{N} matrices...";
+    for i in 0..size {
+        h_A[i] = (float)(i % 100) / 100.0f;
+        h_B[i] = (float)((i + 50) % 100) / 100.0f;
+    }
+
+    // Allocate device memory
+    "-> Allocating device memory...";
+    var d_A = cuda_alloc<float>(size);
+    var d_B = cuda_alloc<float>(size);
+    var d_C = cuda_alloc<float>(size);
+    defer cuda_free(d_A);
+    defer cuda_free(d_B);
+    defer cuda_free(d_C);
+
+    // Copy to device
+    "-> Copying data to GPU...";
+    cuda_copy_to_device(d_A, h_A, size * sizeof(float));
+    cuda_copy_to_device(d_B, h_B, size * sizeof(float));
+
+    // Configure grid
+    // NOTE(review): grid and block are scalars, i.e. a 1D launch. Confirm
+    // that the kernel's indexing covers all N * N outputs under a 1D launch.
+    const BLOCK_SIZE = 16;
+    var blocks_per_grid = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    "-> Launching kernel: {blocks_per_grid} blocks x {BLOCK_SIZE} threads";
+    " GPU IS NOW WORKING HARD - Check nvtop!\n";
+
+    var start = clock();
+
+    // Run multiple iterations to keep GPU busy
+    for iter in 0..10 {
+        launch matrix_multiply_kernel(d_A, d_B, d_C, N) with {
+            grid: blocks_per_grid,
+            block: BLOCK_SIZE
+        };
+        cuda_sync();
+        "  Iteration {iter + 1}/10 complete";
+    }
+
+    // Convert to floating point before dividing; dividing the raw tick
+    // counts would truncate the result to whole seconds.
+    // NOTE(review): clock() reports processor time of the host process, not
+    // wall-clock time - confirm that is the intended measurement.
+    var elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
+    " Completed in {elapsed} seconds\n";
+
+    // Copy result back
+    cuda_copy_to_host(h_C, d_C, size * sizeof(float));
+
+    "-> Sample results:";
+    "  C[0,0] = {h_C[0]}";
+    "  C[N-1,N-1] = {h_C[size - 1]}";
+}
+
+// ========================================
+// Benchmark 2: Monte Carlo Pi
+// ========================================
+
+// Runs the Monte Carlo Pi benchmark.
+//
+// Launches monte_carlo_pi_kernel 100 times on a fixed 1024-block x
+// 256-thread configuration, each time with a seed derived from the current
+// time plus the iteration number. Every launch is given the same d_results
+// buffer and the buffer is copied back once after the loop, so the printed
+// estimate is based on whatever the buffer holds at that point.
+//
+// Parameters:
+//   num_samples - number of points passed to each kernel launch; also the
+//                 denominator of the Pi estimate.
+fn benchmark_monte_carlo_pi(num_samples: u64) {
+    "Benchmark 2: Monte Carlo Pi Estimation ";
+
+    "-> Estimating Pi with {num_samples} samples";
+
+    const BLOCK_SIZE = 256;
+    const NUM_BLOCKS = 1024;
+    var total_threads = BLOCK_SIZE * NUM_BLOCKS;
+
+    // Allocate memory: one hit counter per launched thread
+    var h_results = alloc_n<float>(total_threads);
+    defer free(h_results);
+
+    var d_results = cuda_alloc<float>(total_threads);
+    defer cuda_free(d_results);
+
+    "-> Launching kernel: {NUM_BLOCKS} blocks x {BLOCK_SIZE} threads";
+    " GPU IS NOW WORKING HARD - Check nvtop!\n";
+
+    var start = clock();
+
+    // Run many iterations
+    for iter in 0..100 {
+        var seed = (u64)time(NULL) + (u64)iter;
+
+        launch monte_carlo_pi_kernel(d_results, num_samples, seed) with {
+            grid: NUM_BLOCKS,
+            block: BLOCK_SIZE
+        };
+        cuda_sync();
+
+        if iter % 10 == 0 {
+            "  Iteration {iter}/100 complete";
+        }
+    }
+
+    // Convert to floating point before dividing; dividing the raw tick
+    // counts would truncate the result to whole seconds.
+    // NOTE(review): clock() reports processor time of the host process, not
+    // wall-clock time - confirm that is the intended measurement.
+    var elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
+    "\n Completed in {elapsed} seconds\n";
+
+    // Copy results and calculate Pi
+    cuda_copy_to_host(h_results, d_results, total_threads * sizeof(float));
+
+    var total_inside: u64 = 0;
+    for i in 0..total_threads {
+        total_inside = total_inside + (u64)h_results[i];
+    }
+
+    // hits / samples approximates pi / 4 (area of the quarter circle).
+    // Assumes the per-thread counts together cover num_samples points -
+    // confirm against the kernel.
+    var pi_estimate = 4.0 * (double)total_inside / (double)num_samples;
+    var error = fabs(pi_estimate - 3.14159265359);
+
+    "-> Results:";
+    "  Estimated Pi: {pi_estimate}";
+    "  Actual Pi:    3.14159265359";
+    "  Error:        {error}";
+}
+
+// ========================================
+// Benchmark 3: N-Body Simulation
+// ========================================
+
+// Runs the N-body benchmark.
+//
+// Places num_bodies bodies at random positions in [-1, 1]^3 with zero
+// velocity, copies them to the device, launches nbody_kernel once per step
+// (synchronizing after each launch), prints the elapsed time, then copies
+// the positions back and prints those of the first and last body.
+//
+// Parameters:
+//   num_bodies - number of bodies; a non-positive value skips the benchmark
+//   num_steps  - number of kernel launches; no launch happens when it is
+//                not positive
+fn benchmark_nbody(num_bodies: int, num_steps: int) {
+
+    "Benchmark 3: N-Body Simulation ";
+
+    "-> Simulating {num_bodies} bodies for {num_steps} steps";
+
+    // With no bodies, the sample reads of h_x[0] and h_x[num_bodies-1] at
+    // the end would be out of bounds, so bail out before allocating.
+    if num_bodies <= 0 {
+        "-> Skipped: number of bodies must be positive";
+        return;
+    }
+
+    // Allocate host memory
+    var h_x = alloc_n<float>(num_bodies);
+    var h_y = alloc_n<float>(num_bodies);
+    var h_z = alloc_n<float>(num_bodies);
+    var h_vx = alloc_n<float>(num_bodies);
+    var h_vy = alloc_n<float>(num_bodies);
+    var h_vz = alloc_n<float>(num_bodies);
+    defer free(h_x);
+    defer free(h_y);
+    defer free(h_z);
+    defer free(h_vx);
+    defer free(h_vy);
+    defer free(h_vz);
+
+    // Initialize random positions
+    "-> Initializing random positions...";
+    srand(time(NULL));
+    for i in 0..num_bodies {
+        h_x[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+        h_y[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+        h_z[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+        h_vx[i] = 0.0f;
+        h_vy[i] = 0.0f;
+        h_vz[i] = 0.0f;
+    }
+
+    // Allocate device memory
+    "-> Allocating device memory...";
+    var d_x = cuda_alloc<float>(num_bodies);
+    var d_y = cuda_alloc<float>(num_bodies);
+    var d_z = cuda_alloc<float>(num_bodies);
+    var d_vx = cuda_alloc<float>(num_bodies);
+    var d_vy = cuda_alloc<float>(num_bodies);
+    var d_vz = cuda_alloc<float>(num_bodies);
+    defer cuda_free(d_x);
+    defer cuda_free(d_y);
+    defer cuda_free(d_z);
+    defer cuda_free(d_vx);
+    defer cuda_free(d_vy);
+    defer cuda_free(d_vz);
+
+    // Copy to device
+    cuda_copy_to_device(d_x, h_x, num_bodies * sizeof(float));
+    cuda_copy_to_device(d_y, h_y, num_bodies * sizeof(float));
+    cuda_copy_to_device(d_z, h_z, num_bodies * sizeof(float));
+    cuda_copy_to_device(d_vx, h_vx, num_bodies * sizeof(float));
+    cuda_copy_to_device(d_vy, h_vy, num_bodies * sizeof(float));
+    cuda_copy_to_device(d_vz, h_vz, num_bodies * sizeof(float));
+
+    // Enough 256-thread blocks to give every body its own thread
+    const BLOCK_SIZE = 256;
+    var num_blocks = (num_bodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    var dt = 0.01f;
+
+    "-> Launching simulation: {num_blocks} blocks x {BLOCK_SIZE} threads";
+    " GPU IS NOW WORKING HARD - Check nvtop!\n";
+
+    var start = clock();
+
+    for step in 0..num_steps {
+        launch nbody_kernel(d_x, d_y, d_z, d_vx, d_vy, d_vz, num_bodies, dt) with {
+            grid: num_blocks,
+            block: BLOCK_SIZE
+        };
+        cuda_sync();
+
+        if step % 100 == 0 {
+            "  Step {step}/{num_steps} complete";
+        }
+    }
+
+    // Convert to floating point before dividing; dividing the raw tick
+    // counts would truncate the result to whole seconds.
+    // NOTE(review): clock() reports processor time of the host process, not
+    // wall-clock time - confirm that is the intended measurement.
+    var elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
+    "\n Completed in {elapsed} seconds\n";
+
+    // Copy results back (positions only; velocities are not printed)
+    cuda_copy_to_host(h_x, d_x, num_bodies * sizeof(float));
+    cuda_copy_to_host(h_y, d_y, num_bodies * sizeof(float));
+    cuda_copy_to_host(h_z, d_z, num_bodies * sizeof(float));
+
+    "-> Sample final positions:";
+    "  Body 0: ({h_x[0]}, {h_y[0]}, {h_z[0]})";
+    "  Body {num_bodies-1}: ({h_x[num_bodies-1]}, {h_y[num_bodies-1]}, {h_z[num_bodies-1]})";
+}
+
+
+// Entry point: prints the device count, waits for Enter so a GPU monitor
+// can be started in another terminal, then runs the three benchmarks in
+// sequence with a five-second pause between them.
+fn main() {
+    "Zen-C GPU Benchmark Suite ";
+
+    print_gpu_info();
+
+    "\n RUN THIS NOW: Open another terminal and run: nvtop";
+    "\nPress Enter to start benchmarks...";
+    getchar();
+
+    // Run all benchmarks
+    benchmark_matrix_multiply(2048);
+
+    "\n  Pause (5 seconds)...";
+    sleep(5);
+
+    // The sample count does not fit in 32 bits; the explicit u64 suffix
+    // matches the parameter type and the file's other wide literals.
+    benchmark_monte_carlo_pi(10000000000u64);
+
+    "\n Pause (5 seconds)...";
+    sleep(5);
+
+    benchmark_nbody(4096, 1000);
+
+    "Zen-C GPU Benchmark Suite - All tests completed.";
+}
+\ No newline at end of file