// ========================================
// GPU Benchmark - Zen-C CUDA
// ========================================
//
// Provenance: examples/gpu/cuda-benchmark.zc, introduced by commit
// a18db1f20789d12473a87d1bf5b8fbe3899c0cfb ("cuda benchmark example",
// suresh, Mon, 19 Jan 2026 15:34:36 -0500).
//
// Runs three GPU workloads back to back (matrix multiply, Monte Carlo Pi,
// N-body) so that GPU utilisation can be watched in a tool such as nvtop.
// All launches in this file use scalar `grid:` / `block:` sizes, i.e. a
// one-dimensional launch; kernels that are launched here therefore index
// with the x dimension only.

import "std/cuda.zc"
import "std/mem.zc"

// ========================================
// Kernel 1: Matrix Multiplication
// ========================================

// Computes C = A * B for dense, row-major N x N float matrices.
//
// Launch layout: 1-D, one thread per output element, at least N * N threads
// in total. Surplus threads are masked off by the bounds check.
// A, B and C must be device pointers to N * N floats each.
// N * N must fit in an int.
@global
fn matrix_multiply_kernel(A: float*, B: float*, C: float*, N: int) {
    // Flat index of the output element owned by this thread.
    var idx = block_id_x() * block_dim_x() + thread_id_x();

    if idx < N * N {
        var row = idx / N;
        var col = idx % N;

        // Dot product of row `row` of A with column `col` of B.
        // Neighbouring threads share `row` and read neighbouring `col`
        // entries of B, so the B loads are contiguous across threads.
        var sum = 0.0f;
        for k in 0..N {
            sum = sum + A[row * N + k] * B[k * N + col];
        }
        C[idx] = sum;
    }
}

// ========================================
// Kernel 2: Monte Carlo Pi Estimation
// ========================================

// Counts, per thread, how many pseudo-random points of the unit square fall
// inside the quarter circle x^2 + y^2 <= 1.
//
// Launch layout: 1-D. `results` must hold one float per launched thread;
// results[t] receives the hit count of thread t (stored as float, so counts
// above 2^24 per thread would lose precision).
// The `num_samples` points are split across all launched threads; the first
// `num_samples % total_threads` threads take one extra point so that exactly
// `num_samples` points are drawn in total.
@global
fn monte_carlo_pi_kernel(results: float*, num_samples: u64, seed: u64) {
    var idx = block_id_x() * block_dim_x() + thread_id_x();
    var tid: u64 = (u64)idx;
    var total_threads: u64 = (u64)(grid_dim_x() * block_dim_x());

    // Even share plus one leftover sample for the lowest-numbered threads.
    var samples_per_thread = num_samples / total_threads;
    var remainder = num_samples % total_threads;
    if tid < remainder {
        samples_per_thread = samples_per_thread + 1;
    }

    // Linear congruential generator: state = (a * state + b) mod 2^31.
    // Adequate for keeping the GPU busy; not a statistically strong RNG.
    var a: u64 = 1103515245u64;
    var b: u64 = 12345u64;
    var m: u64 = 2147483648u64;

    // Per-thread stream: offset the caller's seed by the thread index.
    var rand_state = seed + tid;
    var local_count: u64 = 0;

    for i in 0..samples_per_thread {
        // Generate random x and y in [0, 1]
        rand_state = (a * rand_state + b) % m;
        var x = (float)rand_state / 2147483648.0f;

        rand_state = (a * rand_state + b) % m;
        var y = (float)rand_state / 2147483648.0f;

        // Check if point is inside quarter circle
        if x * x + y * y <= 1.0f {
            local_count = local_count + 1;
        }
    }

    results[idx] = (float)local_count;
}

// ========================================
// Kernel 3: N-Body Simulation
// ========================================

// Velocity half of one N-body time step.
//
// For body i, sums the softened inverse-square attraction of every other
// body (unit masses, unit gravitational constant) and advances
// (vx, vy, vz)[i] by acceleration * dt.
//
// Positions are only READ here. Advancing them in the same kernel would let
// some threads read positions that other threads have already moved, so the
// position update lives in nbody_integrate_kernel, which must be launched
// after this kernel for every step.
//
// Launch layout: 1-D, at least N threads; surplus threads do nothing.
@global
fn nbody_kernel(x: float*, y: float*, z: float*,
                vx: float*, vy: float*, vz: float*,
                N: int, dt: float) {
    var i = thread_id();

    if i < N {
        var ax = 0.0f;
        var ay = 0.0f;
        var az = 0.0f;
        var xi = x[i];
        var yi = y[i];
        var zi = z[i];

        // Keeps the denominator non-zero when two bodies coincide.
        var softening: float = 0.0000000001f;

        // Calculate gravitational forces from all other bodies
        for j in 0..N {
            if i != j {
                var dx = x[j] - xi;
                var dy = y[j] - yi;
                var dz = z[j] - zi;

                var dist_sqr = dx * dx + dy * dy + dz * dz + softening;
                var dist = sqrtf(dist_sqr);
                // 1 / r^3: inverse-square magnitude times 1 / r to
                // normalise the (dx, dy, dz) direction vector.
                var force = 1.0f / (dist_sqr * dist);

                ax = ax + (force * dx);
                ay = ay + (force * dy);
                az = az + (force * dz);
            }
        }

        // Update velocities
        vx[i] = vx[i] + ax * dt;
        vy[i] = vy[i] + ay * dt;
        vz[i] = vz[i] + az * dt;
    }
}

// Position half of one N-body time step: advances (x, y, z)[i] by the
// already-updated velocity of body i times dt.
//
// Each thread touches only its own body, so there is no cross-thread
// dependency inside this kernel.
// Launch layout: 1-D, at least N threads; surplus threads do nothing.
@global
fn nbody_integrate_kernel(x: float*, y: float*, z: float*,
                          vx: float*, vy: float*, vz: float*,
                          N: int, dt: float) {
    var i = thread_id();

    if i < N {
        // Update positions
        x[i] = x[i] + vx[i] * dt;
        y[i] = y[i] + vy[i] * dt;
        z[i] = z[i] + vz[i] * dt;
    }
}

// ========================================
// Kernel 4: Mandelbrot Set (Complex)
// ========================================

// Writes, for every pixel of a width x height image, the number of
// iterations of z = z^2 + c completed before |z|^2 exceeded 4 (capped at
// max_iter). The image spans real [-2.5, 1.0] and imaginary [-1.0, 1.0].
//
// Launch layout: 2-D. The pixel column comes from the x dimension and the
// pixel row from the y dimension, so the launch must cover width threads
// in x and height threads in y.
// NOTE(review): this kernel is not launched anywhere in this file. The
// scalar `grid:` / `block:` launch form used by the benchmarks here would
// only ever reach row 0 - confirm how a 2-D launch is expressed before
// wiring it up.
@global
fn mandelbrot_kernel(output: int*, width: int, height: int, max_iter: int) {
    var px = block_id_x() * block_dim_x() + thread_id_x();
    var py = block_id_y() * block_dim_y() + thread_id_y();

    if px < width && py < height {
        // Map pixel to complex plane
        var x0 = ((float)px / (float)width) * 3.5f - 2.5f;
        var y0 = ((float)py / (float)height) * 2.0f - 1.0f;

        var x = 0.0f;
        var y = 0.0f;
        var iter = 0;

        // Iterate z = z^2 + c
        for i in 0..max_iter {
            var x2 = x * x;
            var y2 = y * y;
            // Escaped: the orbit has left the radius-2 disc.
            if (x2 + y2) > 4.0f {
                break;
            }
            var xtemp = x2 - y2 + x0;
            var xy2 = 2.0f * x * y;
            y = xy2 + y0;
            x = xtemp;
            iter = iter + 1;
        }

        output[py * width + px] = iter;
    }
}

// ========================================
// Helper: Print GPU Info
// ========================================

// Prints how many CUDA devices cuda_device_count() reports.
fn print_gpu_info() {
    var device_count = cuda_device_count();
    "Found {device_count} CUDA device(s)\n";
}

// ========================================
// Benchmark 1: Matrix Multiplication
// ========================================

// Multiplies two N x N matrices on the GPU ten times, reports the elapsed
// time and prints the first and last element of the product.
//
// N must be positive and N * N must fit in an int.
// Timing uses clock(), i.e. processor time of this process rather than
// wall-clock time.
fn benchmark_matrix_multiply(N: int) {
    "Benchmark 1: Matrix Multiplication ({N}x{N}) ";

    if N <= 0 {
        "-> Skipped: matrix dimension must be positive";
        return;
    }

    var size = N * N;

    // Allocate host memory
    "-> Allocating host memory...";
    var h_A = alloc_n<float>(size);
    var h_B = alloc_n<float>(size);
    var h_C = alloc_n<float>(size);
    defer free(h_A);
    defer free(h_B);
    defer free(h_C);

    // Initialize matrices with a repeating 0.00 .. 0.99 pattern
    "-> Initializing {N}x{N} matrices...";
    for i in 0..size {
        h_A[i] = (float)(i % 100) / 100.0f;
        h_B[i] = (float)((i + 50) % 100) / 100.0f;
    }

    // Allocate device memory
    "-> Allocating device memory...";
    var d_A = cuda_alloc<float>(size);
    var d_B = cuda_alloc<float>(size);
    var d_C = cuda_alloc<float>(size);
    defer cuda_free(d_A);
    defer cuda_free(d_B);
    defer cuda_free(d_C);

    // Copy to device
    "-> Copying data to GPU...";
    cuda_copy_to_device(d_A, h_A, size * sizeof(float));
    cuda_copy_to_device(d_B, h_B, size * sizeof(float));

    // Configure a 1-D grid with one thread per output element, rounded up
    // to whole blocks (the kernel masks off the surplus threads).
    const BLOCK_SIZE = 256;
    var blocks_per_grid = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;

    "-> Launching kernel: {blocks_per_grid} blocks x {BLOCK_SIZE} threads";
    " GPU IS NOW WORKING HARD - Check nvtop!\n";

    var start = clock();

    // Run multiple iterations to keep GPU busy
    for iter in 0..10 {
        launch matrix_multiply_kernel(d_A, d_B, d_C, N) with {
            grid: blocks_per_grid,
            block: BLOCK_SIZE
        };
        // Wait for the kernel so the progress line reflects finished work.
        cuda_sync();
        " Iteration {iter + 1}/10 complete";
    }

    // Convert before dividing: clock ticks are integers, and an integer
    // division would truncate the result to whole seconds.
    var elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
    " Completed in {elapsed} seconds\n";

    // Copy result back
    cuda_copy_to_host(h_C, d_C, size * sizeof(float));

    "-> Sample results:";
    " C[0,0] = {h_C[0]}";
    " C[N-1,N-1] = {h_C[size - 1]}";
}

// ========================================
// Benchmark 2: Monte Carlo Pi
// ========================================

// Estimates Pi by sampling `num_samples` random points per kernel launch.
//
// The kernel is launched 100 times with different seeds purely to keep the
// GPU busy; every launch overwrites the per-thread counts, so the printed
// estimate comes from the final launch only.
// Timing uses clock(), i.e. processor time of this process rather than
// wall-clock time.
fn benchmark_monte_carlo_pi(num_samples: u64) {
    "Benchmark 2: Monte Carlo Pi Estimation ";

    "-> Estimating Pi with {num_samples} samples";

    if num_samples == 0 {
        "-> Skipped: sample count must be positive";
        return;
    }

    const BLOCK_SIZE = 256;
    const NUM_BLOCKS = 1024;
    var total_threads = BLOCK_SIZE * NUM_BLOCKS;

    // Allocate memory: one hit counter per launched thread
    var h_results = alloc_n<float>(total_threads);
    defer free(h_results);

    var d_results = cuda_alloc<float>(total_threads);
    defer cuda_free(d_results);

    "-> Launching kernel: {NUM_BLOCKS} blocks x {BLOCK_SIZE} threads";
    " GPU IS NOW WORKING HARD - Check nvtop!\n";

    var start = clock();

    // Run many iterations
    for iter in 0..100 {
        var seed = (u64)time(NULL) + (u64)iter;

        launch monte_carlo_pi_kernel(d_results, num_samples, seed) with {
            grid: NUM_BLOCKS,
            block: BLOCK_SIZE
        };

        // Only wait when a progress line is due; the launches in between
        // are simply queued one after another.
        if iter % 10 == 0 {
            cuda_sync();
            " Iteration {iter}/100 complete";
        }
    }

    // Drain all outstanding launches before stopping the timer.
    cuda_sync();

    // Convert before dividing to keep the fractional seconds.
    var elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
    "\n Completed in {elapsed} seconds\n";

    // Copy results and calculate Pi
    cuda_copy_to_host(h_results, d_results, total_threads * sizeof(float));

    var total_inside: u64 = 0;
    for i in 0..total_threads {
        total_inside = total_inside + (u64)h_results[i];
    }

    // Area of the quarter circle is Pi / 4 of the unit square.
    var pi_estimate = 4.0 * (double)total_inside / (double)num_samples;
    var error = fabs(pi_estimate - 3.14159265359);

    "-> Results:";
    " Estimated Pi: {pi_estimate}";
    " Actual Pi: 3.14159265359";
    " Error: {error}";
}

// ========================================
// Benchmark 3: N-Body Simulation
// ========================================

// Simulates `num_bodies` unit-mass bodies for `num_steps` time steps of
// dt = 0.01 on the GPU and prints the final position of the first and last
// body.
//
// Bodies start at rest at uniformly random positions in [-1, 1]^3.
// Timing uses clock(), i.e. processor time of this process rather than
// wall-clock time.
fn benchmark_nbody(num_bodies: int, num_steps: int) {
    "Benchmark 3: N-Body Simulation ";

    "-> Simulating {num_bodies} bodies for {num_steps} steps";

    if num_bodies <= 0 {
        "-> Skipped: body count must be positive";
        return;
    }

    // Allocate host memory (structure-of-arrays: one array per component)
    var h_x = alloc_n<float>(num_bodies);
    var h_y = alloc_n<float>(num_bodies);
    var h_z = alloc_n<float>(num_bodies);
    var h_vx = alloc_n<float>(num_bodies);
    var h_vy = alloc_n<float>(num_bodies);
    var h_vz = alloc_n<float>(num_bodies);
    defer free(h_x);
    defer free(h_y);
    defer free(h_z);
    defer free(h_vx);
    defer free(h_vy);
    defer free(h_vz);

    // Initialize random positions
    "-> Initializing random positions...";
    srand(time(NULL));
    for i in 0..num_bodies {
        h_x[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
        h_y[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
        h_z[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
        h_vx[i] = 0.0f;
        h_vy[i] = 0.0f;
        h_vz[i] = 0.0f;
    }

    // Allocate device memory
    "-> Allocating device memory...";
    var d_x = cuda_alloc<float>(num_bodies);
    var d_y = cuda_alloc<float>(num_bodies);
    var d_z = cuda_alloc<float>(num_bodies);
    var d_vx = cuda_alloc<float>(num_bodies);
    var d_vy = cuda_alloc<float>(num_bodies);
    var d_vz = cuda_alloc<float>(num_bodies);
    defer cuda_free(d_x);
    defer cuda_free(d_y);
    defer cuda_free(d_z);
    defer cuda_free(d_vx);
    defer cuda_free(d_vy);
    defer cuda_free(d_vz);

    // Copy to device
    cuda_copy_to_device(d_x, h_x, num_bodies * sizeof(float));
    cuda_copy_to_device(d_y, h_y, num_bodies * sizeof(float));
    cuda_copy_to_device(d_z, h_z, num_bodies * sizeof(float));
    cuda_copy_to_device(d_vx, h_vx, num_bodies * sizeof(float));
    cuda_copy_to_device(d_vy, h_vy, num_bodies * sizeof(float));
    cuda_copy_to_device(d_vz, h_vz, num_bodies * sizeof(float));

    // One thread per body, rounded up to whole blocks.
    const BLOCK_SIZE = 256;
    var num_blocks = (num_bodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
    var dt = 0.01f;

    "-> Launching simulation: {num_blocks} blocks x {BLOCK_SIZE} threads";
    " GPU IS NOW WORKING HARD - Check nvtop!\n";

    var start = clock();

    for step in 0..num_steps {
        // Phase 1: new velocities from the positions of the previous step.
        launch nbody_kernel(d_x, d_y, d_z, d_vx, d_vy, d_vz, num_bodies, dt) with {
            grid: num_blocks,
            block: BLOCK_SIZE
        };
        // Phase 2: move the bodies. Launched after phase 1 so that no
        // thread reads a position that has already been advanced.
        launch nbody_integrate_kernel(d_x, d_y, d_z, d_vx, d_vy, d_vz, num_bodies, dt) with {
            grid: num_blocks,
            block: BLOCK_SIZE
        };

        // Only wait when a progress line is due.
        if step % 100 == 0 {
            cuda_sync();
            " Step {step}/{num_steps} complete";
        }
    }

    // Drain all outstanding launches before stopping the timer.
    cuda_sync();

    // Convert before dividing to keep the fractional seconds.
    var elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
    "\n Completed in {elapsed} seconds\n";

    // Copy results back
    cuda_copy_to_host(h_x, d_x, num_bodies * sizeof(float));
    cuda_copy_to_host(h_y, d_y, num_bodies * sizeof(float));
    cuda_copy_to_host(h_z, d_z, num_bodies * sizeof(float));

    "-> Sample final positions:";
    " Body 0: ({h_x[0]}, {h_y[0]}, {h_z[0]})";
    " Body {num_bodies-1}: ({h_x[num_bodies-1]}, {h_y[num_bodies-1]}, {h_z[num_bodies-1]})";
}


// Entry point: reports the available GPUs, waits for Enter so a monitoring
// tool can be started, then runs the three benchmarks with a five-second
// pause between them.
fn main() {
    "Zen-C GPU Benchmark Suite ";

    print_gpu_info();

    "\n RUN THIS NOW: Open another terminal and run: nvtop";
    "\nPress Enter to start benchmarks...";
    getchar();

    // Run all benchmarks
    benchmark_matrix_multiply(2048);

    "\n Pause (5 seconds)...";
    sleep(5);

    // 10^10 does not fit in 32 bits; the suffix keeps the literal 64-bit.
    benchmark_monte_carlo_pi(10000000000u64);

    "\n Pause (5 seconds)...";
    sleep(5);

    benchmark_nbody(4096, 1000);

    "Zen-C GPU Benchmark Suite - All tests completed.";
}