// ========================================
// GPU Benchmark - Zen-C CUDA
// ========================================

import "std/cuda.zc"
import "std/mem.zc"

// ========================================
// Kernel 1: Matrix Multiplication
// ========================================

// Computes C = A * B for N x N matrices stored row-major in flat arrays.
// Each thread produces at most one element of C: its row comes from the
// y-dimension block/thread indices and its column from the x-dimension ones.
// Threads whose (row, col) falls outside the matrix do nothing.
@global
fn matrix_multiply_kernel(A: float*, B: float*, C: float*, N: int) {
    let row = block_id_y() * block_dim_y() + thread_id_y();
    let col = block_id_x() * block_dim_x() + thread_id_x();

    if row < N && col < N {
        // Dot product of row `row` of A with column `col` of B.
        let sum = 0.0f;
        for k in 0..N {
            sum = sum + A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

// ========================================
// Kernel 2: Monte Carlo Pi Estimation
// ========================================

// Draws pseudo-random points in the unit square and counts how many land
// inside the quarter circle of radius 1. Each thread writes its own hit count
// (converted to float) to results[idx]; the caller sums the counts.
//
// results     - one slot per launched thread (indexed by the global x index)
// num_samples - total number of points to draw across all threads
// seed        - base seed; each thread starts its generator at seed + idx
@global
fn monte_carlo_pi_kernel(results: float*, num_samples: u64, seed: u64) {
    let idx = block_id_x() * block_dim_x() + thread_id_x();
    let total_threads = grid_dim_x() * block_dim_x();
    let local_count: u64 = 0;

    // Split the work evenly. The first (num_samples % total_threads) threads
    // take one extra sample so that exactly num_samples points are drawn in
    // total; otherwise the remainder would be silently dropped while the
    // caller still divides the hit count by num_samples.
    let thread_count: u64 = (u64)total_threads;
    let thread_index: u64 = (u64)idx;
    let samples_per_thread: u64 = num_samples / thread_count;
    if thread_index < num_samples % thread_count {
        samples_per_thread = samples_per_thread + 1;
    }

    // Linear congruential generator: state = (a * state + b) % m, m = 2^31.
    let a: u64 = 1103515245u64;
    let b: u64 = 12345u64;
    let m: u64 = 2147483648u64;
    let rand_state = seed + idx;

    for i in 0..samples_per_thread {
        // Generate random x and y in [0, 1] by scaling the state by 1/m.
        rand_state = (a * rand_state + b) % m;
        let x = (float)rand_state / 2147483648.0f;

        rand_state = (a * rand_state + b) % m;
        let y = (float)rand_state / 2147483648.0f;

        // Check if point is inside quarter circle
        if x * x + y * y <= 1.0f {
            local_count = local_count + 1;
        }
    }

    results[idx] = (float)local_count;
}

// ========================================
// Kernel 3: N-Body Simulation
// ========================================

// Advances body i by one time step of length dt: accumulates an inverse-square
// attraction from every other body, then updates velocity and position.
//
// x, y, z    - per-body position components (read for all bodies, written for i)
// vx, vy, vz - per-body velocity components (read and written for i)
// N          - number of bodies
// dt         - time step
//
// NOTE(review): thread_id() is presumably the global 1-D thread index —
// confirm against std/cuda.zc.
// NOTE(review): every thread reads x[j]/y[j]/z[j] for all j while other
// threads write their own x[i]/y[i]/z[i] in the same launch, and no barrier
// separates the two phases here, so a thread may see a mix of old and new
// positions. Consider double-buffering positions if exact results matter.
@global
fn nbody_kernel(x: float*, y: float*, z: float*,
                vx: float*, vy: float*, vz: float*,
                N: int, dt: float) {
    let i = thread_id();

    if i < N {
        let ax = 0.0f;
        let ay = 0.0f;
        let az = 0.0f;

        let xi = x[i];
        let yi = y[i];
        let zi = z[i];

        // Calculate gravitational forces from all other bodies
        for j in 0..N {
            if i != j {
                let dx = x[j] - xi;
                let dy = y[j] - yi;
                let dz = z[j] - zi;

                let dx2 = dx * dx;
                let dy2 = dy * dy;
                let dz2 = dz * dz;

                // Small constant added to the squared distance so the
                // division below never sees an exact zero.
                let softening: float = 0.0000000001f;
                let dist_sqr = dx2 + dy2 + dz2 + softening;
                let dist = sqrtf(dist_sqr);
                // 1 / r^3; multiplying by the displacement gives 1/r^2 along it.
                let force = 1.0f / (dist_sqr * dist);

                ax = ax + (force * dx);
                ay = ay + (force * dy);
                az = az + (force * dz);
            }
        }

        // Update velocities
        vx[i] = vx[i] + ax * dt;
        vy[i] = vy[i] + ay * dt;
        vz[i] = vz[i] + az * dt;

        // Update positions (uses the velocities just written above)
        x[i] = x[i] + vx[i] * dt;
        y[i] = y[i] + vy[i] * dt;
        z[i] = z[i] + vz[i] * dt;
    }
}

// ========================================
// Kernel 4: Mandelbrot Set (Complex)
// ========================================

// Writes, for each pixel of a width x height image, the number of iterations
// of z = z^2 + c completed before |z|^2 exceeded 4 (or max_iter if it never
// did). One thread handles at most one pixel; out-of-range threads do nothing.
// No launch of this kernel appears in the code below.
@global
fn mandelbrot_kernel(output: int*, width: int, height: int, max_iter: int) {
    let px = block_id_x() * block_dim_x() + thread_id_x();
    let py = block_id_y() * block_dim_y() + thread_id_y();

    if px < width && py < height {
        // Map pixel to complex plane: real part in [-2.5, 1.0),
        // imaginary part in [-1.0, 1.0).
        let x0 = ((float)px / (float)width) * 3.5f - 2.5f;
        let y0 = ((float)py / (float)height) * 2.0f - 1.0f;

        let x = 0.0f;
        let y = 0.0f;
        let iter = 0;

        // Iterate z = z^2 + c
        for i in 0..max_iter {
            let x2 = x * x;
            let y2 = y * y;

            if (x2 + y2) > 4.0f {
                break;
            }

            let xtemp = x2 - y2 + x0;
            let xy2 = 2.0f * x * y;
            y = xy2 + y0;
            x = xtemp;
            iter = iter + 1;
        }

        output[py * width + px] = iter;
    }
}

// ========================================
// Helper: Print GPU Info
// ========================================

// Reports how many CUDA devices cuda_device_count() returns.
fn print_gpu_info() {
    let device_count = cuda_device_count();
    "Found {device_count} CUDA device(s)\n";
}

// ========================================
// Benchmark 1: Matrix Multiplication
// ========================================

// Multiplies two N x N float matrices on the GPU ten times, reports the
// elapsed time, and prints the first and last elements of the result.
//
// NOTE(review): alloc_n and cuda_alloc are called without an explicit element
// type although the buffers are indexed and copied as float — confirm whether
// a type argument is required here.
fn benchmark_matrix_multiply(N: int) {
    "Benchmark 1: Matrix Multiplication ({N}x{N}) ";

    let size = N * N;

    // Allocate host memory
    "-> Allocating host memory...";
    let h_A = alloc_n(size);
    let h_B = alloc_n(size);
    let h_C = alloc_n(size);
    defer free(h_A);
    defer free(h_B);
    defer free(h_C);

    // Initialize matrices with a repeating pattern of values in [0, 1).
    "-> Initializing {N}x{N} matrices...";
    for i in 0..size {
        h_A[i] = (float)(i % 100) / 100.0f;
        h_B[i] = (float)((i + 50) % 100) / 100.0f;
    }

    // Allocate device memory
    "-> Allocating device memory...";
    let d_A = cuda_alloc(size);
    let d_B = cuda_alloc(size);
    let d_C = cuda_alloc(size);
    defer cuda_free(d_A);
    defer cuda_free(d_B);
    defer cuda_free(d_C);

    // Copy to device
    "-> Copying data to GPU...";
    cuda_copy_to_device(d_A, h_A, size * sizeof(float));
    cuda_copy_to_device(d_B, h_B, size * sizeof(float));

    // Configure grid: enough BLOCK_SIZE-wide blocks to cover N (rounded up).
    def BLOCK_SIZE = 16;
    let blocks_per_grid = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    "-> Launching kernel: {blocks_per_grid}x{blocks_per_grid} blocks, {BLOCK_SIZE}x{BLOCK_SIZE} threads each";
    " GPU IS NOW WORKING HARD - Check nvtop!\n";

    let start = clock();

    // Run multiple iterations to keep GPU busy
    //
    // NOTE(review): matrix_multiply_kernel derives `row` from the y-dimension
    // builtins, but this launch passes scalar grid/block values. If scalars
    // configure only the x dimension, every thread has row == 0, only the
    // first row of C is computed, and C[N-1,N-1] printed below is never
    // written. Confirm how `launch` expresses a 2-D grid/block and use it.
    for iter in 0..10 {
        launch matrix_multiply_kernel(d_A, d_B, d_C, N) with {
            grid: blocks_per_grid,
            block: BLOCK_SIZE
        };
        cuda_sync();
        " Iteration {iter + 1}/10 complete";
    }

    // Convert before dividing: dividing the raw tick counts would truncate
    // the result to whole seconds.
    let elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
    " Completed in {elapsed} seconds\n";

    // Copy result back
    cuda_copy_to_host(h_C, d_C, size * sizeof(float));

    "-> Sample results:";
    " C[0,0] = {h_C[0]}";
    " C[N-1,N-1] = {h_C[size - 1]}";
}

// ========================================
// Benchmark 2: Monte Carlo Pi
// ========================================

// Launches the Monte Carlo kernel 100 times with num_samples points each,
// then estimates Pi as 4 * (hits / num_samples) and prints the error.
// Every launch overwrites d_results, so only the counts from the final
// launch feed the estimate; the earlier launches only generate GPU load.
fn benchmark_monte_carlo_pi(num_samples: u64) {
    "Benchmark 2: Monte Carlo Pi Estimation ";
    "-> Estimating Pi with {num_samples} samples";

    def BLOCK_SIZE = 256;
    def NUM_BLOCKS = 1024;
    let total_threads = BLOCK_SIZE * NUM_BLOCKS;

    // Allocate memory: one result slot per launched thread.
    let h_results = alloc_n(total_threads);
    defer free(h_results);

    let d_results = cuda_alloc(total_threads);
    defer cuda_free(d_results);

    "-> Launching kernel: {NUM_BLOCKS} blocks x {BLOCK_SIZE} threads";
    " GPU IS NOW WORKING HARD - Check nvtop!\n";

    let start = clock();

    // Run many iterations
    for iter in 0..100 {
        // Vary the seed per launch so successive launches differ.
        let seed = (u64)time(NULL) + (u64)iter;

        launch monte_carlo_pi_kernel(d_results, num_samples, seed) with {
            grid: NUM_BLOCKS,
            block: BLOCK_SIZE
        };
        cuda_sync();

        if iter % 10 == 0 {
            " Iteration {iter}/100 complete";
        }
    }

    // Convert before dividing: dividing the raw tick counts would truncate
    // the result to whole seconds.
    let elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
    "\n Completed in {elapsed} seconds\n";

    // Copy results and calculate Pi
    cuda_copy_to_host(h_results, d_results, total_threads * sizeof(float));

    let total_inside: u64 = 0;
    for i in 0..total_threads {
        total_inside = total_inside + (u64)h_results[i];
    }

    // The kernel draws exactly num_samples points in total, so num_samples is
    // the correct denominator for the hit ratio.
    let pi_estimate = 4.0 * (double)total_inside / (double)num_samples;
    let error = fabs(pi_estimate - 3.14159265359);

    "-> Results:";
    " Estimated Pi: {pi_estimate}";
    " Actual Pi: 3.14159265359";
    " Error: {error}";
}

// ========================================
// Benchmark 3: N-Body Simulation
// ========================================

// Places num_bodies bodies at random positions in [-1, 1]^3 with zero initial
// velocity, runs num_steps launches of nbody_kernel with dt = 0.01, reports
// the elapsed time, and prints the final positions of the first and last body.
fn benchmark_nbody(num_bodies: int, num_steps: int) {
    "Benchmark 3: N-Body Simulation ";
    "-> Simulating {num_bodies} bodies for {num_steps} steps";

    // Allocate host memory
    let h_x = alloc_n(num_bodies);
    let h_y = alloc_n(num_bodies);
    let h_z = alloc_n(num_bodies);
    let h_vx = alloc_n(num_bodies);
    let h_vy = alloc_n(num_bodies);
    let h_vz = alloc_n(num_bodies);
    defer free(h_x);
    defer free(h_y);
    defer free(h_z);
    defer free(h_vx);
    defer free(h_vy);
    defer free(h_vz);

    // Initialize random positions
    "-> Initializing random positions...";
    srand(time(NULL));
    for i in 0..num_bodies {
        h_x[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
        h_y[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
        h_z[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
        h_vx[i] = 0.0f;
        h_vy[i] = 0.0f;
        h_vz[i] = 0.0f;
    }

    // Allocate device memory
    "-> Allocating device memory...";
    let d_x = cuda_alloc(num_bodies);
    let d_y = cuda_alloc(num_bodies);
    let d_z = cuda_alloc(num_bodies);
    let d_vx = cuda_alloc(num_bodies);
    let d_vy = cuda_alloc(num_bodies);
    let d_vz = cuda_alloc(num_bodies);
    defer cuda_free(d_x);
    defer cuda_free(d_y);
    defer cuda_free(d_z);
    defer cuda_free(d_vx);
    defer cuda_free(d_vy);
    defer cuda_free(d_vz);

    // Copy to device
    cuda_copy_to_device(d_x, h_x, num_bodies * sizeof(float));
    cuda_copy_to_device(d_y, h_y, num_bodies * sizeof(float));
    cuda_copy_to_device(d_z, h_z, num_bodies * sizeof(float));
    cuda_copy_to_device(d_vx, h_vx, num_bodies * sizeof(float));
    cuda_copy_to_device(d_vy, h_vy, num_bodies * sizeof(float));
    cuda_copy_to_device(d_vz, h_vz, num_bodies * sizeof(float));

    // Enough BLOCK_SIZE-thread blocks to cover every body (rounded up).
    def BLOCK_SIZE = 256;
    let num_blocks = (num_bodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
    let dt = 0.01f;

    "-> Launching simulation: {num_blocks} blocks x {BLOCK_SIZE} threads";
    " GPU IS NOW WORKING HARD - Check nvtop!\n";

    let start = clock();

    for step in 0..num_steps {
        launch nbody_kernel(d_x, d_y, d_z, d_vx, d_vy, d_vz, num_bodies, dt) with {
            grid: num_blocks,
            block: BLOCK_SIZE
        };
        cuda_sync();

        if step % 100 == 0 {
            " Step {step}/{num_steps} complete";
        }
    }

    // Convert before dividing: dividing the raw tick counts would truncate
    // the result to whole seconds.
    let elapsed = (double)(clock() - start) / (double)CLOCKS_PER_SEC;
    "\n Completed in {elapsed} seconds\n";

    // Copy results back (positions only; velocities are not read back).
    cuda_copy_to_host(h_x, d_x, num_bodies * sizeof(float));
    cuda_copy_to_host(h_y, d_y, num_bodies * sizeof(float));
    cuda_copy_to_host(h_z, d_z, num_bodies * sizeof(float));

    "-> Sample final positions:";
    " Body 0: ({h_x[0]}, {h_y[0]}, {h_z[0]})";
    " Body {num_bodies-1}: ({h_x[num_bodies-1]}, {h_y[num_bodies-1]}, {h_z[num_bodies-1]})";
}

// Entry point: prints device info, waits for Enter, then runs the three
// benchmarks in sequence with a five-second pause between them.
fn main() {
    "Zen-C GPU Benchmark Suite ";

    print_gpu_info();

    "\n RUN THIS NOW: Open another terminal and run: nvtop";
    "\nPress Enter to start benchmarks...";
    getchar();

    // Run all benchmarks
    benchmark_matrix_multiply(2048);

    "\n Pause (5 seconds)...";
    sleep(5);

    benchmark_monte_carlo_pi(10000000000);

    "\n Pause (5 seconds)...";
    sleep(5);

    benchmark_nbody(4096, 1000);

    "Zen-C GPU Benchmark Suite - All tests completed.";
}