 README.md                       |  90
 examples/gpu/cuda_vector_add.zc |  73
 src/ast/ast.h                   |  16
 src/codegen/codegen.c           |  64
 src/codegen/codegen_utils.c     |  17
 src/main.c                      |   7
 src/parser/parser_core.c        |  24
 src/parser/parser_stmt.c        | 111
 src/zprep.h                     |   1
 std/cuda.zc                     | 113
 std/mem.zc                      |   6
 11 files changed, 517 insertions, 5 deletions
diff --git a/README.md b/README.md
@@ -53,6 +53,10 @@ Join the discussion, share demos, ask questions, or report bugs in the official
   - [14. Inline Assembly](#14-inline-assembly)
   - [15. Build Directives](#15-build-directives)
   - [Compiler Support & Compatibility](#compiler-support--compatibility)
+    - [Test Suite Status](#test-suite-status)
+    - [Building with Zig](#building-with-zig)
+    - [C++ Interop](#c-interop)
+    - [CUDA Interop](#cuda-interop)
   - [Contributing](#contributing)
 
 ---
@@ -680,6 +684,92 @@ fn main() {
 
 > **Note:** The `--cpp` flag switches the backend to `g++` and emits C++-compatible code (uses `auto` instead of `__auto_type`, function overloads instead of `_Generic`, and explicit casts for `void*`).
 
+### CUDA Interop
+
+Zen C can generate CUDA-compatible code with the `--cuda` flag, allowing you to write GPU kernels in Zen C syntax.
+
+```bash
+# Direct compilation with nvcc
+zc run app.zc --cuda
+
+# Or transpile for a manual build
+zc transpile app.zc --cuda -o app.cu
+nvcc app.cu -o app
+```
+
+#### CUDA-Specific Attributes
+
+| Attribute | CUDA Equivalent | Description |
+|:---|:---|:---|
+| `@global` | `__global__` | Kernel function (runs on GPU, called from host) |
+| `@device` | `__device__` | Device function (runs on GPU, called from GPU) |
+| `@host` | `__host__` | Host function (explicitly CPU-only) |
+
+#### Kernel Launch Syntax
+
+Zen C provides a clean `launch` statement for invoking CUDA kernels:
+
+```zc
+launch kernel_name(args) with {
+    grid: num_blocks,
+    block: threads_per_block,
+    shared_mem: 1024, // Optional
+    stream: my_stream // Optional
+};
+```
+
+This transpiles to `kernel_name<<<grid, block, shared, stream>>>(args);`.
+
+#### Writing CUDA Kernels
+
+Use Zen C function syntax with `@global` and the `launch` statement:
+
+```zc
+include <cuda_runtime.h>
+import "std/cuda.zc"
+
+@global
+fn add_kernel(a: float*, b: float*, c: float*, n: int) {
+    // You can use raw CUDA C or the std/cuda.zc helpers
+    var i = thread_id();
+    if i < n {
+        c[i] = a[i] + b[i];
+    }
+}
+
+fn main() {
+    // ... allocation ...
+    launch add_kernel(d_a, d_b, d_c, N) with {
+        grid: num_blocks,
+        block: 256
+    };
+    // ... cleanup ...
+}
+```
+
+#### Standard Library (`std/cuda.zc`)
+
+Zen C ships a standard library for common CUDA operations, reducing the need for `raw` blocks:
+
+```zc
+import "std/cuda.zc"
+
+// Memory management
+var d_ptr = cuda_alloc<float>(1024);
+cuda_copy_to_device(d_ptr, h_ptr, 1024 * sizeof(float));
+defer cuda_free(d_ptr);
+
+// Synchronization
+cuda_sync();
+
+// Thread indexing (use inside kernels)
+var i = thread_id();   // Global index
+var bid = block_id();
+var tid = local_id();
+```
+
+> **Note:** The `--cuda` flag sets `nvcc` as the compiler and implies `--cpp` mode. Requires the NVIDIA CUDA Toolkit.
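+
+Kernel launch errors are reported asynchronously through the CUDA error state, so they only surface after a synchronization point. A minimal post-launch check built from the error helpers added in `std/cuda.zc` (the `check_launch` wrapper is a hypothetical name, not part of the library):
+
+```zc
+import "std/cuda.zc"
+
+fn check_launch() {
+    cuda_sync(); // launch errors surface only after synchronization
+    var err = cuda_last_error();
+    if err != 0 {
+        !"-> CUDA error code: {err}";
+    }
+}
+```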
+
 
 ---
 
 ## Contributing
diff --git a/examples/gpu/cuda_vector_add.zc b/examples/gpu/cuda_vector_add.zc
new file mode 100644
index 0000000..de75a74
--- /dev/null
+++ b/examples/gpu/cuda_vector_add.zc
@@ -0,0 +1,73 @@
+
+// Compile with: zc run cuda_vector_add.zc --cuda
+
+//> cflags: -arch=sm_75
+
+import "std/cuda.zc"
+import "std/mem.zc"
+
+@global
+fn add_kernel(a: float*, b: float*, c: float*, n: int) {
+    var i = thread_id();
+    if i < n {
+        c[i] = a[i] + b[i];
+    }
+}
+
+fn main() {
+    const N = 1024;
+
+    "=> Zen C CUDA Vector Addition";
+    "-> Vector size: {N} elements";
+
+    var h_a = alloc_n<float>(N);
+    var h_b = alloc_n<float>(N);
+    var h_c = alloc_n<float>(N);
+    defer free(h_a);
+    defer free(h_b);
+    defer free(h_c);
+
+    for i in 0..N {
+        h_a[i] = (float)i;
+        h_b[i] = (float)(i * 2);
+    }
+
+    "-> Allocating device memory...";
+    var d_a = cuda_alloc<float>(N);
+    var d_b = cuda_alloc<float>(N);
+    var d_c = cuda_alloc<float>(N);
+    defer cuda_free(d_a);
+    defer cuda_free(d_b);
+    defer cuda_free(d_c);
+
+    cuda_copy_to_device(d_a, h_a, N * sizeof(float));
+    cuda_copy_to_device(d_b, h_b, N * sizeof(float));
+
+    const BLOCK_SIZE = 256;
+    var num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    "-> Launching: {num_blocks} blocks x {BLOCK_SIZE} threads";
+
+    launch add_kernel(d_a, d_b, d_c, N) with {
+        grid: num_blocks,
+        block: BLOCK_SIZE
+    };
+
+    cuda_sync();
+
+    cuda_copy_to_host(h_c, d_c, N * sizeof(float));
+
+    "-> Verifying...";
+    var ok: int = 1;
+    for i in 0..10 {
+        var expected = h_a[i] + h_b[i];
+        if h_c[i] != expected {
+            !"-> Mismatch at {i}";
+            ok = 0;
+        }
+    }
+
+    if ok {
+        "-> All checks passed!";
+    }
+}
diff --git a/src/ast/ast.h b/src/ast/ast.h
index 2288860..cef68c6 100644
--- a/src/ast/ast.h
+++ b/src/ast/ast.h
@@ -122,7 +122,8 @@ typedef enum
     NODE_TRY,
     NODE_REFLECTION,
     NODE_AWAIT,
-    NODE_REPL_PRINT
+    NODE_REPL_PRINT,
+    NODE_CUDA_LAUNCH
 } NodeType;
 
 // ** AST Node Structure **
@@ -176,6 +177,10 @@ struct ASTNode
         char *section;   // @section("name")
         int is_async;    // async function
         int is_comptime; // @comptime function
+        // CUDA qualifiers
+        int cuda_global; // @global -> __global__
+        int cuda_device; // @device -> __device__
+        int cuda_host;   // @host -> __host__
     } func;
 
     struct
@@ -539,6 +544,15 @@ struct ASTNode
     {
         ASTNode *expr;
     } repl_print;
+
+    struct
+    {
+        ASTNode *call;       // The kernel call (NODE_EXPR_CALL)
+        ASTNode *grid;       // Grid dimensions expression
+        ASTNode *block;      // Block dimensions expression
+        ASTNode *shared_mem; // Optional shared memory size (NULL = default)
+        ASTNode *stream;     // Optional CUDA stream (NULL = default)
+    } cuda_launch;
     };
 };
diff --git a/src/codegen/codegen.c b/src/codegen/codegen.c
index 644e9f5..a371548 100644
--- a/src/codegen/codegen.c
+++ b/src/codegen/codegen.c
@@ -2252,6 +2252,70 @@ void codegen_node_single(ParserContext *ctx, ASTNode *node, FILE *out)
         fprintf(out, ";\n");
     }
     break;
+    case NODE_CUDA_LAUNCH:
+    {
+        // Emit CUDA kernel launch: kernel<<<grid, block, shared, stream>>>(args);
+        ASTNode *call = node->cuda_launch.call;
+
+        // Get kernel name from callee
+        if (call->call.callee->type == NODE_EXPR_VAR)
+        {
+            fprintf(out, " %s<<<", call->call.callee->var_ref.name);
+        }
+        else
+        {
+            fprintf(out, " ");
+            codegen_expression(ctx, call->call.callee, out);
+            fprintf(out, "<<<");
+        }
+
+        // Grid dimension
+        codegen_expression(ctx, node->cuda_launch.grid, out);
+        fprintf(out, ", ");
+
+        // Block dimension
+        codegen_expression(ctx, node->cuda_launch.block, out);
+
+        // Optional shared memory size
+        if (node->cuda_launch.shared_mem ||
+            node->cuda_launch.stream)
+        {
+            fprintf(out, ", ");
+            if (node->cuda_launch.shared_mem)
+            {
+                codegen_expression(ctx, node->cuda_launch.shared_mem, out);
+            }
+            else
+            {
+                fprintf(out, "0");
+            }
+        }
+
+        // Optional CUDA stream
+        if (node->cuda_launch.stream)
+        {
+            fprintf(out, ", ");
+            codegen_expression(ctx, node->cuda_launch.stream, out);
+        }
+
+        fprintf(out, ">>>(");
+
+        // Arguments
+        ASTNode *arg = call->call.args;
+        int first = 1;
+        while (arg)
+        {
+            if (!first)
+            {
+                fprintf(out, ", ");
+            }
+            codegen_expression(ctx, arg, out);
+            first = 0;
+            arg = arg->next;
+        }
+
+        fprintf(out, ");\n");
+        break;
+    }
     default:
         codegen_expression(ctx, node, out);
         fprintf(out, ";\n");
diff --git a/src/codegen/codegen_utils.c b/src/codegen/codegen_utils.c
index b1fcf4c..af1c862 100644
--- a/src/codegen/codegen_utils.c
+++ b/src/codegen/codegen_utils.c
@@ -535,6 +535,23 @@ void emit_func_signature(FILE *out, ASTNode *func, const char *name_override)
         return;
     }
 
+    // Emit CUDA qualifiers (for both forward declarations and definitions)
+    if (g_config.use_cuda)
+    {
+        if (func->func.cuda_global)
+        {
+            fprintf(out, "__global__ ");
+        }
+        if (func->func.cuda_device)
+        {
+            fprintf(out, "__device__ ");
+        }
+        if (func->func.cuda_host)
+        {
+            fprintf(out, "__host__ ");
+        }
+    }
+
     // Return type
     char *ret_str;
     if (func->func.ret_type_info)
diff --git a/src/main.c b/src/main.c
@@ -48,6 +48,7 @@ void print_usage()
     printf("  -q, --quiet   Quiet output\n");
     printf("  -c            Compile only (produce .o)\n");
    printf("  --cpp         Use C++ mode.\n");
+    printf("  --cuda        Use CUDA mode (requires nvcc).\n");
 }
 
 int main(int argc, char **argv)
@@ -147,6 +148,12 @@ int main(int argc, char **argv)
         strcpy(g_config.cc, "g++");
         g_config.use_cpp = 1;
     }
+    else if (strcmp(arg, "--cuda") == 0)
+    {
+        strcpy(g_config.cc, "nvcc");
+        g_config.use_cuda = 1;
+        g_config.use_cpp = 1; // CUDA implies C++ mode
+    }
     else if (strcmp(arg, "--check") == 0)
     {
         g_config.mode_check = 1;
diff --git a/src/parser/parser_core.c b/src/parser/parser_core.c
index 3e683fb..c3c91fe 100644
--- a/src/parser/parser_core.c
+++ b/src/parser/parser_core.c
@@ -63,6 +63,9 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
     int attr_weak = 0;
     int attr_export = 0;
     int attr_comptime = 0;
+    int attr_cuda_global = 0; // @global -> __global__
+    int attr_cuda_device = 0; // @device -> __device__
+    int attr_cuda_host = 0;   // @host -> __host__
 
     char *deprecated_msg = NULL;
     char *attr_section = NULL;
@@ -232,7 +235,23 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
         }
         else
         {
-            zwarn_at(attr, "Unknown attribute: %.*s", attr.len, attr.start);
+            // Check for CUDA attributes
+            if (0 == strncmp(attr.start, "global", 6) && 6 == attr.len)
+            {
+                attr_cuda_global = 1;
+            }
+            else if (0 == strncmp(attr.start, "device", 6) && 6 == attr.len)
+            {
+                attr_cuda_device = 1;
+            }
+            else if (0 == strncmp(attr.start, "host", 4) && 4 == attr.len)
+            {
+                attr_cuda_host = 1;
+            }
+            else
+            {
+                zwarn_at(attr, "Unknown attribute: %.*s", attr.len, attr.start);
+            }
         }
 
         t = lexer_peek(l);
@@ -469,6 +488,9 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
     s->func.pure = attr_pure;
     s->func.section = attr_section;
     s->func.is_comptime = attr_comptime;
+    s->func.cuda_global = attr_cuda_global;
+    s->func.cuda_device = attr_cuda_device;
+    s->func.cuda_host = attr_cuda_host;
 
     if (attr_deprecated && s->func.name)
     {
diff --git a/src/parser/parser_stmt.c b/src/parser/parser_stmt.c
index 5307768..daf3f72 100644
--- a/src/parser/parser_stmt.c
+++ b/src/parser/parser_stmt.c
@@ -2503,6 +2503,117 @@ ASTNode *parse_statement(ParserContext *ctx, Lexer *l)
         return parse_guard(ctx, l);
     }
 
+    // CUDA launch: launch kernel(args) with { grid: X, block: Y };
+    if (strncmp(tk.start, "launch", 6) == 0 && tk.len == 6)
+    {
+        Token launch_tok = lexer_next(l); // eat 'launch'
+
+        // Parse the kernel call expression
+        ASTNode *call = parse_expression(ctx, l);
+        if (!call || call->type != NODE_EXPR_CALL)
+        {
+            zpanic_at(launch_tok, "Expected kernel call after 'launch'");
+        }
+
+        // Expect 'with'
+        Token with_tok = lexer_peek(l);
+        if (with_tok.type != TOK_IDENT || strncmp(with_tok.start, "with", 4) != 0 ||
+            with_tok.len != 4)
+        {
+            zpanic_at(with_tok, "Expected 'with' after kernel call in launch statement");
+        }
+        lexer_next(l); // eat 'with'
+
+        // Expect '{' for configuration block
+        if (lexer_peek(l).type != TOK_LBRACE)
+        {
+            zpanic_at(lexer_peek(l), "Expected '{' after 'with' in launch statement");
+        }
+        lexer_next(l); // eat '{'
+
+        ASTNode *grid = NULL;
+        ASTNode *block = NULL;
+        ASTNode *shared_mem = NULL;
+        ASTNode *stream = NULL;
+
+        // Parse configuration fields
+        while (lexer_peek(l).type != TOK_RBRACE && lexer_peek(l).type != TOK_EOF)
+        {
+            Token field_name = lexer_next(l);
+            if (field_name.type != TOK_IDENT)
+            {
+                zpanic_at(field_name, "Expected field name in launch configuration");
+            }
+
+            // Expect ':'
+            if (lexer_peek(l).type != TOK_COLON)
+            {
+                zpanic_at(lexer_peek(l), "Expected ':' after field name");
+            }
+            lexer_next(l); // eat ':'
+
+            // Parse value expression
+            ASTNode *value = parse_expression(ctx, l);
+
+            // Assign to appropriate field
+            if (strncmp(field_name.start, "grid", 4) == 0 && field_name.len == 4)
+            {
+                grid = value;
+            }
+            else if (strncmp(field_name.start, "block", 5) == 0 && field_name.len == 5)
+            {
+                block = value;
+            }
+            else if (strncmp(field_name.start, "shared_mem", 10) == 0 && field_name.len == 10)
+            {
+                shared_mem = value;
+            }
+            else if (strncmp(field_name.start, "stream", 6) == 0 && field_name.len == 6)
+            {
+                stream = value;
+            }
+            else
+            {
+                zpanic_at(field_name, "Unknown launch configuration field (expected: grid, "
+                                      "block, shared_mem, stream)");
+            }
+
+            // Optional comma
+            if (lexer_peek(l).type == TOK_COMMA)
+            {
+                lexer_next(l);
+            }
+        }
+
+        // Expect '}'
+        if (lexer_peek(l).type != TOK_RBRACE)
+        {
+            zpanic_at(lexer_peek(l), "Expected '}' to close launch configuration");
+        }
+        lexer_next(l); // eat '}'
+
+        // Optional ';'
+        if (lexer_peek(l).type == TOK_SEMICOLON)
+        {
+            lexer_next(l);
+        }
+
+        // Require at least grid and block
+        if (!grid || !block)
+        {
+            zpanic_at(launch_tok, "Launch configuration requires at least 'grid' and 'block'");
+        }
+
+        ASTNode *n =
+            ast_create(NODE_CUDA_LAUNCH);
+        n->cuda_launch.call = call;
+        n->cuda_launch.grid = grid;
+        n->cuda_launch.block = block;
+        n->cuda_launch.shared_mem = shared_mem;
+        n->cuda_launch.stream = stream;
+        n->token = launch_tok;
+        return n;
+    }
+
     // Do-while loop: do { body } while condition;
     if (strncmp(tk.start, "do", 2) == 0 && tk.len == 2)
     {
diff --git a/src/zprep.h b/src/zprep.h
index f9bb6b6..18c4c51 100644
--- a/src/zprep.h
+++ b/src/zprep.h
@@ -182,6 +182,7 @@ typedef struct
     int is_freestanding; // 1 if --freestanding.
     int mode_transpile;  // 1 if 'transpile' command.
     int use_cpp;         // 1 if --cpp (emit C++ compatible code).
+    int use_cuda;        // 1 if --cuda (emit CUDA-compatible code).
 
     // GCC Flags accumulator.
     char gcc_flags[4096];
diff --git a/std/cuda.zc b/std/cuda.zc
new file mode 100644
index 0000000..c010302
--- /dev/null
+++ b/std/cuda.zc
@@ -0,0 +1,113 @@
+
+include <cuda_runtime.h>
+
+// Memory Management
+
+/// Allocate device memory for n elements of type T.
+fn cuda_alloc<T>(n: usize) -> T* {
+    var ptr: T* = NULL;
+    cudaMalloc((void**)&ptr, n * sizeof(T));
+    return ptr;
+}
+
+/// Free device memory.
+fn cuda_free(ptr: void*) {
+    cudaFree(ptr);
+}
+
+/// Copy bytes from host to device.
+fn cuda_copy_to_device(dst: void*, src: void*, bytes: usize) {
+    cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
+}
+
+/// Copy bytes from device to host.
+fn cuda_copy_to_host(dst: void*, src: void*, bytes: usize) {
+    cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
+}
+
+/// Copy bytes between device buffers.
+fn cuda_copy_device(dst: void*, src: void*, bytes: usize) {
+    cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice);
+}
+
+/// Set device memory to zero.
+fn cuda_zero(ptr: void*, bytes: usize) {
+    cudaMemset(ptr, 0, bytes);
+}
+
+// Synchronization
+
+/// Synchronize the device (wait for all kernels to complete).
+fn cuda_sync() {
+    cudaDeviceSynchronize();
+}
+
+// Thread Indexing (for use inside @global/@device functions)
+
+/// Get 1D global thread index (blockIdx.x * blockDim.x + threadIdx.x).
+@device @inline
+fn thread_id() -> int {
+    var id: int;
+    id = blockIdx.x * blockDim.x + threadIdx.x;
+    return id;
+}
+
+/// Get 1D block index.
+@device @inline
+fn block_id() -> int {
+    var id: int;
+    id = blockIdx.x;
+    return id;
+}
+
+/// Get 1D thread index within the block.
+@device @inline
+fn local_id() -> int {
+    var id: int;
+    id = threadIdx.x;
+    return id;
+}
+
+/// Get block size (number of threads per block).
+@device @inline
+fn block_size() -> int {
+    var size: int;
+    size = blockDim.x;
+    return size;
+}
+
+/// Get grid size (number of blocks).
+@device @inline
+fn grid_size() -> int {
+    var size: int;
+    size = gridDim.x;
+    return size;
+}
+
+// Device Info
+
+/// Get the number of CUDA devices.
+fn cuda_device_count() -> int {
+    var count: int = 0;
+    cudaGetDeviceCount(&count);
+    return count;
+}
+
+/// Set the active CUDA device.
+fn cuda_set_device(id: int) {
+    cudaSetDevice(id);
+}
+
+// Error Handling
+
+/// Get the last CUDA error code.
+fn cuda_last_error() -> int {
+    var err: int;
+    err = (int)cudaGetLastError();
+    return err;
+}
+
+/// Check whether the last CUDA operation succeeded.
+fn cuda_ok() -> bool {
+    return cuda_last_error() == 0;
+}
diff --git a/std/mem.zc b/std/mem.zc
@@ -1,14 +1,14 @@
 fn alloc<T>() -> T* {
-    return malloc(sizeof(T));
+    return (T*)malloc(sizeof(T));
 }
 
 fn zalloc<T>() -> T* {
-    return calloc(1, sizeof(T));
+    return (T*)calloc(1, sizeof(T));
 }
 
 fn alloc_n<T>(n: usize) -> T* {
-    return malloc(sizeof(T) * n);
+    return (T*)malloc(sizeof(T) * n);
 }
 
 struct Box<T> {
