-rw-r--r--  README.md                        |  90
-rw-r--r--  examples/gpu/cuda_vector_add.zc  |  73
-rw-r--r--  src/ast/ast.h                    |  16
-rw-r--r--  src/codegen/codegen.c            |  64
-rw-r--r--  src/codegen/codegen_utils.c      |  17
-rw-r--r--  src/main.c                       |   7
-rw-r--r--  src/parser/parser_core.c         |  24
-rw-r--r--  src/parser/parser_stmt.c         | 111
-rw-r--r--  src/zprep.h                      |   1
-rw-r--r--  std/cuda.zc                      | 113
-rw-r--r--  std/mem.zc                       |   6
11 files changed, 517 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 8a7629b..69eee0a 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,10 @@ Join the discussion, share demos, ask questions, or report bugs in the official
- [14. Inline Assembly](#14-inline-assembly)
- [15. Build Directives](#15-build-directives)
- [Compiler Support & Compatibility](#compiler-support--compatibility)
+ - [Test Suite Status](#test-suite-status)
+ - [Building with Zig](#building-with-zig)
+ - [C++ Interop](#c-interop)
+ - [CUDA Interop](#cuda-interop)
- [Contributing](#contributing)
---
@@ -679,6 +683,92 @@ fn main() {
> **Note:** The `--cpp` flag switches the backend to `g++` and emits C++-compatible code (uses `auto` instead of `__auto_type`, function overloads instead of `_Generic`, and explicit casts for `void*`).
+### CUDA Interop
+
+Zen C can generate CUDA-compatible code with the `--cuda` flag, allowing you to write GPU kernels using Zen C syntax.
+
+```bash
+# Direct compilation with nvcc
+zc run app.zc --cuda
+
+# Or transpile for manual build
+zc transpile app.zc --cuda -o app.cu
+nvcc app.cu -o app
+```
+
+#### CUDA-Specific Attributes
+
+| Attribute | CUDA Equivalent | Description |
+|:---|:---|:---|
+| `@global` | `__global__` | Kernel function (runs on GPU, called from host) |
+| `@device` | `__device__` | Device function (runs on GPU, called from GPU) |
+| `@host` | `__host__` | Host function (explicit CPU-only) |
+
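+A `@device` helper can be called from a kernel or from another device function. A minimal sketch (the `scale` helper and `scale_kernel` are hypothetical, shown only to illustrate the attributes):
+
+```zc
+import "std/cuda.zc" // for thread_id()
+
+@device
+fn scale(x: float, factor: float) -> float {
+ return x * factor;
+}
+
+@global
+fn scale_kernel(data: float*, factor: float, n: int) {
+ var i = thread_id();
+ if i < n {
+ data[i] = scale(data[i], factor);
+ }
+}
+```
+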
+#### Kernel Launch Syntax
+
+Zen C provides a clean `launch` statement for invoking CUDA kernels:
+
+```zc
+launch kernel_name(args) with {
+ grid: num_blocks,
+ block: threads_per_block,
+ shared_mem: 1024, // Optional
+ stream: my_stream // Optional
+};
+```
+
+This transpiles to `kernel_name<<<grid, block, shared, stream>>>(args);`. Omitted optional fields fall back to the CUDA defaults; if `stream` is given without `shared_mem`, the shared-memory argument is emitted as `0`.
+
+#### Writing CUDA Kernels
+
+Use Zen C function syntax with `@global` and the `launch` statement:
+
+```zc
+include <cuda_runtime.h>
+import "std/cuda.zc"
+
+@global
+fn add_kernel(a: float*, b: float*, c: float*, n: int) {
+ // You can use raw CUDA C or the std/cuda.zc helpers
+ var i = thread_id();
+ if i < n {
+ c[i] = a[i] + b[i];
+ }
+}
+
+fn main() {
+ // ... allocation ...
+ launch add_kernel(d_a, d_b, d_c, N) with {
+ grid: num_blocks,
+ block: 256
+ };
+ // ... cleanup ...
+}
+```
+
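+To cover all `n` elements, the grid is typically sized with the ceiling-division idiom used in the bundled example:
+
+```zc
+const BLOCK_SIZE = 256;
+var num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE; // ceil(N / BLOCK_SIZE)
+```
+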
+#### Standard Library (`std/cuda.zc`)
+
+Zen C provides a standard library for common CUDA operations, reducing the need for `raw` blocks:
+
+```zc
+import "std/cuda.zc"
+
+// Memory management
+var d_ptr = cuda_alloc<float>(1024);
+cuda_copy_to_device(d_ptr, h_ptr, 1024 * sizeof(float));
+defer cuda_free(d_ptr);
+
+// Synchronization
+cuda_sync();
+
+// Thread Indexing (use inside kernels)
+var i = thread_id(); // Global index
+var bid = block_id();
+var tid = local_id();
+```
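+
+The error helpers compose with `launch` and `cuda_sync()`. A minimal sketch of checking a kernel launch (the kernel name and sizes are placeholders):
+
+```zc
+launch my_kernel(d_ptr, n) with {
+ grid: num_blocks,
+ block: 256
+};
+cuda_sync();
+if cuda_last_error() != 0 {
+ !"-> CUDA kernel failed";
+}
+```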
+
+> **Note:** The `--cuda` flag sets `nvcc` as the compiler and implies `--cpp` mode. Requires the NVIDIA CUDA Toolkit.
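+
+Architecture-specific flags can be forwarded to `nvcc` via a build directive at the top of the source file, as the bundled example does:
+
+```zc
+//> cflags: -arch=sm_75
+```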
+
---
## Contributing
diff --git a/examples/gpu/cuda_vector_add.zc b/examples/gpu/cuda_vector_add.zc
new file mode 100644
index 0000000..de75a74
--- /dev/null
+++ b/examples/gpu/cuda_vector_add.zc
@@ -0,0 +1,73 @@
+
+// Build and run with: zc run cuda_vector_add.zc --cuda
+
+//> cflags: -arch=sm_75
+
+import "std/cuda.zc"
+import "std/mem.zc"
+
+@global
+fn add_kernel(a: float*, b: float*, c: float*, n: int) {
+ var i = thread_id();
+ if i < n {
+ c[i] = a[i] + b[i];
+ }
+}
+
+fn main() {
+ const N = 1024;
+
+ "=> Zen C CUDA Vector Addition";
+ "-> Vector size: {N} elements";
+
+ var h_a = alloc_n<float>(N);
+ var h_b = alloc_n<float>(N);
+ var h_c = alloc_n<float>(N);
+ defer free(h_a);
+ defer free(h_b);
+ defer free(h_c);
+
+ for i in 0..N {
+ h_a[i] = (float)i;
+ h_b[i] = (float)(i * 2);
+ }
+
+ "-> Allocating device memory...";
+ var d_a = cuda_alloc<float>(N);
+ var d_b = cuda_alloc<float>(N);
+ var d_c = cuda_alloc<float>(N);
+ defer cuda_free(d_a);
+ defer cuda_free(d_b);
+ defer cuda_free(d_c);
+
+ cuda_copy_to_device(d_a, h_a, N * sizeof(float));
+ cuda_copy_to_device(d_b, h_b, N * sizeof(float));
+
+ const BLOCK_SIZE = 256;
+ var num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+ "-> Launching: {num_blocks} blocks x {BLOCK_SIZE} threads";
+
+ launch add_kernel(d_a, d_b, d_c, N) with {
+ grid: num_blocks,
+ block: BLOCK_SIZE
+ };
+
+ cuda_sync();
+
+ cuda_copy_to_host(h_c, d_c, N * sizeof(float));
+
+ "-> Verifying...";
+ var ok: int = 1;
+ for i in 0..10 {
+ var expected = h_a[i] + h_b[i];
+ if h_c[i] != expected {
+ !"-> Mismatch at {i}";
+ ok = 0;
+ }
+ }
+
+ if ok {
+ "-> All checks passed!";
+ }
+}
diff --git a/src/ast/ast.h b/src/ast/ast.h
index 2288860..cef68c6 100644
--- a/src/ast/ast.h
+++ b/src/ast/ast.h
@@ -122,7 +122,8 @@ typedef enum
NODE_TRY,
NODE_REFLECTION,
NODE_AWAIT,
- NODE_REPL_PRINT
+ NODE_REPL_PRINT,
+ NODE_CUDA_LAUNCH
} NodeType;
// ** AST Node Structure **
@@ -176,6 +177,10 @@ struct ASTNode
char *section; // @section("name")
int is_async; // async function
int is_comptime; // @comptime function
+ // CUDA qualifiers
+ int cuda_global; // @global -> __global__
+ int cuda_device; // @device -> __device__
+ int cuda_host; // @host -> __host__
} func;
struct
@@ -539,6 +544,15 @@ struct ASTNode
{
ASTNode *expr;
} repl_print;
+
+ struct
+ {
+ ASTNode *call; // The kernel call (NODE_EXPR_CALL)
+ ASTNode *grid; // Grid dimensions expression
+ ASTNode *block; // Block dimensions expression
+ ASTNode *shared_mem; // Optional shared memory size (NULL = default)
+ ASTNode *stream; // Optional CUDA stream (NULL = default)
+ } cuda_launch;
};
};
diff --git a/src/codegen/codegen.c b/src/codegen/codegen.c
index 644e9f5..a371548 100644
--- a/src/codegen/codegen.c
+++ b/src/codegen/codegen.c
@@ -2252,6 +2252,70 @@ void codegen_node_single(ParserContext *ctx, ASTNode *node, FILE *out)
fprintf(out, ";\n");
}
break;
+ case NODE_CUDA_LAUNCH:
+ {
+ // Emit CUDA kernel launch: kernel<<<grid, block, shared, stream>>>(args);
+ ASTNode *call = node->cuda_launch.call;
+
+ // Get kernel name from callee
+ if (call->call.callee->type == NODE_EXPR_VAR)
+ {
+ fprintf(out, " %s<<<", call->call.callee->var_ref.name);
+ }
+ else
+ {
+ fprintf(out, " ");
+ codegen_expression(ctx, call->call.callee, out);
+ fprintf(out, "<<<");
+ }
+
+ // Grid dimension
+ codegen_expression(ctx, node->cuda_launch.grid, out);
+ fprintf(out, ", ");
+
+ // Block dimension
+ codegen_expression(ctx, node->cuda_launch.block, out);
+
+ // Optional shared memory size
+ if (node->cuda_launch.shared_mem || node->cuda_launch.stream)
+ {
+ fprintf(out, ", ");
+ if (node->cuda_launch.shared_mem)
+ {
+ codegen_expression(ctx, node->cuda_launch.shared_mem, out);
+ }
+ else
+ {
+ fprintf(out, "0");
+ }
+ }
+
+ // Optional CUDA stream
+ if (node->cuda_launch.stream)
+ {
+ fprintf(out, ", ");
+ codegen_expression(ctx, node->cuda_launch.stream, out);
+ }
+
+ fprintf(out, ">>>(");
+
+ // Arguments
+ ASTNode *arg = call->call.args;
+ int first = 1;
+ while (arg)
+ {
+ if (!first)
+ {
+ fprintf(out, ", ");
+ }
+ codegen_expression(ctx, arg, out);
+ first = 0;
+ arg = arg->next;
+ }
+
+ fprintf(out, ");\n");
+ break;
+ }
default:
codegen_expression(ctx, node, out);
fprintf(out, ";\n");
diff --git a/src/codegen/codegen_utils.c b/src/codegen/codegen_utils.c
index b1fcf4c..af1c862 100644
--- a/src/codegen/codegen_utils.c
+++ b/src/codegen/codegen_utils.c
@@ -535,6 +535,23 @@ void emit_func_signature(FILE *out, ASTNode *func, const char *name_override)
return;
}
+ // Emit CUDA qualifiers (for both forward declarations and definitions)
+ if (g_config.use_cuda)
+ {
+ if (func->func.cuda_global)
+ {
+ fprintf(out, "__global__ ");
+ }
+ if (func->func.cuda_device)
+ {
+ fprintf(out, "__device__ ");
+ }
+ if (func->func.cuda_host)
+ {
+ fprintf(out, "__host__ ");
+ }
+ }
+
// Return type
char *ret_str;
if (func->func.ret_type_info)
diff --git a/src/main.c b/src/main.c
index edc3723..2b7dcf0 100644
--- a/src/main.c
+++ b/src/main.c
@@ -48,6 +48,7 @@ void print_usage()
printf(" -q, --quiet Quiet output\n");
printf(" -c Compile only (produce .o)\n");
printf(" --cpp Use C++ mode.\n");
+ printf(" --cuda Use CUDA mode (requires nvcc).\n");
}
int main(int argc, char **argv)
@@ -147,6 +148,12 @@ int main(int argc, char **argv)
strcpy(g_config.cc, "g++");
g_config.use_cpp = 1;
}
+ else if (strcmp(arg, "--cuda") == 0)
+ {
+ strcpy(g_config.cc, "nvcc");
+ g_config.use_cuda = 1;
+ g_config.use_cpp = 1; // CUDA implies C++ mode
+ }
else if (strcmp(arg, "--check") == 0)
{
g_config.mode_check = 1;
diff --git a/src/parser/parser_core.c b/src/parser/parser_core.c
index 3e683fb..c3c91fe 100644
--- a/src/parser/parser_core.c
+++ b/src/parser/parser_core.c
@@ -63,6 +63,9 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
int attr_weak = 0;
int attr_export = 0;
int attr_comptime = 0;
+ int attr_cuda_global = 0; // @global -> __global__
+ int attr_cuda_device = 0; // @device -> __device__
+ int attr_cuda_host = 0; // @host -> __host__
char *deprecated_msg = NULL;
char *attr_section = NULL;
@@ -232,7 +235,23 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
}
else
{
- zwarn_at(attr, "Unknown attribute: %.*s", attr.len, attr.start);
+ // Check for CUDA attributes
+ if (strncmp(attr.start, "global", 6) == 0 && attr.len == 6)
+ {
+ attr_cuda_global = 1;
+ }
+ else if (strncmp(attr.start, "device", 6) == 0 && attr.len == 6)
+ {
+ attr_cuda_device = 1;
+ }
+ else if (strncmp(attr.start, "host", 4) == 0 && attr.len == 4)
+ {
+ attr_cuda_host = 1;
+ }
+ else
+ {
+ zwarn_at(attr, "Unknown attribute: %.*s", attr.len, attr.start);
+ }
}
t = lexer_peek(l);
@@ -469,6 +488,9 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
s->func.pure = attr_pure;
s->func.section = attr_section;
s->func.is_comptime = attr_comptime;
+ s->func.cuda_global = attr_cuda_global;
+ s->func.cuda_device = attr_cuda_device;
+ s->func.cuda_host = attr_cuda_host;
if (attr_deprecated && s->func.name)
{
diff --git a/src/parser/parser_stmt.c b/src/parser/parser_stmt.c
index 5307768..daf3f72 100644
--- a/src/parser/parser_stmt.c
+++ b/src/parser/parser_stmt.c
@@ -2503,6 +2503,117 @@ ASTNode *parse_statement(ParserContext *ctx, Lexer *l)
return parse_guard(ctx, l);
}
+ // CUDA launch: launch kernel(args) with { grid: X, block: Y };
+ if (strncmp(tk.start, "launch", 6) == 0 && tk.len == 6)
+ {
+ Token launch_tok = lexer_next(l); // eat 'launch'
+
+ // Parse the kernel call expression
+ ASTNode *call = parse_expression(ctx, l);
+ if (!call || call->type != NODE_EXPR_CALL)
+ {
+ zpanic_at(launch_tok, "Expected kernel call after 'launch'");
+ }
+
+ // Expect 'with'
+ Token with_tok = lexer_peek(l);
+ if (with_tok.type != TOK_IDENT || strncmp(with_tok.start, "with", 4) != 0 ||
+ with_tok.len != 4)
+ {
+ zpanic_at(with_tok, "Expected 'with' after kernel call in launch statement");
+ }
+ lexer_next(l); // eat 'with'
+
+ // Expect '{' for configuration block
+ if (lexer_peek(l).type != TOK_LBRACE)
+ {
+ zpanic_at(lexer_peek(l), "Expected '{' after 'with' in launch statement");
+ }
+ lexer_next(l); // eat '{'
+
+ ASTNode *grid = NULL;
+ ASTNode *block = NULL;
+ ASTNode *shared_mem = NULL;
+ ASTNode *stream = NULL;
+
+ // Parse configuration fields
+ while (lexer_peek(l).type != TOK_RBRACE && lexer_peek(l).type != TOK_EOF)
+ {
+ Token field_name = lexer_next(l);
+ if (field_name.type != TOK_IDENT)
+ {
+ zpanic_at(field_name, "Expected field name in launch configuration");
+ }
+
+ // Expect ':'
+ if (lexer_peek(l).type != TOK_COLON)
+ {
+ zpanic_at(lexer_peek(l), "Expected ':' after field name");
+ }
+ lexer_next(l); // eat ':'
+
+ // Parse value expression
+ ASTNode *value = parse_expression(ctx, l);
+
+ // Assign to appropriate field
+ if (strncmp(field_name.start, "grid", 4) == 0 && field_name.len == 4)
+ {
+ grid = value;
+ }
+ else if (strncmp(field_name.start, "block", 5) == 0 && field_name.len == 5)
+ {
+ block = value;
+ }
+ else if (strncmp(field_name.start, "shared_mem", 10) == 0 && field_name.len == 10)
+ {
+ shared_mem = value;
+ }
+ else if (strncmp(field_name.start, "stream", 6) == 0 && field_name.len == 6)
+ {
+ stream = value;
+ }
+ else
+ {
+ zpanic_at(field_name, "Unknown launch configuration field (expected: grid, "
+ "block, shared_mem, stream)");
+ }
+
+ // Optional comma
+ if (lexer_peek(l).type == TOK_COMMA)
+ {
+ lexer_next(l);
+ }
+ }
+
+ // Expect '}'
+ if (lexer_peek(l).type != TOK_RBRACE)
+ {
+ zpanic_at(lexer_peek(l), "Expected '}' to close launch configuration");
+ }
+ lexer_next(l); // eat '}'
+
+ // Optional trailing ';'
+ if (lexer_peek(l).type == TOK_SEMICOLON)
+ {
+ lexer_next(l);
+ }
+
+ // Require at least grid and block
+ if (!grid || !block)
+ {
+ zpanic_at(launch_tok, "Launch configuration requires at least 'grid' and 'block'");
+ }
+
+ ASTNode *n = ast_create(NODE_CUDA_LAUNCH);
+ n->cuda_launch.call = call;
+ n->cuda_launch.grid = grid;
+ n->cuda_launch.block = block;
+ n->cuda_launch.shared_mem = shared_mem;
+ n->cuda_launch.stream = stream;
+ n->token = launch_tok;
+ return n;
+ }
+
// Do-while loop: do { body } while condition;
if (strncmp(tk.start, "do", 2) == 0 && tk.len == 2)
{
diff --git a/src/zprep.h b/src/zprep.h
index f9bb6b6..18c4c51 100644
--- a/src/zprep.h
+++ b/src/zprep.h
@@ -182,6 +182,7 @@ typedef struct
int is_freestanding; // 1 if --freestanding.
int mode_transpile; // 1 if 'transpile' command.
int use_cpp; // 1 if --cpp (emit C++ compatible code).
+ int use_cuda; // 1 if --cuda (emit CUDA-compatible code).
// GCC Flags accumulator.
char gcc_flags[4096];
diff --git a/std/cuda.zc b/std/cuda.zc
new file mode 100644
index 0000000..c010302
--- /dev/null
+++ b/std/cuda.zc
@@ -0,0 +1,113 @@
+
+include <cuda_runtime.h>
+
+// Memory Management
+
+/// Allocate device memory for n elements of type T.
+fn cuda_alloc<T>(n: usize) -> T* {
+ var ptr: T* = NULL;
+ cudaMalloc((void**)&ptr, n * sizeof(T));
+ return ptr;
+}
+
+/// Free device memory.
+fn cuda_free(ptr: void*) {
+ cudaFree(ptr);
+}
+
+/// Copy bytes from host to device.
+fn cuda_copy_to_device(dst: void*, src: void*, bytes: usize) {
+ cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
+}
+
+/// Copy bytes from device to host.
+fn cuda_copy_to_host(dst: void*, src: void*, bytes: usize) {
+ cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
+}
+
+/// Copy bytes between device buffers.
+fn cuda_copy_device(dst: void*, src: void*, bytes: usize) {
+ cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice);
+}
+
+/// Set device memory to zero.
+fn cuda_zero(ptr: void*, bytes: usize) {
+ cudaMemset(ptr, 0, bytes);
+}
+
+// Synchronization
+
+/// Synchronize the device (wait for all kernels to complete).
+fn cuda_sync() {
+ cudaDeviceSynchronize();
+}
+
+// Thread Indexing (for use inside @global/@device functions)
+
+/// Get 1D global thread index (blockIdx.x * blockDim.x + threadIdx.x).
+@device @inline
+fn thread_id() -> int {
+ var id: int;
+ id = blockIdx.x * blockDim.x + threadIdx.x;
+ return id;
+}
+
+/// Get 1D block index.
+@device @inline
+fn block_id() -> int {
+ var id: int;
+ id = blockIdx.x;
+ return id;
+}
+
+/// Get 1D thread index within block.
+@device @inline
+fn local_id() -> int {
+ var id: int;
+ id = threadIdx.x;
+ return id;
+}
+
+/// Get block size (number of threads per block).
+@device @inline
+fn block_size() -> int {
+ var size: int;
+ size = blockDim.x;
+ return size;
+}
+
+/// Get grid size (number of blocks).
+@device @inline
+fn grid_size() -> int {
+ var size: int;
+ size = gridDim.x;
+ return size;
+}
+
+// Device Info
+
+/// Get the number of CUDA devices.
+fn cuda_device_count() -> int {
+ var count: int = 0;
+ cudaGetDeviceCount(&count);
+ return count;
+}
+
+/// Set the active CUDA device.
+fn cuda_set_device(id: int) {
+ cudaSetDevice(id);
+}
+
+// Error Handling
+
+/// Get the last CUDA error code.
+fn cuda_last_error() -> int {
+ var err: int;
+ err = (int)cudaGetLastError();
+ return err;
+}
+
+/// Check if last CUDA operation succeeded.
+fn cuda_ok() -> bool {
+ return cuda_last_error() == 0;
+}
diff --git a/std/mem.zc b/std/mem.zc
index 3e08e8b..45bc208 100644
--- a/std/mem.zc
+++ b/std/mem.zc
@@ -1,14 +1,14 @@
fn alloc<T>() -> T* {
- return malloc(sizeof(T));
+ return (T*)malloc(sizeof(T));
}
fn zalloc<T>() -> T* {
- return calloc(1, sizeof(T));
+ return (T*)calloc(1, sizeof(T));
}
fn alloc_n<T>(n: usize) -> T* {
- return malloc(sizeof(T) * n);
+ return (T*)malloc(sizeof(T) * n);
}
struct Box<T> {