-rw-r--r--  README.md                        |  90
-rw-r--r--  examples/gpu/cuda_vector_add.zc  |  73
-rw-r--r--  src/ast/ast.h                    |  16
-rw-r--r--  src/codegen/codegen.c            |  64
-rw-r--r--  src/codegen/codegen_utils.c      |  17
-rw-r--r--  src/main.c                       |   7
-rw-r--r--  src/parser/parser_core.c         |  24
-rw-r--r--  src/parser/parser_stmt.c         | 111
-rw-r--r--  src/zprep.h                      |   1
-rw-r--r--  std/cuda.zc                      | 113
-rw-r--r--  std/mem.zc                       |   6
11 files changed, 517 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 8a7629b..69eee0a 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,10 @@ Join the discussion, share demos, ask questions, or report bugs in the official
- [14. Inline Assembly](#14-inline-assembly)
- [15. Build Directives](#15-build-directives)
- [Compiler Support & Compatibility](#compiler-support--compatibility)
+ - [Test Suite Status](#test-suite-status)
+ - [Building with Zig](#building-with-zig)
+ - [C++ Interop](#c-interop)
+ - [CUDA Interop](#cuda-interop)
- [Contributing](#contributing)
---
@@ -679,6 +683,92 @@ fn main() {
> **Note:** The `--cpp` flag switches the backend to `g++` and emits C++-compatible code (uses `auto` instead of `__auto_type`, function overloads instead of `_Generic`, and explicit casts for `void*`).
+### CUDA Interop
+
+Zen C can generate CUDA-compatible code with the `--cuda` flag, allowing you to write GPU kernels using Zen C syntax.
+
+```bash
+# Direct compilation with nvcc
+zc run app.zc --cuda
+
+# Or transpile for manual build
+zc transpile app.zc --cuda -o app.cu
+nvcc app.cu -o app
+```
+
+#### CUDA-Specific Attributes
+
+| Attribute | CUDA Equivalent | Description |
+|:---|:---|:---|
+| `@global` | `__global__` | Kernel function (runs on GPU, called from host) |
+| `@device` | `__device__` | Device function (runs on GPU, called from GPU) |
+| `@host` | `__host__` | Host function (explicit CPU-only) |
+
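+A `@device` helper can be called from a kernel or from another device function. A minimal sketch (the `scale` helper and `scale_kernel` are hypothetical, shown only to illustrate the attributes):
+
+```zc
+import "std/cuda.zc" // for thread_id()
+
+@device
+fn scale(x: float, factor: float) -> float {
+ return x * factor;
+}
+
+@global
+fn scale_kernel(data: float*, factor: float, n: int) {
+ var i = thread_id();
+ if i < n {
+ data[i] = scale(data[i], factor);
+ }
+}
+```
+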
+#### Kernel Launch Syntax
+
+Zen C provides a clean `launch` statement for invoking CUDA kernels:
+
+```zc
+launch kernel_name(args) with {
+ grid: num_blocks,
+ block: threads_per_block,
+ shared_mem: 1024, // Optional
+ stream: my_stream // Optional
+};
+```
+
+This transpiles to `kernel_name<<<grid, block, shared, stream>>>(args);`. Omitted optional fields fall back to the CUDA defaults; if `stream` is given without `shared_mem`, the shared-memory argument is emitted as `0`.
+
+#### Writing CUDA Kernels
+
+Use Zen C function syntax with `@global` and the `launch` statement:
+
+```zc
+include <cuda_runtime.h>
+import "std/cuda.zc"
+
+@global
+fn add_kernel(a: float*, b: float*, c: float*, n: int) {
+ // You can use raw CUDA C or the std/cuda.zc helpers
+ var i = thread_id();
+ if i < n {
+ c[i] = a[i] + b[i];
+ }
+}
+
+fn main() {
+ // ... allocation ...
+ launch add_kernel(d_a, d_b, d_c, N) with {
+ grid: num_blocks,
+ block: 256
+ };
+ // ... cleanup ...
+}
+```
+
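+To cover all `n` elements, the grid is typically sized with the ceiling-division idiom used in the bundled example:
+
+```zc
+const BLOCK_SIZE = 256;
+var num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE; // ceil(N / BLOCK_SIZE)
+```
+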
+#### Standard Library (`std/cuda.zc`)
+
+Zen C provides a standard library for common CUDA operations, reducing the need for `raw` blocks:
+
+```zc
+import "std/cuda.zc"
+
+// Memory management
+var d_ptr = cuda_alloc<float>(1024);
+cuda_copy_to_device(d_ptr, h_ptr, 1024 * sizeof(float));
+defer cuda_free(d_ptr);
+
+// Synchronization
+cuda_sync();
+
+// Thread Indexing (use inside kernels)
+var i = thread_id(); // Global index
+var bid = block_id();
+var tid = local_id();
+```
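+
+The error helpers compose with `launch` and `cuda_sync()`. A minimal sketch of checking a kernel launch (the kernel name and sizes are placeholders):
+
+```zc
+launch my_kernel(d_ptr, n) with {
+ grid: num_blocks,
+ block: 256
+};
+cuda_sync();
+if cuda_last_error() != 0 {
+ !"-> CUDA kernel failed";
+}
+```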
+
+> **Note:** The `--cuda` flag sets `nvcc` as the compiler and implies `--cpp` mode. Requires the NVIDIA CUDA Toolkit.
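+
+Architecture-specific flags can be forwarded to `nvcc` via a build directive at the top of the source file, as the bundled example does:
+
+```zc
+//> cflags: -arch=sm_75
+```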
+
---
## Contributing
diff --git a/examples/gpu/cuda_vector_add.zc b/examples/gpu/cuda_vector_add.zc
new file mode 100644
index 0000000..de75a74
--- /dev/null
+++ b/examples/gpu/cuda_vector_add.zc
@@ -0,0 +1,73 @@
+
+// Build and run with: zc run cuda_vector_add.zc --cuda
+
+//> cflags: -arch=sm_75
+
+import "std/cuda.zc"
+import "std/mem.zc"
+
+@global
+fn add_kernel(a: float*, b: float*, c: float*, n: int) {
+ var i = thread_id();
+ if i < n {
+ c[i] = a[i] + b[i];
+ }
+}
+
+fn main() {
+ const N = 1024;
+
+ "=> Zen C CUDA Vector Addition";
+ "-> Vector size: {N} elements";
+
+ var h_a = alloc_n<float>(N);
+ var h_b = alloc_n<float>(N);
+ var h_c = alloc_n<float>(N);
+ defer free(h_a);
+ defer free(h_b);
+ defer free(h_c);
+
+ for i in 0..N {
+ h_a[i] = (float)i;
+ h_b[i] = (float)(i * 2);
+ }
+
+ "-> Allocating device memory...";
+ var d_a = cuda_alloc<float>(N);
+ var d_b = cuda_alloc<float>(N);
+ var d_c = cuda_alloc<float>(N);
+ defer cuda_free(d_a);
+ defer cuda_free(d_b);
+ defer cuda_free(d_c);
+
+ cuda_copy_to_device(d_a, h_a, N * sizeof(float));
+ cuda_copy_to_device(d_b, h_b, N * sizeof(float));
+
+ const BLOCK_SIZE = 256;
+ var num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+ "-> Launching: {num_blocks} blocks x {BLOCK_SIZE} threads";
+
+ launch add_kernel(d_a, d_b, d_c, N) with {
+ grid: num_blocks,
+ block: BLOCK_SIZE
+ };
+
+ cuda_sync();
+
+ cuda_copy_to_host(h_c, d_c, N * sizeof(float));
+
+ "-> Verifying...";
+ var ok: int = 1;
+ for i in 0..10 {
+ var expected = h_a[i] + h_b[i];
+ if h_c[i] != expected {
+ !"-> Mismatch at {i}";
+ ok = 0;
+ }
+ }
+
+ if ok {
+ "-> All checks passed!";
+ }
+}
diff --git a/src/ast/ast.h b/src/ast/ast.h
index 2288860..cef68c6 100644
--- a/src/ast/ast.h
+++ b/src/ast/ast.h
@@ -122,7 +122,8 @@ typedef enum
NODE_TRY,
NODE_REFLECTION,
NODE_AWAIT,
- NODE_REPL_PRINT
+ NODE_REPL_PRINT,
+ NODE_CUDA_LAUNCH
} NodeType;
// ** AST Node Structure **
@@ -176,6 +177,10 @@ struct ASTNode
char *section; // @section("name")
int is_async; // async function
int is_comptime; // @comptime function
+ // CUDA qualifiers
+ int cuda_global; // @global -> __global__
+ int cuda_device; // @device -> __device__
+ int cuda_host; // @host -> __host__
} func;
struct
@@ -539,6 +544,15 @@ struct ASTNode
{
ASTNode *expr;
} repl_print;
+
+ struct
+ {
+ ASTNode *call; // The kernel call (NODE_EXPR_CALL)
+ ASTNode *grid; // Grid dimensions expression
+ ASTNode *block; // Block dimensions expression
+ ASTNode *shared_mem; // Optional shared memory size (NULL = default)
+ ASTNode *stream; // Optional CUDA stream (NULL = default)
+ } cuda_launch;
};
};
diff --git a/src/codegen/codegen.c b/src/codegen/codegen.c
index 644e9f5..a371548 100644
--- a/src/codegen/codegen.c
+++ b/src/codegen/codegen.c
@@ -2252,6 +2252,70 @@ void codegen_node_single(ParserContext *ctx, ASTNode *node, FILE *out)
fprintf(out, ";\n");
}
break;
+ case NODE_CUDA_LAUNCH:
+ {
+ // Emit CUDA kernel launch: kernel<<<grid, block, shared, stream>>>(args);
+ ASTNode *call = node->cuda_launch.call;
+
+ // Get kernel name from callee
+ if (call->call.callee->type == NODE_EXPR_VAR)
+ {
+ fprintf(out, " %s<<<", call->call.callee->var_ref.name);
+ }
+ else
+ {
+ fprintf(out, " ");
+ codegen_expression(ctx, call->call.callee, out);
+ fprintf(out, "<<<");
+ }
+
+ // Grid dimension
+ codegen_expression(ctx, node->cuda_launch.grid, out);
+ fprintf(out, ", ");
+
+ // Block dimension
+ codegen_expression(ctx, node->cuda_launch.block, out);
+
+ // Optional shared memory size
+ if (node->cuda_launch.shared_mem || node->cuda_launch.stream)
+ {
+ fprintf(out, ", ");
+ if (node->cuda_launch.shared_mem)
+ {
+ codegen_expression(ctx, node->cuda_launch.shared_mem, out);
+ }
+ else
+ {
+ fprintf(out, "0");
+ }
+ }
+
+ // Optional CUDA stream
+ if (node->cuda_launch.stream)
+ {
+ fprintf(out, ", ");
+ codegen_expression(ctx, node->cuda_launch.stream, out);
+ }
+
+ fprintf(out, ">>>(");
+
+ // Arguments
+ ASTNode *arg = call->call.args;
+ int first = 1;
+ while (arg)
+ {
+ if (!first)
+ {
+ fprintf(out, ", ");
+ }
+ codegen_expression(ctx, arg, out);
+ first = 0;
+ arg = arg->next;
+ }
+
+ fprintf(out, ");\n");
+ break;
+ }
default:
codegen_expression(ctx, node, out);
fprintf(out, ";\n");
diff --git a/src/codegen/codegen_utils.c b/src/codegen/codegen_utils.c
index b1fcf4c..af1c862 100644
--- a/src/codegen/codegen_utils.c
+++ b/src/codegen/codegen_utils.c
@@ -535,6 +535,23 @@ void emit_func_signature(FILE *out, ASTNode *func, const char *name_override)
return;
}
+ // Emit CUDA qualifiers (for both forward declarations and definitions)
+ if (g_config.use_cuda)
+ {
+ if (func->func.cuda_global)
+ {
+ fprintf(out, "__global__ ");
+ }
+ if (func->func.cuda_device)
+ {
+ fprintf(out, "__device__ ");
+ }
+ if (func->func.cuda_host)
+ {
+ fprintf(out, "__host__ ");
+ }
+ }
+
// Return type
char *ret_str;
if (func->func.ret_type_info)
diff --git a/src/main.c b/src/main.c
index edc3723..2b7dcf0 100644
--- a/src/main.c
+++ b/src/main.c
@@ -48,6 +48,7 @@ void print_usage()
printf(" -q, --quiet Quiet output\n");
printf(" -c Compile only (produce .o)\n");
printf(" --cpp Use C++ mode.\n");
+ printf(" --cuda Use CUDA mode (requires nvcc).\n");
}
int main(int argc, char **argv)
@@ -147,6 +148,12 @@ int main(int argc, char **argv)
strcpy(g_config.cc, "g++");
g_config.use_cpp = 1;
}
+ else if (strcmp(arg, "--cuda") == 0)
+ {
+ strcpy(g_config.cc, "nvcc");
+ g_config.use_cuda = 1;
+ g_config.use_cpp = 1; // CUDA implies C++ mode
+ }
else if (strcmp(arg, "--check") == 0)
{
g_config.mode_check = 1;
diff --git a/src/parser/parser_core.c b/src/parser/parser_core.c
index 3e683fb..c3c91fe 100644
--- a/src/parser/parser_core.c
+++ b/src/parser/parser_core.c
@@ -63,6 +63,9 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
int attr_weak = 0;
int attr_export = 0;
int attr_comptime = 0;
+ int attr_cuda_global = 0; // @global -> __global__
+ int attr_cuda_device = 0; // @device -> __device__
+ int attr_cuda_host = 0; // @host -> __host__
char *deprecated_msg = NULL;
char *attr_section = NULL;
@@ -232,7 +235,23 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
}
else
{
- zwarn_at(attr, "Unknown attribute: %.*s", attr.len, attr.start);
+ // Check for CUDA attributes
+ if (strncmp(attr.start, "global", 6) == 0 && attr.len == 6)
+ {
+ attr_cuda_global = 1;
+ }
+ else if (strncmp(attr.start, "device", 6) == 0 && attr.len == 6)
+ {
+ attr_cuda_device = 1;
+ }
+ else if (strncmp(attr.start, "host", 4) == 0 && attr.len == 4)
+ {
+ attr_cuda_host = 1;
+ }
+ else
+ {
+ zwarn_at(attr, "Unknown attribute: %.*s", attr.len, attr.start);
+ }
}
t = lexer_peek(l);
@@ -469,6 +488,9 @@ ASTNode *parse_program_nodes(ParserContext *ctx, Lexer *l)
s->func.pure = attr_pure;
s->func.section = attr_section;
s->func.is_comptime = attr_comptime;
+ s->func.cuda_global = attr_cuda_global;
+ s->func.cuda_device = attr_cuda_device;
+ s->func.cuda_host = attr_cuda_host;
if (attr_deprecated && s->func.name)
{
diff --git a/src/parser/parser_stmt.c b/src/parser/parser_stmt.c
index 5307768..daf3f72 100644
--- a/src/parser/parser_stmt.c
+++ b/src/parser/parser_stmt.c
@@ -2503,6 +2503,117 @@ ASTNode *parse_statement(ParserContext *ctx, Lexer *l)
return parse_guard(ctx, l);
}
+ // CUDA launch: launch kernel(args) with { grid: X, block: Y };
+ if (strncmp(tk.start, "launch", 6) == 0 && tk.len == 6)
+ {
+ Token launch_tok = lexer_next(l); // eat 'launch'
+
+ // Parse the kernel call expression
+ ASTNode *call = parse_expression(ctx, l);
+ if (!call || call->type != NODE_EXPR_CALL)
+ {
+ zpanic_at(launch_tok, "Expected kernel call after 'launch'");
+ }
+
+ // Expect 'with'
+ Token with_tok = lexer_peek(l);
+ if (with_tok.type != TOK_IDENT || strncmp(with_tok.start, "with", 4) != 0 ||
+ with_tok.len != 4)
+ {
+ zpanic_at(with_tok, "Expected 'with' after kernel call in launch statement");
+ }
+ lexer_next(l); // eat 'with'
+
+ // Expect '{' for configuration block
+ if (lexer_peek(l).type != TOK_LBRACE)
+ {
+ zpanic_at(lexer_peek(l), "Expected '{' after 'with' in launch statement");
+ }
+ lexer_next(l); // eat '{'
+
+ ASTNode *grid = NULL;
+ ASTNode *block = NULL;
+ ASTNode *shared_mem = NULL;
+ ASTNode *stream = NULL;
+
+ // Parse configuration fields
+ while (lexer_peek(l).type != TOK_RBRACE && lexer_peek(l).type != TOK_EOF)
+ {
+ Token field_name = lexer_next(l);
+ if (field_name.type != TOK_IDENT)
+ {
+ zpanic_at(field_name, "Expected field name in launch configuration");
+ }
+
+ // Expect ':'
+ if (lexer_peek(l).type != TOK_COLON)
+ {
+ zpanic_at(lexer_peek(l), "Expected ':' after field name");
+ }
+ lexer_next(l); // eat ':'
+
+ // Parse value expression
+ ASTNode *value = parse_expression(ctx, l);
+
+ // Assign to appropriate field
+ if (strncmp(field_name.start, "grid", 4) == 0 && field_name.len == 4)
+ {
+ grid = value;
+ }
+ else if (strncmp(field_name.start, "block", 5) == 0 && field_name.len == 5)
+ {
+ block = value;
+ }
+ else if (strncmp(field_name.start, "shared_mem", 10) == 0 && field_name.len == 10)
+ {
+ shared_mem = value;
+ }
+ else if (strncmp(field_name.start, "stream", 6) == 0 && field_name.len == 6)
+ {
+ stream = value;
+ }
+ else
+ {
+ zpanic_at(field_name, "Unknown launch configuration field (expected: grid, "
+ "block, shared_mem, stream)");
+ }
+
+ // Optional comma
+ if (lexer_peek(l).type == TOK_COMMA)
+ {
+ lexer_next(l);
+ }
+ }
+
+ // Expect '}'
+ if (lexer_peek(l).type != TOK_RBRACE)
+ {
+ zpanic_at(lexer_peek(l), "Expected '}' to close launch configuration");
+ }
+ lexer_next(l); // eat '}'
+
+ // Optional trailing ';'
+ if (lexer_peek(l).type == TOK_SEMICOLON)
+ {
+ lexer_next(l);
+ }
+
+ // Require at least grid and block
+ if (!grid || !block)
+ {
+ zpanic_at(launch_tok, "Launch configuration requires at least 'grid' and 'block'");
+ }
+
+ ASTNode *n = ast_create(NODE_CUDA_LAUNCH);
+ n->cuda_launch.call = call;
+ n->cuda_launch.grid = grid;
+ n->cuda_launch.block = block;
+ n->cuda_launch.shared_mem = shared_mem;
+ n->cuda_launch.stream = stream;
+ n->token = launch_tok;
+ return n;
+ }
+
// Do-while loop: do { body } while condition;
if (strncmp(tk.start, "do", 2) == 0 && tk.len == 2)
{
diff --git a/src/zprep.h b/src/zprep.h
index f9bb6b6..18c4c51 100644
--- a/src/zprep.h
+++ b/src/zprep.h
@@ -182,6 +182,7 @@ typedef struct
int is_freestanding; // 1 if --freestanding.
int mode_transpile; // 1 if 'transpile' command.
int use_cpp; // 1 if --cpp (emit C++ compatible code).
+ int use_cuda; // 1 if --cuda (emit CUDA-compatible code).
// GCC Flags accumulator.
char gcc_flags[4096];
diff --git a/std/cuda.zc b/std/cuda.zc
new file mode 100644
index 0000000..c010302
--- /dev/null
+++ b/std/cuda.zc
@@ -0,0 +1,113 @@
+
+include <cuda_runtime.h>
+
+// Memory Management
+
+/// Allocate device memory for n elements of type T.
+fn cuda_alloc<T>(n: usize) -> T* {
+ var ptr: T* = NULL;
+ cudaMalloc((void**)&ptr, n * sizeof(T));
+ return ptr;
+}
+
+/// Free device memory.
+fn cuda_free(ptr: void*) {
+ cudaFree(ptr);
+}
+
+/// Copy bytes from host to device.
+fn cuda_copy_to_device(dst: void*, src: void*, bytes: usize) {
+ cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
+}
+
+/// Copy bytes from device to host.
+fn cuda_copy_to_host(dst: void*, src: void*, bytes: usize) {
+ cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
+}
+
+/// Copy bytes between device buffers.
+fn cuda_copy_device(dst: void*, src: void*, bytes: usize) {
+ cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice);
+}
+
+/// Set device memory to zero.
+fn cuda_zero(ptr: void*, bytes: usize) {
+ cudaMemset(ptr, 0, bytes);
+}
+
+// Synchronization
+
+/// Synchronize the device (wait for all kernels to complete).
+fn cuda_sync() {
+ cudaDeviceSynchronize();
+}
+
+// Thread Indexing (for use inside @global/@device functions)
+
+/// Get 1D global thread index (blockIdx.x * blockDim.x + threadIdx.x).
+@device @inline
+fn thread_id() -> int {
+ var id: int;
+ id = blockIdx.x * blockDim.x + threadIdx.x;
+ return id;
+}
+
+/// Get 1D block index.
+@device @inline
+fn block_id() -> int {
+ var id: int;
+ id = blockIdx.x;
+ return id;
+}
+
+/// Get 1D thread index within block.
+@device @inline
+fn local_id() -> int {
+ var id: int;
+ id = threadIdx.x;
+ return id;
+}
+
+/// Get block size (number of threads per block).
+@device @inline
+fn block_size() -> int {
+ var size: int;
+ size = blockDim.x;
+ return size;
+}
+
+/// Get grid size (number of blocks).
+@device @inline
+fn grid_size() -> int {
+ var size: int;
+ size = gridDim.x;
+ return size;
+}
+
+// Device Info
+
+/// Get the number of CUDA devices.
+fn cuda_device_count() -> int {
+ var count: int = 0;
+ cudaGetDeviceCount(&count);
+ return count;
+}
+
+/// Set the active CUDA device.
+fn cuda_set_device(id: int) {
+ cudaSetDevice(id);
+}
+
+// Error Handling
+
+/// Get the last CUDA error code.
+fn cuda_last_error() -> int {
+ var err: int;
+ err = (int)cudaGetLastError();
+ return err;
+}
+
+/// Check if last CUDA operation succeeded.
+fn cuda_ok() -> bool {
+ return cuda_last_error() == 0;
+}
diff --git a/std/mem.zc b/std/mem.zc
index 3e08e8b..45bc208 100644
--- a/std/mem.zc
+++ b/std/mem.zc
@@ -1,14 +1,14 @@
fn alloc<T>() -> T* {
- return malloc(sizeof(T));
+ return (T*)malloc(sizeof(T));
}
fn zalloc<T>() -> T* {
- return calloc(1, sizeof(T));
+ return (T*)calloc(1, sizeof(T));
}
fn alloc_n<T>(n: usize) -> T* {
- return malloc(sizeof(T) * n);
+ return (T*)malloc(sizeof(T) * n);
}
struct Box<T> {