2 files changed, 38 insertions, 22 deletions
diff --git a/README.md b/README.md
index 706bbd9..ea85869 100644
--- a/README.md
+++ b/README.md
@@ -684,9 +684,9 @@ fn main() {
 
 > **Note:** The `--cpp` flag switches the backend to `g++` and emits C++-compatible code (uses `auto` instead of `__auto_type`, function overloads instead of `_Generic`, and explicit casts for `void*`).
 
-### CUDA Interop
+#### CUDA Interop
 
-Zen C can generate CUDA-compatible code with the `--cuda` flag, allowing you to write GPU kernels using Zen C syntax.
+Zen C supports GPU programming by transpiling to **CUDA C++**. This lets you use C++ features such as templates and `constexpr` inside your kernels while keeping Zen C's ergonomic syntax.
 
 ```bash
 # Direct compilation with nvcc
@@ -725,12 +725,10 @@ This transpiles to: `kernel_name<<<grid, block, shared, stream>>>(args);`
 Use Zen C function syntax with `@global` and the `launch` statement:
 
 ```zc
-include <cuda_runtime.h>
 import "std/cuda.zc"
 
 @global
 fn add_kernel(a: float*, b: float*, c: float*, n: int) {
-    // You can use raw CUDA C or the std/cuda.zc helpers
     var i = thread_id();
     if i < n {
         c[i] = a[i] + b[i];
@@ -738,12 +736,22 @@ fn add_kernel(a: float*, b: float*, c: float*, n: int) {
 }
 
 fn main() {
-    // ... allocation ...
+    const N = 1024;
+    var d_a = cuda_alloc<float>(N);
+    var d_b = cuda_alloc<float>(N);
+    var d_c = cuda_alloc<float>(N);
+    defer cuda_free(d_a);
+    defer cuda_free(d_b);
+    defer cuda_free(d_c);
+
+    // ... init data ...
+
     launch add_kernel(d_a, d_b, d_c, N) with {
-        grid: num_blocks,
+        grid: (N + 255) / 256,
         block: 256
     };
-    // ... cleanup ...
+
+    cuda_sync();
 }
 ```
 
diff --git a/src/main.c b/src/main.c
index 2b7dcf0..44cf45d 100644
--- a/src/main.c
+++ b/src/main.c
@@ -152,7 +152,7 @@ int main(int argc, char **argv)
         {
             strcpy(g_config.cc, "nvcc");
             g_config.use_cuda = 1;
-            g_config.use_cpp = 1; // CUDA implies C++ mode
+            g_config.use_cpp = 1; // CUDA implies C++ mode.
         }
         else if (strcmp(arg, "--check") == 0)
         {
@@ -270,11 +270,22 @@ int main(int argc, char **argv)
         return 0;
     }
 
-    // Codegen to C
-    FILE *out = fopen("out.c", "w");
+    // Determine temporary filename based on mode
+    const char *temp_source_file = "out.c";
+    if (g_config.use_cuda)
+    {
+        temp_source_file = "out.cu";
+    }
+    else if (g_config.use_cpp)
+    {
+        temp_source_file = "out.cpp";
+    }
+
+    // Codegen to C/C++/CUDA
+    FILE *out = fopen(temp_source_file, "w");
     if (!out)
     {
-        perror("fopen out.c");
+        perror("fopen temp output");
         return 1;
     }
 
@@ -285,10 +296,10 @@ int main(int argc, char **argv)
     {
         if (g_config.output_file)
         {
-            // If user specified -o, rename out.c to that
-            if (rename("out.c", g_config.output_file) != 0)
+            // If user specified -o, rename temp file to that
+            if (rename(temp_source_file, g_config.output_file) != 0)
             {
-                perror("rename out.c");
+                perror("rename output");
                 return 1;
             }
             if (!g_config.quiet)
@@ -300,7 +311,7 @@ int main(int argc, char **argv)
         {
             if (!g_config.quiet)
             {
-                printf("[zc] Transpiled to out.c\n");
+                printf("[zc] Transpiled to %s\n", temp_source_file);
             }
         }
         // Done, no C compilation
@@ -311,12 +322,9 @@ int main(int argc, char **argv)
     char cmd[8192];
     char *outfile = g_config.output_file ? g_config.output_file : "a.out";
 
-    // TCC-specific adjustments?
-    // Already handled by user passing --cc tcc
-
-    snprintf(cmd, sizeof(cmd), "%s %s %s %s %s -o %s out.c -lm %s -I./src %s", g_config.cc,
+    snprintf(cmd, sizeof(cmd), "%s %s %s %s %s -o %s %s -lm %s -I./src %s", g_config.cc,
              g_config.gcc_flags, g_cflags, g_config.is_freestanding ? "-ffreestanding" : "", "",
-             outfile, g_parser_ctx->has_async ? "-lpthread" : "", g_link_flags);
+             outfile, temp_source_file, g_parser_ctx->has_async ? "-lpthread" : "", g_link_flags);
 
     if (g_config.verbose)
     {
@@ -329,7 +337,7 @@ int main(int argc, char **argv)
         printf("C compilation failed.\n");
         if (!g_config.emit_c)
         {
-            remove("out.c");
+            remove(temp_source_file);
         }
         return 1;
     }
@@ -337,7 +345,7 @@ int main(int argc, char **argv)
     if (!g_config.emit_c)
     {
         // remove("out.c"); // Keep it for debugging for now or follow flag
-        remove("out.c");
+        remove(temp_source_file);
     }
 
     if (g_config.mode_run)