diff options
| author | Zuhaitz <zuhaitz.zechhub@gmail.com> | 2026-01-31 17:22:17 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-01-31 17:22:17 +0000 |
| commit | 962d659c61212b1a23acfe56dda7cb92b721feda (patch) | |
| tree | ba1637d3885213095b312f81a477c33b1ebca6aa /docs/lex.md | |
| parent | e521ee7d175393ef37579ebd61ccb7e8d56a397f (diff) | |
| parent | 91ed9fdd65e09bd6cd32e44dd07c390f2cf79c22 (diff) | |
Merge branch 'main' into main
Diffstat (limited to 'docs/lex.md')
| -rw-r--r-- | docs/lex.md | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/docs/lex.md b/docs/lex.md new file mode 100644 index 0000000..1cd70fd --- /dev/null +++ b/docs/lex.md @@ -0,0 +1,130 @@ +# Lexical Structure + +## Source Text + +Zen-C source code is encoded in UTF-8. + +## Grammar Notation + +The lexical grammar is defined using a notation similar to EBNF. +- `Rule ::= Production`: Defines a rule. +- `[ ... ]`: Character class. +- `*`: Zero or more repetitions. +- `+`: One or more repetitions. +- `?`: Zero or one occurrence. +- `|`: Alternation. +- `"..."` or `'...'`: Literal string/character. +- `~`: Negation (e.g., `~[\n]` means any character except newline). + +## Whitespace and Comments + +Whitespace separates tokens but is otherwise ignored. Comments are treated as whitespace. + +```text +Whitespace ::= [ \t\n\r]+ +Comment ::= LineComment | BlockComment + +LineComment ::= "//" ~[\n]* +BlockComment ::= "/*" (BlockComment | ~("*/"))* "*/" +``` + +## Identifiers + +Identifiers name entities such as variables, functions, and types. + +```text +Identifier ::= IdentifierStart IdentifierPart* +IdentifierStart ::= [a-zA-Z_] +IdentifierPart ::= [a-zA-Z0-9_] +``` + +## Literals + +### Integer Literals + +Integers can be decimal, hexadecimal, or binary. + +```text +IntegerLiteral ::= ( DecimalInt | HexInt | BinaryInt ) IntegerSuffix? + +DecimalInt ::= [0-9]+ +HexInt ::= "0x" [0-9a-fA-F]+ +BinaryInt ::= "0b" [01]+ + +IntegerSuffix ::= "u" | "L" | "u64" | ... +``` +*Note: The lexer technically consumes any alphanumeric sequence following a number as a suffix.* + +### Floating Point Literals + +```text +FloatLiteral ::= [0-9]+ "." [0-9]* FloatSuffix? + | [0-9]+ FloatSuffix + +FloatSuffix ::= "f" +``` + +### String Literals + +```text +StringLiteral ::= '"' StringChar* '"' +StringChar ::= ~["\\] | EscapeSequence +EscapeSequence ::= "\\" ( ["\\/bfnrt] | "u" HexDigit{4} ) +``` + +### F-Strings + +```text +FStringLiteral ::= 'f"' StringChar* '"' +``` + + +### Character Literals + +```text +CharLiteral ::= "'" ( ~['\\] | EscapeSequence ) "'" +``` + +## Keywords + +```text +Keyword ::= Declaration | Control | Special | BoolLiteral | NullLiteral | LogicOp + +Declaration ::= "let" | "def" | "fn" | "struct" | "enum" | "union" | "alias" + | "trait" | "impl" | "use" | "module" | "import" | "opaque" + +Control ::= "if" | "else" | "match" | "for" | "while" | "loop" + | "return" | "break" | "continue" | "guard" | "unless" + | "defer" | "async" | "await" | "try" | "catch" | "goto" + +Special ::= "asm" | "assert" | "test" | "sizeof" | "embed" | "comptime" + | "autofree" | "volatile" | "launch" | "ref" | "static" | "const" + +BoolLiteral ::= "true" | "false" +NullLiteral ::= "null" + +CReserved ::= "auto" | "case" | "char" | "default" | "do" | "double" + | "extern" | "float" | "inline" | "int" | "long" | "register" + | "restrict" | "short" | "signed" | "switch" | "typedef" + | "unsigned" | "void" | "_Atomic" | "_Bool" | "_Complex" + | "_Generic" | "_Imaginary" | "_lmaginary" | "_Noreturn" + | "_Static_assert" | "_Thread_local" + +LogicOp ::= "and" | "or" +``` + +## Operators and Punctuation + +```text +Operator ::= "+" | "-" | "*" | "/" | "%" + | "&&" | "||" | "!" | "++" | "--" + | "&" | "|" | "^" | "~" | "<<" | ">>" + | "==" | "!=" | "<" | ">" | "<=" | ">=" + | "=" | "+=" | "-=" | "*=" | "/=" | "%=" + | "&=" | "|=" | "^=" | "<<=" | ">>=" + | ".." | "..=" | "..<" | "..." + | "." | "?." | "??" | "??=" | "->" | "=>" + | "::" | "|>" | "?" + | "(" | ")" | "{" | "}" | "[" | "]" + | "," | ":" | ";" | "@" +``` |
