diff options
| -rw-r--r-- | std.zc | 1 | ||||
| -rw-r--r-- | std/regex.zc | 198 | ||||
| -rw-r--r-- | tests/std/test_regex.zc | 187 |
3 files changed, 386 insertions, 0 deletions
@@ -18,5 +18,6 @@ import "./std/stack.zc" import "./std/queue.zc" import "./std/env.zc" import "./std/slice.zc" +import "./std/regex.zc" import "./std/process.zc" diff --git a/std/regex.zc b/std/regex.zc new file mode 100644 index 0000000..f64b36e --- /dev/null +++ b/std/regex.zc @@ -0,0 +1,198 @@ +include <regex.h> + +import "./core.zc" +import "./string.zc" +import "./vec.zc" +import "./option.zc" + +struct Match { + text: char*; + start: int; + len: int; +} + +impl Match { + fn new(text: char*, start: int, len: int) -> Match { + return Match { text: text, start: start, len: len }; + } + + fn as_string(self) -> char* { + return self.text; + } + + fn end(self) -> int { + return self.start + self.len; + } +} + +struct Regex { + preg: void*; + pattern: char*; + flags: int; +} + +impl Regex { + fn compile(pattern: char*) -> Regex { + return Regex::compile_with_flags(pattern, 1 | 2); + } + + fn compile_with_flags(pattern: char*, flags: int) -> Regex { + let preg = malloc(1024); + let status = regcomp(preg, pattern, flags); + if (status != 0) { + free(preg); + return Regex { preg: 0, pattern: 0, flags: flags }; + } + return Regex { preg: preg, pattern: pattern, flags: flags }; + } + + fn is_valid(self) -> bool { + return self.preg != 0; + } + + fn match(self, text: char*) -> bool { + if (self.preg == 0) { return false; } + return regexec(self.preg, text, 0, 0, 0) == 0; + } + + fn match_full(self, text: char*) -> bool { + return self.match(text); + } + + fn match_at(self, text: char*, offset: int) -> bool { + if (self.preg == 0) { return false; } + let len = strlen(text); + if (offset < 0 || offset > len) { return false; } + return regexec(self.preg, text + offset, 0, 0, 0) == 0; + } + + fn is_match(self, text: char*) -> bool { + return self.match(text); + } + + fn find(self, text: char*) -> Option<Match> { + if (self.preg == 0) { return Option<Match>::None(); } + let t_len = strlen(text); + for (let i = 0; i <= t_len; i = i + 1) { + let sub = text + i; + if (regexec(self.preg, sub, 0, 0, 0) == 0) { + let j = 0; + while (text[i + j] != 0 && regexec(self.preg, sub, 0, 0, 0) == 0) { + j = j + 1; + sub = text + i + j; + } + return Option<Match>::Some(Match::new(text + i, i, j)); + } + } + return Option<Match>::None(); + } + + fn find_at(self, text: char*, start: int) -> Option<Match> { + let len = strlen(text); + if (start < 0 || start >= len) { + return Option<Match>::None(); + } + return self.find(text + start); + } + + fn count(self, text: char*) -> int { + if (self.preg == 0) { return 0; } + let count = 0; + let pos = 0; + let t_len = strlen(text); + while (pos < t_len) { + let sub = text + pos; + if (regexec(self.preg, sub, 0, 0, 0) == 0) { + count = count + 1; + pos = pos + 1; + } else { + break; + } + } + return count; + } + + fn split(self, text: char*) -> Vec<String> { + let parts = Vec<String>::new(); + if (self.preg == 0) { + parts.push(String::from(text)); + return parts; + } + let t_len = strlen(text); + let last_pos = 0; + let pos = 0; + while (pos < t_len) { + let sub = text + pos; + if (regexec(self.preg, sub, 0, 0, 0) == 0) { + if (pos > last_pos) { + let before = text + last_pos; + let part_len = pos - last_pos; + let v = Vec<char>::new(); + for (let i = 0; i < part_len; i = i + 1) { + v.push(before[i]); + } + v.push(0); + parts.push(String { vec: v }); + } + last_pos = pos + 1; + pos = pos + 1; + } else { + pos = pos + 1; + } + } + if (last_pos < t_len) { + parts.push(String::from(text + last_pos)); + } + return parts; + } + + fn pattern(self) -> char* { + return self.pattern; + } + + fn flags(self) -> int { + return self.flags; + } + + fn is_valid_pattern(pattern: char*) -> bool { + let test_regex = Regex::compile(pattern); + let valid = test_regex.is_valid(); + test_regex.destroy(); + return valid; + } + + fn destroy(self) { + if (self.preg != 0) { + regfree(self.preg); + free(self.preg); + } + } +} + +fn regex_match(pattern: char*, text: char*) -> bool { + let re = Regex::compile(pattern); + let result = re.match(text); + re.destroy(); + return result; +} + +fn regex_find(pattern: char*, text: char*) -> Option<Match> { + let re = Regex::compile(pattern); + let result = re.find(text); + re.destroy(); + return result; +} + +fn regex_count(pattern: char*, text: char*) -> int { + let re = Regex::compile(pattern); + let count = re.count(text); + re.destroy(); + return count; +} + +fn regex_split(pattern: char*, text: char*) -> Vec<String> { + let re = Regex::compile(pattern); + let parts = re.split(text); + re.destroy(); + return parts; +} diff --git a/tests/std/test_regex.zc b/tests/std/test_regex.zc new file mode 100644 index 0000000..4fe176c --- /dev/null +++ b/tests/std/test_regex.zc @@ -0,0 +1,187 @@ +import "std/regex.zc" + +fn test_basic_matching() { + "testing: basic matching"; + let re = Regex::compile("abc"); + + if (re.match("abc")) { "literal match works"; } else { "FAILED: literal match"; } + if (re.match("abcdef")) { "substring match works"; } else { "FAILED: substring match"; } + if (!re.match("xyz")) { "not matching correctly returns false"; } else { "FAILED: mismatching"; } + + re.destroy(); + ""; +} + +fn test_anchors() { + "testing: anchors"; + let re = Regex::compile("^start"); + + if (re.match("start here")) { " ^ anchor works for start"; } else { "FAILED: ^ anchor start"; } + if (!re.match("no start")) { " ^ anchor rejects non-start"; } else { "FAILED: ^ anchor reject"; } + + re.destroy(); + + let re2 = Regex::compile("end$"); + if (re2.match("the end")) { " $ anchor works for end"; } else { "FAILED: $ anchor end"; } + if (!re2.match("end here")) { " $ anchor rejects non-end"; } else { "FAILED: $ anchor reject"; } + + re2.destroy(); + ""; +} + +fn test_wildcards() { + "testing: wild cards"; + let re = Regex::compile("a.c"); + + if (re.match("abc")) { " . matches single char"; } else { "FAILED: . match 1"; } + if (re.match("axc")) { " . matches different char"; } else { "FAILED: . match 2"; } + if (!re.match("ac")) { " . requires exactly one char"; } else { "FAILED: . match 3"; } + + re.destroy(); + ""; +} + +fn test_quantifiers() { + "testing: quantifiers"; + let re1 = Regex::compile("a*b"); + if (re1.match("b")) { " * matches zero occurrences"; } else { "FAILED: * 0"; } + if (re1.match("ab")) { " * matches one occurrence"; } else { "FAILED: * 1"; } + if (re1.match("aaab")) { " * matches multiple occurrences"; } else { "FAILED: * many"; } + re1.destroy(); + + let re2 = Regex::compile("a+b"); + if (!re2.match("b")) { " + requires at least one"; } else { "FAILED: + 0"; } + if (re2.match("ab")) { " + matches one occurrence"; } else { "FAILED: + 1"; } + if (re2.match("aaab")) { " + matches multiple occurrences"; } else { "FAILED: + many"; } + re2.destroy(); + + let re3 = Regex::compile("colou?r"); + if (re3.match("color")) { " ? matches with char"; } else { "FAILED: ? with"; } + if (re3.match("colour")) { " ? matches without char"; } else { "FAILED: ? without"; } + re3.destroy(); + ""; +} + +fn test_character_classes() { + "testing: character class stuff" + let re = Regex::compile("[0-9]+"); + + if (re.match("123")) { " [0-9] matches digits"; } else { "FAILED: [0-9] match"; } + if (re.match("abc123")) { " [0-9] finds digits in string"; } else { "FAILED: [0-9] find"; } + if (!re.match("abc")) { " [0-9] rejects non-digits"; } else { "FAILED: [0-9] reject"; } + + re.destroy(); + ""; +} + +fn test_alternation() { + "test: alternation"; + let re = Regex::compile("cat|dog"); + + if (re.match("cat")) { " | matches first alternative"; } else { "FAILED: | match 1"; } + if (re.match("dog")) { " | matches second alternative"; } else { "FAILED: | match 2"; } + if (!re.match("bird")) { " | rejects non-matching"; } else { "FAILED: | reject"; } + + re.destroy(); + ""; +} + +fn test_word_boundaries() { + "testing: word matching"; + let re = Regex::compile("[a-zA-Z]+"); + + if (re.match("hello")) { " letter class matches words"; } else { "FAILED: letter match"; } + if (re.match("hello123")) { " letter class finds word part"; } else { "FAILED: letter part"; } + if (!re.match("123")) { " letter class rejects non-letters"; } else { "FAILED: letter reject"; } + + re.destroy(); + ""; +} + +fn test_is_valid() { + "testing: patern validation" + + if (Regex::is_valid_pattern("^[a-z]+$")) { " valid pattern accepted"; } else { "FAILED: pattern validation 1"; } + if (Regex::is_valid_pattern("(hello|world)")) { " complex pattern accepted"; } else { "FAILED: pattern validation 2"; } + + ""; +} + +fn test_find() { + "testing: find functionality"; + let re = Regex::compile("[0-9]+"); + let m = re.find("abc123def456"); + + if (m.is_some()) { " find locates match"; } else { "FAILED: find match"; } + + re.destroy(); + ""; +} + +fn test_count() { + "testing: count"; + let re = Regex::compile("[0-9]+"); + let count = re.count("123 456 789"); + + if (count >= 1) { " count finds matches"; } else { "FAILED: count matches"; } + + re.destroy(); + ""; +} + +fn test_convenience_functions() { + "testing: just some other functions and stuff"; + + if (regex_match("^test", "testing")) { " regex_match works"; } else { "FAILED: regex_match"; } + if (regex_count("a", "banana") >= 1) { " regex_count works"; } else { "FAILED: regex_count"; } + + let m = regex_find("[0-9]+", "id: 42"); + if (m.is_some()) { " regex_find works"; } else { "FAILED: regex_find"; } + + ""; +} + +fn test_email_pattern() { + "test: email pattern stuff" + let email_re = Regex::compile("^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z][a-zA-Z]+$"); + + if (email_re.match("swag@swag.com")) { " valid email accepted"; } else { "FAILED: valid email"; } + if (email_re.match("swag.swag@swag.swag.swag")) { " complex email accepted"; } else { "FAILED: complex email"; } + if (!email_re.match("invalid.email")) { " invalid email rejected"; } else { "FAILED: invalid email reject"; } + + email_re.destroy(); + ""; +} + +fn test_url_pattern() { + "testing: url pattern stuff" + let url_re = Regex::compile("https?://[a-zA-Z0-9.-]+"); + + if (url_re.match("http://example.com")) { " http url matched matched"; } else { "FAILED: http url"; } + if (url_re.match("https://secure.example.com")) { " https url matched"; } else { "FAILED: https url"; } + if (!url_re.match("ftp://something.com")) { " ftp url rejected"; } else { "FAILED: ftp url reject"; } + + url_re.destroy(); + ""; +} + +fn main() { + "testing...."; + + test_basic_matching(); + test_anchors(); + test_wildcards(); + test_quantifiers(); + test_character_classes(); + test_alternation(); + test_word_boundaries(); + test_is_valid(); + test_find(); + test_count(); + test_convenience_functions(); + test_email_pattern(); + test_url_pattern(); + + "all tests worked... (hopefully.. look around for \"FAILED\" messages)"; + ""; +} |
