From 6ee2fe04fb21c6c98c2618bd5d82fe36d858e1e4 Mon Sep 17 00:00:00 2001 From: kartofen Date: Sun, 15 Jun 2025 23:53:17 +0300 Subject: testing --- build.sh | 9 +++ lexer.c | 225 +++++++++++++++++++++++++++++++++++++++++++++++++++++ recursive-ascent.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 443 insertions(+) create mode 100755 build.sh create mode 100644 lexer.c create mode 100644 recursive-ascent.c diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..0a69ce7 --- /dev/null +++ b/build.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +set -xe + +gcc -Wall -Wextra -g lexer.c -o lexer +gcc -Wall -Wextra -g recursive-ascent.c -o recursive-ascent +gcc -Wall -Wextra -g recursive-ascent-descent.c -o recursive-ascent-descent + +valgrind --leak-check=full --show-leak-kinds=all -s ./recursive-ascent-descent diff --git a/lexer.c b/lexer.c new file mode 100644 index 0000000..6df77c6 --- /dev/null +++ b/lexer.c @@ -0,0 +1,225 @@ +#include +#include +#include +#include +#include + +// TODO: - make it more memory efficient by allocating only +// the need amount of level for each letter +// - add more than 64 bits "types" for more tokens +// and more characters +// - add easier way to write chars to bits (maybe a singe string) + + +#define ARR_LENGTH(arr) (sizeof(arr)/sizeof(*arr)) + +#define popcount(x) (__builtin_popcount(x)) + +#define MAPPED_CHARS 32 + +#define TOKENS(X) \ + X(TOKEN_NONE) \ + X(TOKEN_TEST) \ + X(TOKEN_RETARDED) \ + X(TOKEN_WOW) \ + X(TOKEN_TITS) \ + X(TOKEN_RPAREN) \ + X(TOKEN_LPAREN) + +#define TOKEN_ENUM(a) a, +#define TOKEN_STRING(a) #a, + +enum token { + TOKENS(TOKEN_ENUM) +}; + +const char * const to_string[] = { + TOKENS(TOKEN_STRING) +}; + +const struct { + char *s; + enum token t; +} strings[] = { + {"test", TOKEN_TEST}, + {"retarded", TOKEN_RETARDED}, + {"wow", TOKEN_WOW}, + {"tits", TOKEN_TITS}, +}; + +const uint8_t char_to_bit[256] = { + ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7, + ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13, + ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19, + ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25, + ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1 +}; + +const enum token separators[] = { + ['{'] = TOKEN_LPAREN, + ['}'] = TOKEN_RPAREN +}; + +struct level { + uint64_t bit_mask; + uint64_t *token_masks; +}; + +struct level start_level = {0}; +struct level *bit_to_ptr[MAPPED_CHARS] = {0}; +static size_t num_levels; + +#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]]) + +int compile_tables(void) +{ + // max number of levels + for(size_t i = 0; i < ARR_LENGTH(strings); i++) + if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s); + + // allocated levels + for(size_t i = 0; i < MAPPED_CHARS; i++) { + bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i])); + if(!bit_to_ptr[i]) return 1; + } + + // BUG: everything is repeated for the start_level + + // populate bit_masks + for(size_t i = 0; i < ARR_LENGTH(strings); i++) { + struct level *l = &start_level; + for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { + uint8_t bit = char_to_bit[strings[i].s[j]]; + + l->bit_mask |= 1 << bit; + l = &bit_to_ptr[bit][j]; + } + } + + // allocate token_masks + // NOTE: start_level alloc'd many times, so realloc is used + for(size_t i = 0; i < MAPPED_CHARS; i++) { + struct level *l = &start_level; + for(size_t j = 0; j < num_levels + 1; j++) { + l->token_masks = realloc(l->token_masks, popcount(l->bit_mask) + * sizeof(*l->token_masks)); + memset(l->token_masks, 0, popcount(l->bit_mask) + * sizeof(*l->token_masks)); + + l = &bit_to_ptr[i][j]; + } + } + + // populate token_masks + for(size_t i = 0; i < ARR_LENGTH(strings); i++) { + struct level *l = &start_level; + for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { + uint8_t bit = char_to_bit[strings[i].s[j]]; + uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); + + l->token_masks[idx] |= 1 << strings[i].t; + l = &bit_to_ptr[bit][j]; + } + } + + return 0; +} + +void print_tables(void) +{ + for(size_t i = 0; i < 256; i++) + for(size_t j = 0; j < num_levels; j++) + if(CHAR_TO_PTR(i)[j].bit_mask) { + printf("%c, %d, %32b ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); + + printf("{ "); + for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++) + printf("%b ", CHAR_TO_PTR(i)[j].token_masks[k]); + printf(" }\n"); + } + + + printf(" %32b ", start_level.bit_mask); + printf("{ "); + for(size_t k = 0; k < popcount(start_level.bit_mask); k++) + printf("%b ", start_level.token_masks[k]); + printf(" }\n"); + + printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE "); +} + +int tokenize_string(char *string, enum token *t, size_t t_len) +{ + size_t ntokens = 0; + size_t i = 0; + size_t off = 0; + + while(i < strlen(string) + 1) { + uint64_t token_mask = ~(uint64_t)0; + while(1) { + struct level *l = (i-off == 0) + ? &start_level + : &bit_to_ptr[char_to_bit[string[i-1]]][i-1-off]; + + uint8_t bit = (separators[string[i]]) + ? 1 + : char_to_bit[string[i]]; + + if((l->bit_mask & (1 << bit)) == 0) { + token_mask = 0; + while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++; + break; + } + + uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); + token_mask &= l->token_masks[idx]; + + if(bit == 1) break; + i++; + } + + // BUG: not checking of ntokens is in t_len + if(token_mask) + t[ntokens++] = __builtin_ctz(token_mask); + else if(off != i) t[ntokens++] = TOKEN_NONE; + + if(separators[string[i]]) + t[ntokens++] = separators[string[i]]; + + off = ++i; + } + + return ntokens; +} + +void free_tables(void) +{ + free(start_level.token_masks); + for(size_t i = 0; i < MAPPED_CHARS; i++) { + for(size_t j = 0; j < num_levels; j++) { + if(bit_to_ptr[i][j].token_masks) + free(bit_to_ptr[i][j].token_masks); + } + free(bit_to_ptr[i]); + } +} + +// exit with a given code, default is 1 +#define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1) + +int main(void) +{ + compile_tables() && DIE(); + + enum token t[120] = {0}; + size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120); + + ntokens || DIE(10); + + for(size_t i = 0; i < ntokens; i++) + printf("%s\n", to_string[t[i]]); + + print_tables(); + free_tables(); + return 0; +} diff --git a/recursive-ascent.c b/recursive-ascent.c new file mode 100644 index 0000000..8020f7e --- /dev/null +++ b/recursive-ascent.c @@ -0,0 +1,209 @@ +// file: rad_mixed_parser.c +// Recursive Ascent–Descent parser for: +// +// expr : expr '+' term +// | expr '-' term +// | term +// +// term : '(' expr ')' +// | num +// +// num : '0' | '1' +// +// We implement: +// - A miniature LALR(1) ascent machine *just* for expr (+ and –). +// - A recursive‑descent helper for term/num. +// - No hand‑unrolling of left recursion! + +#include +#include + +#define DIE() do { \ + fprintf(stderr, "Parse error at %s:%d\n", __FILE__, __LINE__); \ + exit(1); \ +} while (0) + +enum nonterminal { EXPR, TERM, NUM }; + +struct result { + enum nonterminal nonterm; + int value; + int depth; + char *s; +}; + +struct args { int args[2]; }; +#define arg(...) (struct args){{__VA_ARGS__}} + +// forward declarations of ascent states for expr +struct result state0(char *s, struct args a); +struct result state4(char *s, struct args a); +struct result state5(char *s, struct args a); +struct result state7(char *s, struct args a); +struct result state8(char *s, struct args a); +struct result state9(char *s, struct args a); +struct result state10(char *s, struct args a); +struct result state12(char *s, struct args a); +struct result state13(char *s, struct args a); + +// --- Recursive‑descent helper for term/num ------------------------------ + +// Recognize exactly one `term` ( '(' expr ')' or '0' | '1' ). +// Return the updated pointer and write the semantic value into *out. +static char * +descend_term(char *s, int *out) +{ + if (*s == '(') { + s++; // consume '(' + // call back into the ascent machine for the full expr + struct result r = state0(s, arg(0)); + // drive that machine to its final EXPR reduction + while (r.depth == 0) { + switch (r.nonterm) { + case EXPR: r = state4(r.s, arg(r.value)); break; + default: DIE(); + } + } + // now r.nonterm==EXPR, r.depth==2, r.s points just past all of expr + r.depth--; // pop the final accept + if (*r.s != ')') DIE(); + *out = r.value; + return r.s + 1; + } + else if (*s == '0' || *s == '1') { + *out = (*s - '0'); + return s + 1; + } + else { + DIE(); + } + return s; // unreachable +} + +// --- Ascent state machine for expr (“+” and “–” only) ------------------ + +// state0: $accept: . expr $end +// dispatch to shift into the left‑recursive expr machine +struct result state0(char *s, struct args a) +{ + struct result r = {0}; + // Instead of shifting '(' or '0','1' directly, + // we “descend” for the initial term, then reduce to EXPR: + { + int t; + s = descend_term(s, &t); + r = (struct result){ TERM, t, 0, s }; + } + // now fold up into EXPR until we reach a non‑term reduction + while (r.depth == 0) { + switch (r.nonterm) { + case EXPR: r = state4(r.s, arg(r.value)); break; + case TERM: r = state5(r.s, arg(r.value)); break; + default: DIE(); + } + } + r.depth--; + return r; +} + +// state4: 0 $accept: expr . $end +// 1 expr: expr . '+' term +// 2 expr: expr . '-' term +// on '+' → state9, on '-' → state10, on '\0' → state8 +struct result state4(char *s, struct args a) +{ + struct result r = {0}; + switch(*s) { + case '\0': r = state8(s+1, a); break; + case '+': r = state9(s+1, a); break; + case '-': r = state10(s+1, a); break; + default: DIE(); + } + if (r.depth == 0) DIE(); + r.depth--; + return r; +} + +// state5: 3 expr: term . +// reduce term→expr +struct result state5(char *s, struct args a) +{ + return (struct result){ EXPR, a.args[0], 0, s }; +} + +// state7: after a nested expr in parentheses +// same as state4 but also handles ')' +struct result state7(char *s, struct args a) +{ + struct result r = {0}; + switch(*s) { + case '+': r = state9(s+1, a); break; + case '-': r = state10(s+1, a); break; + case ')': r = state8(s+1, a); break; // fold up to expr→term inside () + default: DIE(); + } + if (r.depth == 0) DIE(); + r.depth--; + return r; +} + +// state8: 0 $accept: expr $end . +// accept +struct result state8(char *s, struct args a) +{ + return (struct result){ EXPR, a.args[0], 2, s }; +} + +// state9: 1 expr: expr '+' . term +// shift '+' then descend for *one* term, then reduce via state12 +struct result state9(char *s, struct args a) +{ + int t; + s = descend_term(s, &t); + return state12(s, arg(a.args[0], t)); +} + +// state10: 2 expr: expr '-' . term +// same as state9 but for '-' +struct result state10(char *s, struct args a) +{ + int t; + s = descend_term(s, &t); + return state13(s, arg(a.args[0], t)); +} + +// state12: 1 expr: expr '+' term . +// reduce +struct result state12(char *s, struct args a) +{ + return (struct result){ EXPR, a.args[0] + a.args[1], 2, s }; +} + +// state13: 2 expr: expr '-' term . +// reduce +struct result state13(char *s, struct args a) +{ + return (struct result){ EXPR, a.args[0] - a.args[1], 2, s }; +} + +// --- main entry point --------------------------------------------------- + +int main(void) +{ + const char *input = "1+((0-1)+1)"; + struct result r = state0((char*)input, arg(0)); + + // finish driving the ascent machine if needed + while (r.depth == 0) { + switch (r.nonterm) { + case EXPR: r = state4(r.s, arg(r.value)); break; + default: DIE(); + } + } + if (r.nonterm != EXPR || r.depth != 2 || *r.s != '\0') { + DIE(); + } + + printf("result = %d\n", r.value); + return 0; +} -- cgit v1.2.3