From 6ee2fe04fb21c6c98c2618bd5d82fe36d858e1e4 Mon Sep 17 00:00:00 2001
From: kartofen <mladenovnasko0@gmail.com>
Date: Sun, 15 Jun 2025 23:53:17 +0300
Subject: testing

---
 build.sh           |   9 +++
 lexer.c            | 225 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 recursive-ascent.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 443 insertions(+)
 create mode 100755 build.sh
 create mode 100644 lexer.c
 create mode 100644 recursive-ascent.c

diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..0a69ce7
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,9 @@
+#!/bin/sh 
+
+set -xe
+
+gcc -Wall -Wextra -g lexer.c -o lexer
+gcc -Wall -Wextra -g recursive-ascent.c -o recursive-ascent
+gcc -Wall -Wextra -g recursive-ascent-descent.c -o recursive-ascent-descent
+
+valgrind --leak-check=full --show-leak-kinds=all -s ./recursive-ascent-descent
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..6df77c6
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,225 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+// TODO: - make it more memory efficient by allocating only
+//         the need amount of level for each letter
+//       - add more than 64 bits "types" for more tokens
+//         and more characters
+//       - add easier way to write chars to bits (maybe a singe string)
+
+
+#define ARR_LENGTH(arr) (sizeof(arr)/sizeof(*arr))
+
+#define popcount(x) (__builtin_popcount(x))
+
+#define MAPPED_CHARS 32
+
+#define TOKENS(X)     \
+    X(TOKEN_NONE)     \
+    X(TOKEN_TEST)     \
+    X(TOKEN_RETARDED) \
+    X(TOKEN_WOW)      \
+    X(TOKEN_TITS)     \
+    X(TOKEN_RPAREN)   \
+    X(TOKEN_LPAREN)
+
+#define TOKEN_ENUM(a) a,
+#define TOKEN_STRING(a) #a,
+
+enum token {
+    TOKENS(TOKEN_ENUM)
+};
+
+const char * const to_string[] = {
+    TOKENS(TOKEN_STRING)
+};
+
+const struct {
+    char *s;
+    enum token t;
+} strings[] = {
+    {"test", TOKEN_TEST},
+    {"retarded", TOKEN_RETARDED},
+    {"wow", TOKEN_WOW},
+    {"tits", TOKEN_TITS},
+};
+
+const uint8_t char_to_bit[256] = {
+    ['a'] = 2,  ['b'] = 3,  ['c'] = 4,  ['d'] = 5,  ['e'] = 6,  ['f'] = 7,
+    ['g'] = 8,  ['h'] = 9,  ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13,
+    ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19,
+    ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25,
+    ['y'] = 26, ['z'] = 27, [ 0 ] = 1,  [' '] = 1
+};
+
+const enum token separators[] = {
+    ['{'] = TOKEN_LPAREN,
+    ['}'] = TOKEN_RPAREN
+};
+
+struct level {
+    uint64_t bit_mask;
+    uint64_t *token_masks;
+};
+
+struct level start_level = {0};
+struct level *bit_to_ptr[MAPPED_CHARS] = {0};
+static size_t num_levels;
+
+#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]])
+
+int compile_tables(void)
+{
+    // max number of levels
+    for(size_t i = 0; i < ARR_LENGTH(strings); i++)
+        if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s);
+
+    // allocated levels
+    for(size_t i = 0; i < MAPPED_CHARS; i++) {
+        bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i]));
+        if(!bit_to_ptr[i]) return 1;
+    }
+
+    // BUG: everything is repeated for the start_level
+
+    // populate bit_masks
+    for(size_t i = 0; i < ARR_LENGTH(strings); i++) {
+        struct level *l = &start_level;
+        for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
+            uint8_t bit = char_to_bit[strings[i].s[j]];
+            
+            l->bit_mask |= 1 << bit;
+            l = &bit_to_ptr[bit][j];
+        }
+    }
+
+    // allocate token_masks
+    // NOTE: start_level alloc'd many times, so realloc is used
+    for(size_t i = 0; i < MAPPED_CHARS; i++) {
+        struct level *l = &start_level;
+        for(size_t j = 0; j < num_levels + 1; j++) {
+            l->token_masks = realloc(l->token_masks, popcount(l->bit_mask)
+                                    * sizeof(*l->token_masks));
+            memset(l->token_masks, 0, popcount(l->bit_mask)
+                   * sizeof(*l->token_masks));
+            
+            l = &bit_to_ptr[i][j];
+        }
+    }
+
+    // populate token_masks
+    for(size_t i = 0; i < ARR_LENGTH(strings); i++) {
+        struct level *l = &start_level;
+        for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
+            uint8_t bit = char_to_bit[strings[i].s[j]];            
+            uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
+
+            l->token_masks[idx] |= 1 << strings[i].t;
+            l = &bit_to_ptr[bit][j];
+        }
+    }
+    
+    return 0;
+}
+
+void print_tables(void)
+{
+    for(size_t i = 0; i < 256; i++) 
+        for(size_t j = 0; j < num_levels; j++)
+            if(CHAR_TO_PTR(i)[j].bit_mask) {
+                printf("%c, %d, %32b  ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask);
+
+                printf("{ ");
+                for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++)
+                    printf("%b ", CHAR_TO_PTR(i)[j].token_masks[k]);
+                printf(" }\n");
+            }
+
+        
+    printf("      %32b  ", start_level.bit_mask);
+    printf("{ ");
+    for(size_t k = 0; k < popcount(start_level.bit_mask); k++)
+        printf("%b ", start_level.token_masks[k]);
+    printf(" }\n");
+
+    printf("      %32s\n", "zyxwvutsrqponmlkjihgfedcbaE ");
+}
+
+int tokenize_string(char *string, enum token *t, size_t t_len)
+{
+    size_t ntokens = 0;
+    size_t i = 0;
+    size_t off = 0;
+
+    while(i < strlen(string) + 1) {
+        uint64_t token_mask = ~(uint64_t)0;
+        while(1) {
+            struct level *l = (i-off == 0)
+                ? &start_level
+                : &bit_to_ptr[char_to_bit[string[i-1]]][i-1-off];
+
+            uint8_t bit = (separators[string[i]])
+                ? 1
+                : char_to_bit[string[i]];
+
+            if((l->bit_mask & (1 << bit)) == 0) {
+                token_mask = 0;
+                while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++;
+                break;
+            }
+            
+            uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
+            token_mask &= l->token_masks[idx];
+
+            if(bit == 1) break;
+            i++;
+        }
+        
+        // BUG: not checking of ntokens is in t_len
+        if(token_mask)
+            t[ntokens++] = __builtin_ctz(token_mask);
+        else if(off != i) t[ntokens++] = TOKEN_NONE;
+
+        if(separators[string[i]])
+            t[ntokens++] = separators[string[i]];
+
+        off = ++i;
+    }
+    
+    return ntokens;
+}
+
+void free_tables(void)
+{
+    free(start_level.token_masks);
+    for(size_t i = 0; i < MAPPED_CHARS; i++) {
+        for(size_t j = 0; j < num_levels; j++) {
+            if(bit_to_ptr[i][j].token_masks)
+                free(bit_to_ptr[i][j].token_masks);
+        }
+        free(bit_to_ptr[i]);
+    }
+}
+
+// exit with a given code, default is 1
+#define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1)
+
+int main(void)
+{
+    compile_tables() && DIE();
+
+    enum token t[120] = {0};
+    size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120);
+
+    ntokens || DIE(10);
+    
+    for(size_t i = 0; i < ntokens; i++)
+        printf("%s\n", to_string[t[i]]);
+
+    print_tables();
+    free_tables();
+    return 0;
+}
diff --git a/recursive-ascent.c b/recursive-ascent.c
new file mode 100644
index 0000000..8020f7e
--- /dev/null
+++ b/recursive-ascent.c
@@ -0,0 +1,209 @@
+// file: rad_mixed_parser.c
+// Recursive Ascent–Descent parser for:
+//
+//    expr : expr '+' term
+//         | expr '-' term
+//         | term
+//
+//    term : '(' expr ')'
+//         | num
+//
+//    num  : '0' | '1'
+//
+// We implement:
+//  - A miniature LALR(1) ascent machine *just* for expr (+ and –).
+//  - A recursive‑descent helper for term/num.
+//  - No hand‑unrolling of left recursion!
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define DIE() do { \
+    fprintf(stderr, "Parse error at %s:%d\n", __FILE__, __LINE__); \
+    exit(1); \
+} while (0)
+
+enum nonterminal { EXPR, TERM, NUM };
+
+struct result {
+    enum nonterminal nonterm;
+    int               value;
+    int               depth;
+    char             *s;
+};
+
+struct args { int args[2]; };
+#define arg(...) (struct args){{__VA_ARGS__}}
+
+// forward declarations of ascent states for expr
+struct result state0(char *s, struct args a);
+struct result state4(char *s, struct args a);
+struct result state5(char *s, struct args a);
+struct result state7(char *s, struct args a);
+struct result state8(char *s, struct args a);
+struct result state9(char *s, struct args a);
+struct result state10(char *s, struct args a);
+struct result state12(char *s, struct args a);
+struct result state13(char *s, struct args a);
+
+// --- Recursive‑descent helper for term/num ------------------------------
+
+// Recognize exactly one `term` ( '(' expr ')'  or  '0' | '1' ).
+// Return the updated pointer and write the semantic value into *out.
+static char *
+descend_term(char *s, int *out)
+{
+    if (*s == '(') {
+        s++;  // consume '('
+        // call back into the ascent machine for the full expr
+        struct result r = state0(s, arg(0));
+        // drive that machine to its final EXPR reduction
+        while (r.depth == 0) {
+            switch (r.nonterm) {
+            case EXPR: r = state4(r.s, arg(r.value)); break;
+            default:    DIE();
+            }
+        }
+        // now r.nonterm==EXPR, r.depth==2, r.s points just past all of expr
+        r.depth--;  // pop the final accept
+        if (*r.s != ')') DIE();
+        *out = r.value;
+        return r.s + 1;
+    }
+    else if (*s == '0' || *s == '1') {
+        *out = (*s - '0');
+        return s + 1;
+    }
+    else {
+        DIE();
+    }
+    return s; // unreachable
+}
+
+// --- Ascent state machine for expr (“+” and “–” only) ------------------
+
+// state0: $accept: . expr $end
+//  dispatch to shift into the left‑recursive expr machine
+struct result state0(char *s, struct args a)
+{
+    struct result r = {0};
+    // Instead of shifting '(' or '0','1' directly,
+    // we “descend” for the initial term, then reduce to EXPR:
+    {
+        int t;
+        s = descend_term(s, &t);
+        r = (struct result){ TERM, t, 0, s };
+    }
+    // now fold up into EXPR until we reach a non‑term reduction
+    while (r.depth == 0) {
+        switch (r.nonterm) {
+        case EXPR: r = state4(r.s, arg(r.value)); break;
+        case TERM: r = state5(r.s, arg(r.value)); break;
+        default:    DIE();
+        }
+    }
+    r.depth--;
+    return r;
+}
+
+// state4: 0 $accept: expr . $end
+//         1 expr: expr . '+' term
+//         2 expr: expr . '-' term
+//  on '+' → state9, on '-' → state10, on '\0' → state8
+struct result state4(char *s, struct args a)
+{
+    struct result r = {0};
+    switch(*s) {
+    case '\0': r = state8(s+1,  a); break;
+    case '+':  r = state9(s+1,  a); break;
+    case '-':  r = state10(s+1, a); break;
+    default:   DIE();
+    }
+    if (r.depth == 0) DIE();
+    r.depth--;
+    return r;
+}
+
+// state5: 3 expr: term .
+//  reduce term→expr
+struct result state5(char *s, struct args a)
+{
+    return (struct result){ EXPR, a.args[0], 0, s };
+}
+
+// state7: after a nested expr in parentheses
+//   same as state4 but also handles ')'
+struct result state7(char *s, struct args a)
+{
+    struct result r = {0};
+    switch(*s) {
+    case '+': r = state9(s+1,  a); break;
+    case '-': r = state10(s+1, a); break;
+    case ')': r = state8(s+1,  a); break;  // fold up to expr→term inside ()
+    default:  DIE();
+    }
+    if (r.depth == 0) DIE();
+    r.depth--;
+    return r;
+}
+
+// state8: 0 $accept: expr $end .
+//  accept
+struct result state8(char *s, struct args a)
+{
+    return (struct result){ EXPR, a.args[0], 2, s };
+}
+
+// state9: 1 expr: expr '+' . term
+//   shift '+' then descend for *one* term, then reduce via state12
+struct result state9(char *s, struct args a)
+{
+    int t;
+    s = descend_term(s, &t);
+    return state12(s, arg(a.args[0], t));
+}
+
+// state10: 2 expr: expr '-' . term
+//   same as state9 but for '-'
+struct result state10(char *s, struct args a)
+{
+    int t;
+    s = descend_term(s, &t);
+    return state13(s, arg(a.args[0], t));
+}
+
+// state12:  1 expr: expr '+' term .
+//  reduce
+struct result state12(char *s, struct args a)
+{
+    return (struct result){ EXPR, a.args[0] + a.args[1], 2, s };
+}
+
+// state13:  2 expr: expr '-' term .
+//  reduce
+struct result state13(char *s, struct args a)
+{
+    return (struct result){ EXPR, a.args[0] - a.args[1], 2, s };
+}
+
+// --- main entry point ---------------------------------------------------
+
+int main(void)
+{
+    const char *input = "1+((0-1)+1)";
+    struct result r = state0((char*)input, arg(0));
+
+    // finish driving the ascent machine if needed
+    while (r.depth == 0) {
+        switch (r.nonterm) {
+        case EXPR: r = state4(r.s, arg(r.value)); break;
+        default:    DIE();
+        }
+    }
+    if (r.nonterm != EXPR || r.depth != 2 || *r.s != '\0') {
+        DIE();
+    }
+
+    printf("result = %d\n", r.value);
+    return 0;
+}
-- 
cgit v1.2.3