generate parser options and lexer and dect separation

author: kartofen <mladenovnasko0@gmail.com> 2025-07-08 00:21:07 +0300
committer: kartofen <mladenovnasko0@gmail.com> 2025-07-08 00:21:07 +0300
commit: 919611902c39fd70afe1162883ee6bfd34f2642e (patch)
tree: 983528d33ed57e3ebc7792971507e6f9fff383ff /lexer.c
parent: a6cb97af1a2a5491f54fcfa7064641dc0aafd898 (diff)
1 files changed, 62 insertions, 210 deletions
diff --git a/lexer.c b/lexer.c
index 2fca9a7..7ebf2e7 100644
--- a/lexer.c
+++ b/lexer.c
@@ -1,233 +1,85 @@
 #include <stdio.h>
-#include <stdint.h>
-#include <string.h>
 #include <stdlib.h>
-#include <assert.h>
-
-// TODO: - make it more memory efficient by allocating only
-//         the need amount of level for each letter
-//       - add more than 64 bits "types" for more tokens
-//         and more characters
-//       - add easier way to write chars to bits (maybe a singe string)
-
-#define ARR_LEN(arr) (sizeof(arr) / sizeof(*arr))
-
-typedef int token;
-extern const char *const token_to_string[];
-extern const struct string_token { char *s; token t;} strings[];
-extern const token separators[];
-
-#ifdef _LEXER_STANDALONE
-
-#define TOKENS(X)     \
-    X(TOKEN_NONE)     \
-    X(TOKEN_TEST)     \
-    X(TOKEN_RETARDED) \
-    X(TOKEN_WOW)      \
-    X(TOKEN_TITS)     \
-    X(TOKEN_RPAREN)   \
-    X(TOKEN_LPAREN)
-
-#define TOKEN_ENUM(a) a,
-#define TOKEN_STRING(a) #a,
-
-enum token {
-    TOKENS(TOKEN_ENUM)
-};
-
-const char * const token_to_string[] = {
-    TOKENS(TOKEN_STRING)
-};
-
-const struct string_token {
-    char *s;
-    token t;
-} strings[] = {
-    {"test", TOKEN_TEST},
-    {"retarded", TOKEN_RETARDED},
-    {"wow", TOKEN_WOW},
-    {"tits", TOKEN_TITS},
-};
-
-const token separators[] = {['{'] = TOKEN_LPAREN, ['}'] = TOKEN_RPAREN};
-
-#endif
-
-const uint8_t char_to_bit[256] = {
-    ['a'] = 2,  ['b'] = 3,  ['c'] = 4,  ['d'] = 5,  ['e'] = 6,  ['f'] = 7,
-    ['g'] = 8,  ['h'] = 9,  ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13,
-    ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19,
-    ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25,
-    ['y'] = 26, ['z'] = 27, [ 0 ] = 1,  [' '] = 1
-};
+#include <ctype.h>
+#include <string.h>
 
-struct level {
-    uint64_t bit_mask;
-    uint64_t *token_masks;
+struct token {
+    enum symbol {
+        LPAREN, RPAREN, STRING, IDEN, NUM
+    } sym;
+    
+    union {
+        char *iden;
+        int i;
+        char *str;
+    };
 };
 
-#define MAPPED_CHARS 32
-static struct level start_level = {0};
-static struct level *bit_to_ptr[MAPPED_CHARS] = {0};
-static size_t num_levels;
-
-#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]])
-
-#define popcount(x) (__builtin_popcount(x))
+static struct token tok;
 
-int compile_lextables(void)
+static inline int issep(char c)
 {
-    // max number of levels
-    for(size_t i = 0; i < ARR_LEN(strings); i++)
-        if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s);
-
-    // allocated levels
-    for(size_t i = 0; i < MAPPED_CHARS; i++) {
-        bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i]));
-        if(!bit_to_ptr[i]) return 1;
-    }
-
-    // BUG: everything is repeated for the start_level
-
-    // populate bit_masks
-    for(size_t i = 0; i < ARR_LEN(strings); i++) {
-        struct level *l = &start_level;
-        for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
-            uint8_t bit = char_to_bit[strings[i].s[j]];
-
-            l->bit_mask |= 1 << bit;
-            l = &bit_to_ptr[bit][j];
-        }
-    }
-
-    // allocate token_masks
-    // NOTE: start_level alloc'd many times, so realloc is used
-    for(size_t i = 0; i < MAPPED_CHARS; i++) {
-        struct level *l = &start_level;
-        for(size_t j = 0; j < num_levels + 1; j++) {
-            l->token_masks = realloc(l->token_masks, popcount(l->bit_mask)
-                                    * sizeof(*l->token_masks));
-            memset(l->token_masks, 0, popcount(l->bit_mask)
-                   * sizeof(*l->token_masks));
-
-            l = &bit_to_ptr[i][j];
-        }
-    }
-
-    // populate token_masks
-    for(size_t i = 0; i < ARR_LEN(strings); i++) {
-        struct level *l = &start_level;
-        for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
-            uint8_t bit = char_to_bit[strings[i].s[j]];
-            uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
-
-            l->token_masks[idx] |= 1 << strings[i].t;
-            l = &bit_to_ptr[bit][j];
-        }
-    }
-
-    return 0;
+    return isspace(c) || c == '\0' || c == '}' || c == '{' || c == '"';
 }
-
-void print_lextables(void)
+    
+static inline int tillsep(char *str)
 {
-    for(size_t i = 0; i < 256; i++)
-        for(size_t j = 0; j < num_levels; j++)
-            if(CHAR_TO_PTR(i)[j].bit_mask) {
-                printf("%c, %zu, %32lb  ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask);
-
-                printf("{ ");
-                for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++)
-                    printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]);
-                printf(" }\n");
-            }
-
-
-    printf("      %32lb  ", start_level.bit_mask);
-    printf("{ ");
-    for(size_t k = 0; k < popcount(start_level.bit_mask); k++)
-        printf("%lb ", start_level.token_masks[k]);
-    printf(" }\n");
-
-    printf("      %32s\n", "zyxwvutsrqponmlkjihgfedcbaE ");
+    size_t i = 0;
+    while(!issep(str[i++]));
+    return i-1;
 }
 
-int tokenize_string(char *string, token *t, size_t t_len)
+static inline char *substring(char *str, size_t sub_end)
 {
-    size_t ntokens = 0;
-    size_t i = 0;
-    size_t off = 0;
-
-    while(i < strlen(string) + 1) {
-        uint64_t token_mask = ~(uint64_t)0;
-        while(1) {
-            struct level *l = (i-off == 0)
-                ? &start_level
-                : &bit_to_ptr[char_to_bit[string[i-1]]][i-1-off];
-
-            uint8_t bit = (separators[string[i]])
-                ? 1
-                : char_to_bit[string[i]];
-
-            if((l->bit_mask & (1 << bit)) == 0) {
-                token_mask = 0;
-                while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++;
-                break;
-            }
-
-            uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
-            token_mask &= l->token_masks[idx];
-
-            if(bit == 1) break;
-            i++;
-        }
-
-        // BUG: not checking of ntokens is in t_len
-        if(token_mask)
-            t[ntokens++] = __builtin_ctz(token_mask);
-        else if(off != i) t[ntokens++] = TOKEN_NONE;
-
-        if(separators[string[i]])
-            t[ntokens++] = separators[string[i]];
-
-        off = ++i;
-    }
-
-    return ntokens;
+    static char sub[128];
+    if(sub_end+1 > sizeof(sub)) return NULL;
+    
+    sub[sub_end+1] = '\0';
+    return memcpy(sub, str, sub_end);
 }
 
-void free_lextables(void)
+static char *next_token(char *str)
 {
-    free(start_level.token_masks);
-    for(size_t i = 0; i < MAPPED_CHARS; i++) {
-        for(size_t j = 0; j < num_levels; j++) {
-            if(bit_to_ptr[i][j].token_masks)
-                free(bit_to_ptr[i][j].token_masks);
+    size_t off = 0;
+    char c0 = str[0];
+
+    if(c0 == '\0')  return NULL;
+    if(isspace(c0)) return next_token(str+1);
+    else {
+        off = tillsep(str);
+        if(off == 0) { // sep
+            switch(str[off++]) {
+            case '{': tok.sym = LPAREN; break;
+            case '}': tok.sym = RPAREN; break;
+            case '"':
+                while(str[off] != '"') if(str[off++] == '\0') return NULL;
+                tok.sym = STRING;
+                tok.str = strdup(substring(str, off));
+            }
+        } else if(isalpha(c0)) { // iden
+            tok.sym = IDEN;
+            tok.iden = strdup(substring(str, off));
+        } else if(c0 >= '0' && c0 <= '9') { // num
+            tok.sym = NUM;
+            tok.i = atoi(substring(str, off));
         }
-        free(bit_to_ptr[i]);
     }
+    
+    return str+off;
 }
 
-#ifdef _LEXER_STANDALONE
-
-// exit with a given code, default is 1
-#define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1)
-
 int main(void)
 {
-    compile_lextables() && DIE();
-
-    token t[120] = {0};
-    size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120);
-
-    ntokens || DIE(10);
-
-    for(size_t i = 0; i < ntokens; i++)
-        printf("%s\n", token_to_string[t[i]]);
-
-    print_lextables();
-    free_lextables();
+    char *str = "blah 0 1 443 test{here}13}{1\"fdlkfjakl{fher}  fdsfj\" here {  {tok {";
+    while((str = next_token(str)))
+        switch(tok.sym) {
+        case LPAREN: printf("{ "); break;
+        case RPAREN: printf("} "); break;
+        case STRING: printf("\"%s\" ", tok.str); free(tok.str); break;
+        case IDEN:   printf("'%s' ", tok.iden); free(tok.iden); break;
+        case NUM:    printf("%d ", tok.i); break;
+        }
+    
+    printf("\n");
     return 0;
 }
-
-#endif
author	kartofen <mladenovnasko0@gmail.com>	2025-07-08 00:21:07 +0300
committer	kartofen <mladenovnasko0@gmail.com>	2025-07-08 00:21:07 +0300
commit	919611902c39fd70afe1162883ee6bfd34f2642e (patch)
tree	983528d33ed57e3ebc7792971507e6f9fff383ff /lexer.c
parent	a6cb97af1a2a5491f54fcfa7064641dc0aafd898 (diff)