diff options
author | kartofen <mladenovnasko0@gmail.com> | 2025-07-08 00:21:07 +0300 |
---|---|---|
committer | kartofen <mladenovnasko0@gmail.com> | 2025-07-08 00:21:07 +0300 |
commit | 919611902c39fd70afe1162883ee6bfd34f2642e (patch) | |
tree | 983528d33ed57e3ebc7792971507e6f9fff383ff | |
parent | a6cb97af1a2a5491f54fcfa7064641dc0aafd898 (diff) |
generate parser options and lexer and dict separation
-rwxr-xr-x | build.sh | 8 | ||||
-rw-r--r-- | clr-table.c | 2 | ||||
-rw-r--r-- | demos/generate-parser.c | 66 | ||||
-rw-r--r-- | dict.c | 188 | ||||
-rw-r--r-- | lexer.c | 272 | ||||
-rw-r--r-- | parts/toklist.h | 9 |
6 files changed, 309 insertions, 236 deletions
@@ -27,7 +27,8 @@ function leak log valgrind --leak-check=full --show-leak-kinds=all -s bin/$1 $2 } -# cc lexer -D_LEXER_STANDALONE +cc dict -D_DICT_STANDALONE +cc lexer -D_LEXER_STANDALONE # cc recursive/recursive-ascent # cc recursive/recursive-ascent-descent # cc util-tables -D_UTIL_TABLES_STANDALONE @@ -38,7 +39,8 @@ function leak # cc demos/instant-parser -# leak lexer +leak lexer +leak dict # leak recursive-ascent # leak recursive-ascent-descent # leak util-tables @@ -59,6 +61,6 @@ shared slr-table shared clr-table shared clr-table -D_LAZY_LALR lalr-table -leak "generate-parser -t slr-table bin/arithmetic-defs.so" > bin/generated.c +leak "generate-parser -t lalr-table bin/arithmetic-defs.so" cc demos/sample-files/parser-skeleton "" parser # this includes bin/generated.c leak parser "0-1+(1+0)-1+0" diff --git a/clr-table.c b/clr-table.c index 001a45c..892facf 100644 --- a/clr-table.c +++ b/clr-table.c @@ -90,7 +90,7 @@ static size_t itemset_handle(struct item *set, size_t nset) if(!item_core_eq(&seen_sets[i].items[k], &set[j])) _same_core = 0; if(!_same_core) break; } - if(_same_core) { use_state = seen_sets[i].state; break; } + if(_same_core) { use_state = seen_sets[i].state; /*break;*/ } #endif } diff --git a/demos/generate-parser.c b/demos/generate-parser.c index 5766223..7856db6 100644 --- a/demos/generate-parser.c +++ b/demos/generate-parser.c @@ -1,9 +1,13 @@ #include <stdio.h> #include <stdlib.h> #include <dlfcn.h> +#include <string.h> +#include <unistd.h> // getopt +#include <assert.h> #define DEFUALT_PATH "./bin" #define DEFUALT_TYPE "lalr-table" +#define DEFAULT_OUTPUT "bin/a" #include "parts/symbol.h" size_t total_symbols; @@ -43,32 +47,52 @@ void *xdlsym(void *handle, char *sym) char *modpath(char *name) { static char fullpath[128]; - char *path = getenv("GENERATE_PARSER_PATH"); - if(!path) path = DEFUALT_PATH; - snprintf(fullpath, 128, "%s/%s.so", path, name); + // TODO: search the GENERATE_PARSER_PATH env var + char *path = DEFUALT_PATH; + + 
assert(snprintf(fullpath, 128, "%s/%s.so", path, name) < 128); return fullpath; } +char *add_extension(char *str, char *ext) +{ + static char full[128]; + assert((strlen(str) + strlen(ext) + 1) <= 128); + memcpy(full, str, strlen(str)+1); + return strcat(full, ext); +} + +void set_stdout(char *filename) +{ + if(!filename) filename = "/dev/tty"; + assert(freopen(filename, "a+", stdout)); +} + int main(int argc, char **argv) { - if(argc < 2) return -1; - - void *table_handle; - void *def_handle; - - if(argc == 2) { - table_handle = dlopen(modpath(DEFUALT_TYPE), RTLD_LAZY); - if(!table_handle) { fputs(dlerror(), stderr); return 1; } - def_handle = dlopen(argv[1], RTLD_LAZY); - if(!def_handle) { fputs(dlerror(), stderr); return 1; } - } else if(argc == 4 && - argv[1][0] == '-' && argv[1][1] == 't') { - table_handle = dlopen(modpath(argv[2]), RTLD_LAZY); - if(!table_handle) { fputs(dlerror(), stderr); return 1; } - def_handle = dlopen(argv[3], RTLD_LAZY); - if(!def_handle) { fputs(dlerror(), stderr); return 1; } - } else return -1; + char *type = DEFUALT_TYPE; + char *output_path = DEFAULT_OUTPUT; + + int opt; + while((opt = getopt(argc, argv, "t:o:")) != -1) { + switch(opt) { + case 't': type = optarg; break; + case 'o': output_path = optarg; break; + default: fprintf(stderr, "ERROR: Unknown options '%c'\n", optopt); + return 1; + } + } + + if(optind == argc) { + fprintf(stderr, "ERROR: No input file\n"); + return 1; + } + + void *table_handle = dlopen(modpath(type), RTLD_LAZY); + if(!table_handle) { fputs(dlerror(), stderr); return 1; } + void *def_handle = dlopen(argv[optind], RTLD_LAZY); + if(!def_handle) { fputs(dlerror(), stderr); return 1; } GET_VARIABLE(table, table_handle); GET_VARIABLE(table_states, table_handle); @@ -91,6 +115,7 @@ int main(int argc, char **argv) goto cleanup; } + set_stdout(add_extension(output_path, ".c")); printf("size_t total_symbols = %zu;\n", total_symbols); printf("IMPLEMENT_FUNCPTR(int, symbol_is_valid, (symbol s)) {return s < 
total_symbols;}\n"); @@ -121,6 +146,7 @@ int main(int argc, char **argv) for(size_t i = 0; i < total_productions; i++) printf("__prod%zu_action, ", i); printf("};"); + set_stdout(NULL); cleanup: table_free(); @@ -0,0 +1,188 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> + +extern const struct string_token { + char *s; + int t; +} strings[]; +extern const size_t nstrings; + +extern const uint8_t char_to_bit[]; + +struct level { + uint64_t bit_mask; + uint64_t *token_masks; +}; + +#define MAPPED_CHARS 32 +static struct level start_level = {0}; +static struct level *bit_to_ptr[MAPPED_CHARS] = {0}; +static size_t num_levels; + +#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]]) +#define popcount(x) (__builtin_popcount(x)) + +int dict_compile(void) +{ + // max number of levels + for(size_t i = 0; i < nstrings; i++) + if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s); + + // allocated levels + for(size_t i = 0; i < MAPPED_CHARS; i++) { + bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i])); + if(!bit_to_ptr[i]) return 1; + } + + // BUG: everything is repeated for the start_level + + // populate bit_masks + for(size_t i = 0; i < nstrings; i++) { + struct level *l = &start_level; + for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { + uint8_t bit = char_to_bit[strings[i].s[j]]; + + l->bit_mask |= 1 << bit; + l = &bit_to_ptr[bit][j]; + } + } + + // allocate token_masks + // NOTE: start_level alloc'd many times, so realloc is used + for(size_t i = 0; i < MAPPED_CHARS; i++) { + struct level *l = &start_level; + for(size_t j = 0; j < num_levels + 1; j++) { + l->token_masks = realloc(l->token_masks, popcount(l->bit_mask) + * sizeof(*l->token_masks)); + memset(l->token_masks, 0, popcount(l->bit_mask) + * sizeof(*l->token_masks)); + + l = &bit_to_ptr[i][j]; + } + } + + // populate token_masks + for(size_t i = 0; i < nstrings; i++) { + struct level *l = &start_level; + for(size_t j = 0; j < 
strlen(strings[i].s)+1; j++) { + uint8_t bit = char_to_bit[strings[i].s[j]]; + uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); + + l->token_masks[idx] |= 1 << strings[i].t; + l = &bit_to_ptr[bit][j]; + } + } + + return 0; +} + +void dict_print(void) +{ + for(size_t i = 0; i < 256; i++) + for(size_t j = 0; j < num_levels; j++) + if(CHAR_TO_PTR(i)[j].bit_mask) { + printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); + + printf("{ "); + for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++) + printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]); + printf(" }\n"); + } + + + printf(" %32lb ", start_level.bit_mask); + printf("{ "); + for(size_t k = 0; k < popcount(start_level.bit_mask); k++) + printf("%lb ", start_level.token_masks[k]); + printf(" }\n"); + + printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE "); +} + +void dict_free(void) +{ + free(start_level.token_masks); + for(size_t i = 0; i < MAPPED_CHARS; i++) { + for(size_t j = 0; j < num_levels; j++) { + if(bit_to_ptr[i][j].token_masks) + free(bit_to_ptr[i][j].token_masks); + } + free(bit_to_ptr[i]); + } +} + +int dict_check(char *string) +{ + uint64_t token_mask = ~(uint64_t)0; + + for(size_t i = 0; i < strlen(string) + 1; i++) { + struct level *l = (i == 0) + ? 
&start_level + : &bit_to_ptr[char_to_bit[string[i-1]]][i-1]; + + uint8_t bit = char_to_bit[string[i]]; + + if((l->bit_mask & (1 << bit)) == 0) return -1; + + uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); + token_mask &= l->token_masks[idx]; + } + + if(token_mask) return __builtin_ctz(token_mask); + else return -1; +} + +#ifdef _DICT_STANDALONE + +#define TOKENS(X) \ + X(TOKEN_NONE) \ + X(TOKEN_TEST) \ + X(TOKEN_RETARDED) \ + X(TOKEN_WOW) \ + X(TOKEN_TITS) \ + X(TOKEN_RPAREN) \ + X(TOKEN_LPAREN) + +#define TOKEN_ENUM(a) a, +#define TOKEN_STRING(a) #a, + +enum token { + TOKENS(TOKEN_ENUM) +}; + +const char * const token_to_string[] = { + TOKENS(TOKEN_STRING) +}; +const struct string_token strings[] = { + {"test", TOKEN_TEST}, + {"retarded", TOKEN_RETARDED}, + {"wow", TOKEN_WOW}, + {"tits", TOKEN_TITS}, +}; + +const size_t nstrings = 4; + +const uint8_t char_to_bit[256] = { + ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7, + ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13, + ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19, + ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25, + ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1 +}; + +int main(void) +{ + dict_compile(); + + int t; + if((t = dict_check("tits")) >= 0) printf("%s\n", token_to_string[t]); + if((t = dict_check("retarded")) >= 0) printf("%s\n", token_to_string[t]); + if((t = dict_check("test2")) >= 0) printf("%s\n", token_to_string[t]); + if((t = dict_check("tes")) >= 0) printf("%s\n", token_to_string[t]); + + dict_free(); +} + +#endif @@ -1,233 +1,85 @@ #include <stdio.h> -#include <stdint.h> -#include <string.h> #include <stdlib.h> -#include <assert.h> - -// TODO: - make it more memory efficient by allocating only -// the need amount of level for each letter -// - add more than 64 bits "types" for more tokens -// and more characters -// - add easier way to write chars to bits (maybe a singe string) - -#define 
ARR_LEN(arr) (sizeof(arr) / sizeof(*arr)) - -typedef int token; -extern const char *const token_to_string[]; -extern const struct string_token { char *s; token t;} strings[]; -extern const token separators[]; - -#ifdef _LEXER_STANDALONE - -#define TOKENS(X) \ - X(TOKEN_NONE) \ - X(TOKEN_TEST) \ - X(TOKEN_RETARDED) \ - X(TOKEN_WOW) \ - X(TOKEN_TITS) \ - X(TOKEN_RPAREN) \ - X(TOKEN_LPAREN) - -#define TOKEN_ENUM(a) a, -#define TOKEN_STRING(a) #a, - -enum token { - TOKENS(TOKEN_ENUM) -}; - -const char * const token_to_string[] = { - TOKENS(TOKEN_STRING) -}; - -const struct string_token { - char *s; - token t; -} strings[] = { - {"test", TOKEN_TEST}, - {"retarded", TOKEN_RETARDED}, - {"wow", TOKEN_WOW}, - {"tits", TOKEN_TITS}, -}; - -const token separators[] = {['{'] = TOKEN_LPAREN, ['}'] = TOKEN_RPAREN}; - -#endif - -const uint8_t char_to_bit[256] = { - ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7, - ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13, - ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19, - ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25, - ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1 -}; +#include <ctype.h> +#include <string.h> -struct level { - uint64_t bit_mask; - uint64_t *token_masks; +struct token { + enum symbol { + LPAREN, RPAREN, STRING, IDEN, NUM + } sym; + + union { + char *iden; + int i; + char *str; + }; }; -#define MAPPED_CHARS 32 -static struct level start_level = {0}; -static struct level *bit_to_ptr[MAPPED_CHARS] = {0}; -static size_t num_levels; - -#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]]) - -#define popcount(x) (__builtin_popcount(x)) +static struct token tok; -int compile_lextables(void) +static inline int issep(char c) { - // max number of levels - for(size_t i = 0; i < ARR_LEN(strings); i++) - if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s); - - // allocated levels - for(size_t i = 0; i < MAPPED_CHARS; i++) { - 
bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i])); - if(!bit_to_ptr[i]) return 1; - } - - // BUG: everything is repeated for the start_level - - // populate bit_masks - for(size_t i = 0; i < ARR_LEN(strings); i++) { - struct level *l = &start_level; - for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { - uint8_t bit = char_to_bit[strings[i].s[j]]; - - l->bit_mask |= 1 << bit; - l = &bit_to_ptr[bit][j]; - } - } - - // allocate token_masks - // NOTE: start_level alloc'd many times, so realloc is used - for(size_t i = 0; i < MAPPED_CHARS; i++) { - struct level *l = &start_level; - for(size_t j = 0; j < num_levels + 1; j++) { - l->token_masks = realloc(l->token_masks, popcount(l->bit_mask) - * sizeof(*l->token_masks)); - memset(l->token_masks, 0, popcount(l->bit_mask) - * sizeof(*l->token_masks)); - - l = &bit_to_ptr[i][j]; - } - } - - // populate token_masks - for(size_t i = 0; i < ARR_LEN(strings); i++) { - struct level *l = &start_level; - for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { - uint8_t bit = char_to_bit[strings[i].s[j]]; - uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); - - l->token_masks[idx] |= 1 << strings[i].t; - l = &bit_to_ptr[bit][j]; - } - } - - return 0; + return isspace(c) || c == '\0' || c == '}' || c == '{' || c == '"'; } - -void print_lextables(void) + +static inline int tillsep(char *str) { - for(size_t i = 0; i < 256; i++) - for(size_t j = 0; j < num_levels; j++) - if(CHAR_TO_PTR(i)[j].bit_mask) { - printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); - - printf("{ "); - for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++) - printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]); - printf(" }\n"); - } - - - printf(" %32lb ", start_level.bit_mask); - printf("{ "); - for(size_t k = 0; k < popcount(start_level.bit_mask); k++) - printf("%lb ", start_level.token_masks[k]); - printf(" }\n"); - - printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE "); + size_t i = 0; + while(!issep(str[i++])); + return i-1; } 
-int tokenize_string(char *string, token *t, size_t t_len) +static inline char *substring(char *str, size_t sub_end) { - size_t ntokens = 0; - size_t i = 0; - size_t off = 0; - - while(i < strlen(string) + 1) { - uint64_t token_mask = ~(uint64_t)0; - while(1) { - struct level *l = (i-off == 0) - ? &start_level - : &bit_to_ptr[char_to_bit[string[i-1]]][i-1-off]; - - uint8_t bit = (separators[string[i]]) - ? 1 - : char_to_bit[string[i]]; - - if((l->bit_mask & (1 << bit)) == 0) { - token_mask = 0; - while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++; - break; - } - - uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); - token_mask &= l->token_masks[idx]; - - if(bit == 1) break; - i++; - } - - // BUG: not checking of ntokens is in t_len - if(token_mask) - t[ntokens++] = __builtin_ctz(token_mask); - else if(off != i) t[ntokens++] = TOKEN_NONE; - - if(separators[string[i]]) - t[ntokens++] = separators[string[i]]; - - off = ++i; - } - - return ntokens; + static char sub[128]; + if(sub_end+1 > sizeof(sub)) return NULL; + + sub[sub_end+1] = '\0'; + return memcpy(sub, str, sub_end); } -void free_lextables(void) +static char *next_token(char *str) { - free(start_level.token_masks); - for(size_t i = 0; i < MAPPED_CHARS; i++) { - for(size_t j = 0; j < num_levels; j++) { - if(bit_to_ptr[i][j].token_masks) - free(bit_to_ptr[i][j].token_masks); + size_t off = 0; + char c0 = str[0]; + + if(c0 == '\0') return NULL; + if(isspace(c0)) return next_token(str+1); + else { + off = tillsep(str); + if(off == 0) { // sep + switch(str[off++]) { + case '{': tok.sym = LPAREN; break; + case '}': tok.sym = RPAREN; break; + case '"': + while(str[off] != '"') if(str[off++] == '\0') return NULL; + tok.sym = STRING; + tok.str = strdup(substring(str, off)); + } + } else if(isalpha(c0)) { // iden + tok.sym = IDEN; + tok.iden = strdup(substring(str, off)); + } else if(c0 >= '0' && c0 <= '9') { // num + tok.sym = NUM; + tok.i = atoi(substring(str, off)); } - free(bit_to_ptr[i]); } + + 
return str+off; } -#ifdef _LEXER_STANDALONE - -// exit with a given code, default is 1 -#define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1) - int main(void) { - compile_lextables() && DIE(); - - token t[120] = {0}; - size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120); - - ntokens || DIE(10); - - for(size_t i = 0; i < ntokens; i++) - printf("%s\n", token_to_string[t[i]]); - - print_lextables(); - free_lextables(); + char *str = "blah 0 1 443 test{here}13}{1\"fdlkfjakl{fher} fdsfj\" here { {tok {"; + while((str = next_token(str))) + switch(tok.sym) { + case LPAREN: printf("{ "); break; + case RPAREN: printf("} "); break; + case STRING: printf("\"%s\" ", tok.str); free(tok.str); break; + case IDEN: printf("'%s' ", tok.iden); free(tok.iden); break; + case NUM: printf("%d ", tok.i); break; + } + + printf("\n"); return 0; } - -#endif diff --git a/parts/toklist.h b/parts/toklist.h index a6fe68d..f32cd25 100644 --- a/parts/toklist.h +++ b/parts/toklist.h @@ -3,7 +3,12 @@ #include "symbol.h" -/*extern*/ symbol toklist_eat(); -/*extern*/ symbol toklist_peek(); +struct token; + +// /*extern*/ struct token *toklist_eat(); +// /*extern*/ struct token *toklist_peek(); + +symbol toklist_eat(); +symbol toklist_peek(); #endif |