aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkartofen <mladenovnasko0@gmail.com>2025-07-08 00:21:07 +0300
committerkartofen <mladenovnasko0@gmail.com>2025-07-08 00:21:07 +0300
commit919611902c39fd70afe1162883ee6bfd34f2642e (patch)
tree983528d33ed57e3ebc7792971507e6f9fff383ff
parenta6cb97af1a2a5491f54fcfa7064641dc0aafd898 (diff)
generate parser options and lexer and dict separation
-rwxr-xr-xbuild.sh8
-rw-r--r--clr-table.c2
-rw-r--r--demos/generate-parser.c66
-rw-r--r--dict.c188
-rw-r--r--lexer.c272
-rw-r--r--parts/toklist.h9
6 files changed, 309 insertions, 236 deletions
diff --git a/build.sh b/build.sh
index 0e73e1e..91c7fdc 100755
--- a/build.sh
+++ b/build.sh
@@ -27,7 +27,8 @@ function leak
log valgrind --leak-check=full --show-leak-kinds=all -s bin/$1 $2
}
-# cc lexer -D_LEXER_STANDALONE
+cc dict -D_DICT_STANDALONE
+cc lexer -D_LEXER_STANDALONE
# cc recursive/recursive-ascent
# cc recursive/recursive-ascent-descent
# cc util-tables -D_UTIL_TABLES_STANDALONE
@@ -38,7 +39,8 @@ function leak
# cc demos/instant-parser
-# leak lexer
+leak lexer
+leak dict
# leak recursive-ascent
# leak recursive-ascent-descent
# leak util-tables
@@ -59,6 +61,6 @@ shared slr-table
shared clr-table
shared clr-table -D_LAZY_LALR lalr-table
-leak "generate-parser -t slr-table bin/arithmetic-defs.so" > bin/generated.c
+leak "generate-parser -t lalr-table bin/arithmetic-defs.so"
cc demos/sample-files/parser-skeleton "" parser # this includes bin/generated.c
leak parser "0-1+(1+0)-1+0"
diff --git a/clr-table.c b/clr-table.c
index 001a45c..892facf 100644
--- a/clr-table.c
+++ b/clr-table.c
@@ -90,7 +90,7 @@ static size_t itemset_handle(struct item *set, size_t nset)
if(!item_core_eq(&seen_sets[i].items[k], &set[j])) _same_core = 0;
if(!_same_core) break;
}
- if(_same_core) { use_state = seen_sets[i].state; break; }
+ if(_same_core) { use_state = seen_sets[i].state; /*break;*/ }
#endif
}
diff --git a/demos/generate-parser.c b/demos/generate-parser.c
index 5766223..7856db6 100644
--- a/demos/generate-parser.c
+++ b/demos/generate-parser.c
@@ -1,9 +1,13 @@
#include <stdio.h>
#include <stdlib.h>
#include <dlfcn.h>
+#include <string.h>
+#include <unistd.h> // getopt
+#include <assert.h>
#define DEFUALT_PATH "./bin"
#define DEFUALT_TYPE "lalr-table"
+#define DEFAULT_OUTPUT "bin/a"
#include "parts/symbol.h"
size_t total_symbols;
@@ -43,32 +47,52 @@ void *xdlsym(void *handle, char *sym)
char *modpath(char *name)
{
static char fullpath[128];
- char *path = getenv("GENERATE_PARSER_PATH");
- if(!path) path = DEFUALT_PATH;
- snprintf(fullpath, 128, "%s/%s.so", path, name);
+ // TODO: search the GENERATE_PARSER_PATH env var
+ char *path = DEFUALT_PATH;
+
+ assert(snprintf(fullpath, 128, "%s/%s.so", path, name) < 128);
return fullpath;
}
+char *add_extension(char *str, char *ext)
+{
+ static char full[128];
+ assert((strlen(str) + strlen(ext) + 1) <= 128);
+ memcpy(full, str, strlen(str)+1);
+ return strcat(full, ext);
+}
+
+void set_stdout(char *filename)
+{
+ if(!filename) filename = "/dev/tty";
+ assert(freopen(filename, "a+", stdout));
+}
+
int main(int argc, char **argv)
{
- if(argc < 2) return -1;
-
- void *table_handle;
- void *def_handle;
-
- if(argc == 2) {
- table_handle = dlopen(modpath(DEFUALT_TYPE), RTLD_LAZY);
- if(!table_handle) { fputs(dlerror(), stderr); return 1; }
- def_handle = dlopen(argv[1], RTLD_LAZY);
- if(!def_handle) { fputs(dlerror(), stderr); return 1; }
- } else if(argc == 4 &&
- argv[1][0] == '-' && argv[1][1] == 't') {
- table_handle = dlopen(modpath(argv[2]), RTLD_LAZY);
- if(!table_handle) { fputs(dlerror(), stderr); return 1; }
- def_handle = dlopen(argv[3], RTLD_LAZY);
- if(!def_handle) { fputs(dlerror(), stderr); return 1; }
- } else return -1;
+ char *type = DEFUALT_TYPE;
+ char *output_path = DEFAULT_OUTPUT;
+
+ int opt;
+ while((opt = getopt(argc, argv, "t:o:")) != -1) {
+ switch(opt) {
+ case 't': type = optarg; break;
+ case 'o': output_path = optarg; break;
+ default: fprintf(stderr, "ERROR: Unknown options '%c'\n", optopt);
+ return 1;
+ }
+ }
+
+ if(optind == argc) {
+ fprintf(stderr, "ERROR: No input file\n");
+ return 1;
+ }
+
+ void *table_handle = dlopen(modpath(type), RTLD_LAZY);
+ if(!table_handle) { fputs(dlerror(), stderr); return 1; }
+ void *def_handle = dlopen(argv[optind], RTLD_LAZY);
+ if(!def_handle) { fputs(dlerror(), stderr); return 1; }
GET_VARIABLE(table, table_handle);
GET_VARIABLE(table_states, table_handle);
@@ -91,6 +115,7 @@ int main(int argc, char **argv)
goto cleanup;
}
+ set_stdout(add_extension(output_path, ".c"));
printf("size_t total_symbols = %zu;\n", total_symbols);
printf("IMPLEMENT_FUNCPTR(int, symbol_is_valid, (symbol s)) {return s < total_symbols;}\n");
@@ -121,6 +146,7 @@ int main(int argc, char **argv)
for(size_t i = 0; i < total_productions; i++)
printf("__prod%zu_action, ", i);
printf("};");
+ set_stdout(NULL);
cleanup:
table_free();
diff --git a/dict.c b/dict.c
new file mode 100644
index 0000000..45ddf38
--- /dev/null
+++ b/dict.c
@@ -0,0 +1,188 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+extern const struct string_token {
+ char *s;
+ int t;
+} strings[];
+extern const size_t nstrings;
+
+extern const uint8_t char_to_bit[];
+
+struct level {
+ uint64_t bit_mask;
+ uint64_t *token_masks;
+};
+
+#define MAPPED_CHARS 32
+static struct level start_level = {0};
+static struct level *bit_to_ptr[MAPPED_CHARS] = {0};
+static size_t num_levels;
+
+#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]])
+#define popcount(x) (__builtin_popcount(x))
+
+int dict_compile(void)
+{
+ // max number of levels
+ for(size_t i = 0; i < nstrings; i++)
+ if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s);
+
+ // allocated levels
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i]));
+ if(!bit_to_ptr[i]) return 1;
+ }
+
+ // BUG: everything is repeated for the start_level
+
+ // populate bit_masks
+ for(size_t i = 0; i < nstrings; i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
+ uint8_t bit = char_to_bit[strings[i].s[j]];
+
+ l->bit_mask |= 1 << bit;
+ l = &bit_to_ptr[bit][j];
+ }
+ }
+
+ // allocate token_masks
+ // NOTE: start_level alloc'd many times, so realloc is used
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < num_levels + 1; j++) {
+ l->token_masks = realloc(l->token_masks, popcount(l->bit_mask)
+ * sizeof(*l->token_masks));
+ memset(l->token_masks, 0, popcount(l->bit_mask)
+ * sizeof(*l->token_masks));
+
+ l = &bit_to_ptr[i][j];
+ }
+ }
+
+ // populate token_masks
+ for(size_t i = 0; i < nstrings; i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
+ uint8_t bit = char_to_bit[strings[i].s[j]];
+ uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
+
+ l->token_masks[idx] |= 1 << strings[i].t;
+ l = &bit_to_ptr[bit][j];
+ }
+ }
+
+ return 0;
+}
+
+void dict_print(void)
+{
+ for(size_t i = 0; i < 256; i++)
+ for(size_t j = 0; j < num_levels; j++)
+ if(CHAR_TO_PTR(i)[j].bit_mask) {
+ printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask);
+
+ printf("{ ");
+ for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++)
+ printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]);
+ printf(" }\n");
+ }
+
+
+ printf(" %32lb ", start_level.bit_mask);
+ printf("{ ");
+ for(size_t k = 0; k < popcount(start_level.bit_mask); k++)
+ printf("%lb ", start_level.token_masks[k]);
+ printf(" }\n");
+
+ printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE ");
+}
+
+void dict_free(void)
+{
+ free(start_level.token_masks);
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ for(size_t j = 0; j < num_levels; j++) {
+ if(bit_to_ptr[i][j].token_masks)
+ free(bit_to_ptr[i][j].token_masks);
+ }
+ free(bit_to_ptr[i]);
+ }
+}
+
+int dict_check(char *string)
+{
+ uint64_t token_mask = ~(uint64_t)0;
+
+ for(size_t i = 0; i < strlen(string) + 1; i++) {
+ struct level *l = (i == 0)
+ ? &start_level
+ : &bit_to_ptr[char_to_bit[string[i-1]]][i-1];
+
+ uint8_t bit = char_to_bit[string[i]];
+
+ if((l->bit_mask & (1 << bit)) == 0) return -1;
+
+ uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
+ token_mask &= l->token_masks[idx];
+ }
+
+ if(token_mask) return __builtin_ctz(token_mask);
+ else return -1;
+}
+
+#ifdef _DICT_STANDALONE
+
+#define TOKENS(X) \
+ X(TOKEN_NONE) \
+ X(TOKEN_TEST) \
+ X(TOKEN_RETARDED) \
+ X(TOKEN_WOW) \
+ X(TOKEN_TITS) \
+ X(TOKEN_RPAREN) \
+ X(TOKEN_LPAREN)
+
+#define TOKEN_ENUM(a) a,
+#define TOKEN_STRING(a) #a,
+
+enum token {
+ TOKENS(TOKEN_ENUM)
+};
+
+const char * const token_to_string[] = {
+ TOKENS(TOKEN_STRING)
+};
+const struct string_token strings[] = {
+ {"test", TOKEN_TEST},
+ {"retarded", TOKEN_RETARDED},
+ {"wow", TOKEN_WOW},
+ {"tits", TOKEN_TITS},
+};
+
+const size_t nstrings = 4;
+
+const uint8_t char_to_bit[256] = {
+ ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7,
+ ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13,
+ ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19,
+ ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25,
+ ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1
+};
+
+int main(void)
+{
+ dict_compile();
+
+ int t;
+ if((t = dict_check("tits")) >= 0) printf("%s\n", token_to_string[t]);
+ if((t = dict_check("retarded")) >= 0) printf("%s\n", token_to_string[t]);
+ if((t = dict_check("test2")) >= 0) printf("%s\n", token_to_string[t]);
+ if((t = dict_check("tes")) >= 0) printf("%s\n", token_to_string[t]);
+
+ dict_free();
+}
+
+#endif
diff --git a/lexer.c b/lexer.c
index 2fca9a7..7ebf2e7 100644
--- a/lexer.c
+++ b/lexer.c
@@ -1,233 +1,85 @@
#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
#include <stdlib.h>
-#include <assert.h>
-
-// TODO: - make it more memory efficient by allocating only
-// the need amount of level for each letter
-// - add more than 64 bits "types" for more tokens
-// and more characters
-// - add easier way to write chars to bits (maybe a singe string)
-
-#define ARR_LEN(arr) (sizeof(arr) / sizeof(*arr))
-
-typedef int token;
-extern const char *const token_to_string[];
-extern const struct string_token { char *s; token t;} strings[];
-extern const token separators[];
-
-#ifdef _LEXER_STANDALONE
-
-#define TOKENS(X) \
- X(TOKEN_NONE) \
- X(TOKEN_TEST) \
- X(TOKEN_RETARDED) \
- X(TOKEN_WOW) \
- X(TOKEN_TITS) \
- X(TOKEN_RPAREN) \
- X(TOKEN_LPAREN)
-
-#define TOKEN_ENUM(a) a,
-#define TOKEN_STRING(a) #a,
-
-enum token {
- TOKENS(TOKEN_ENUM)
-};
-
-const char * const token_to_string[] = {
- TOKENS(TOKEN_STRING)
-};
-
-const struct string_token {
- char *s;
- token t;
-} strings[] = {
- {"test", TOKEN_TEST},
- {"retarded", TOKEN_RETARDED},
- {"wow", TOKEN_WOW},
- {"tits", TOKEN_TITS},
-};
-
-const token separators[] = {['{'] = TOKEN_LPAREN, ['}'] = TOKEN_RPAREN};
-
-#endif
-
-const uint8_t char_to_bit[256] = {
- ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7,
- ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13,
- ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19,
- ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25,
- ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1
-};
+#include <ctype.h>
+#include <string.h>
-struct level {
- uint64_t bit_mask;
- uint64_t *token_masks;
+struct token {
+ enum symbol {
+ LPAREN, RPAREN, STRING, IDEN, NUM
+ } sym;
+
+ union {
+ char *iden;
+ int i;
+ char *str;
+ };
};
-#define MAPPED_CHARS 32
-static struct level start_level = {0};
-static struct level *bit_to_ptr[MAPPED_CHARS] = {0};
-static size_t num_levels;
-
-#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]])
-
-#define popcount(x) (__builtin_popcount(x))
+static struct token tok;
-int compile_lextables(void)
+static inline int issep(char c)
{
- // max number of levels
- for(size_t i = 0; i < ARR_LEN(strings); i++)
- if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s);
-
- // allocated levels
- for(size_t i = 0; i < MAPPED_CHARS; i++) {
- bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i]));
- if(!bit_to_ptr[i]) return 1;
- }
-
- // BUG: everything is repeated for the start_level
-
- // populate bit_masks
- for(size_t i = 0; i < ARR_LEN(strings); i++) {
- struct level *l = &start_level;
- for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
- uint8_t bit = char_to_bit[strings[i].s[j]];
-
- l->bit_mask |= 1 << bit;
- l = &bit_to_ptr[bit][j];
- }
- }
-
- // allocate token_masks
- // NOTE: start_level alloc'd many times, so realloc is used
- for(size_t i = 0; i < MAPPED_CHARS; i++) {
- struct level *l = &start_level;
- for(size_t j = 0; j < num_levels + 1; j++) {
- l->token_masks = realloc(l->token_masks, popcount(l->bit_mask)
- * sizeof(*l->token_masks));
- memset(l->token_masks, 0, popcount(l->bit_mask)
- * sizeof(*l->token_masks));
-
- l = &bit_to_ptr[i][j];
- }
- }
-
- // populate token_masks
- for(size_t i = 0; i < ARR_LEN(strings); i++) {
- struct level *l = &start_level;
- for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
- uint8_t bit = char_to_bit[strings[i].s[j]];
- uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
-
- l->token_masks[idx] |= 1 << strings[i].t;
- l = &bit_to_ptr[bit][j];
- }
- }
-
- return 0;
+ return isspace(c) || c == '\0' || c == '}' || c == '{' || c == '"';
}
-
-void print_lextables(void)
+
+static inline int tillsep(char *str)
{
- for(size_t i = 0; i < 256; i++)
- for(size_t j = 0; j < num_levels; j++)
- if(CHAR_TO_PTR(i)[j].bit_mask) {
- printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask);
-
- printf("{ ");
- for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++)
- printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]);
- printf(" }\n");
- }
-
-
- printf(" %32lb ", start_level.bit_mask);
- printf("{ ");
- for(size_t k = 0; k < popcount(start_level.bit_mask); k++)
- printf("%lb ", start_level.token_masks[k]);
- printf(" }\n");
-
- printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE ");
+ size_t i = 0;
+ while(!issep(str[i++]));
+ return i-1;
}
-int tokenize_string(char *string, token *t, size_t t_len)
+static inline char *substring(char *str, size_t sub_end)
{
- size_t ntokens = 0;
- size_t i = 0;
- size_t off = 0;
-
- while(i < strlen(string) + 1) {
- uint64_t token_mask = ~(uint64_t)0;
- while(1) {
- struct level *l = (i-off == 0)
- ? &start_level
- : &bit_to_ptr[char_to_bit[string[i-1]]][i-1-off];
-
- uint8_t bit = (separators[string[i]])
- ? 1
- : char_to_bit[string[i]];
-
- if((l->bit_mask & (1 << bit)) == 0) {
- token_mask = 0;
- while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++;
- break;
- }
-
- uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
- token_mask &= l->token_masks[idx];
-
- if(bit == 1) break;
- i++;
- }
-
- // BUG: not checking of ntokens is in t_len
- if(token_mask)
- t[ntokens++] = __builtin_ctz(token_mask);
- else if(off != i) t[ntokens++] = TOKEN_NONE;
-
- if(separators[string[i]])
- t[ntokens++] = separators[string[i]];
-
- off = ++i;
- }
-
- return ntokens;
+ static char sub[128];
+ if(sub_end+1 > sizeof(sub)) return NULL;
+
+ sub[sub_end+1] = '\0';
+ return memcpy(sub, str, sub_end);
}
-void free_lextables(void)
+static char *next_token(char *str)
{
- free(start_level.token_masks);
- for(size_t i = 0; i < MAPPED_CHARS; i++) {
- for(size_t j = 0; j < num_levels; j++) {
- if(bit_to_ptr[i][j].token_masks)
- free(bit_to_ptr[i][j].token_masks);
+ size_t off = 0;
+ char c0 = str[0];
+
+ if(c0 == '\0') return NULL;
+ if(isspace(c0)) return next_token(str+1);
+ else {
+ off = tillsep(str);
+ if(off == 0) { // sep
+ switch(str[off++]) {
+ case '{': tok.sym = LPAREN; break;
+ case '}': tok.sym = RPAREN; break;
+ case '"':
+ while(str[off] != '"') if(str[off++] == '\0') return NULL;
+ tok.sym = STRING;
+ tok.str = strdup(substring(str, off));
+ }
+ } else if(isalpha(c0)) { // iden
+ tok.sym = IDEN;
+ tok.iden = strdup(substring(str, off));
+ } else if(c0 >= '0' && c0 <= '9') { // num
+ tok.sym = NUM;
+ tok.i = atoi(substring(str, off));
}
- free(bit_to_ptr[i]);
}
+
+ return str+off;
}
-#ifdef _LEXER_STANDALONE
-
-// exit with a given code, default is 1
-#define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1)
-
int main(void)
{
- compile_lextables() && DIE();
-
- token t[120] = {0};
- size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120);
-
- ntokens || DIE(10);
-
- for(size_t i = 0; i < ntokens; i++)
- printf("%s\n", token_to_string[t[i]]);
-
- print_lextables();
- free_lextables();
+ char *str = "blah 0 1 443 test{here}13}{1\"fdlkfjakl{fher} fdsfj\" here { {tok {";
+ while((str = next_token(str)))
+ switch(tok.sym) {
+ case LPAREN: printf("{ "); break;
+ case RPAREN: printf("} "); break;
+ case STRING: printf("\"%s\" ", tok.str); free(tok.str); break;
+ case IDEN: printf("'%s' ", tok.iden); free(tok.iden); break;
+ case NUM: printf("%d ", tok.i); break;
+ }
+
+ printf("\n");
return 0;
}
-
-#endif
diff --git a/parts/toklist.h b/parts/toklist.h
index a6fe68d..f32cd25 100644
--- a/parts/toklist.h
+++ b/parts/toklist.h
@@ -3,7 +3,12 @@
#include "symbol.h"
-/*extern*/ symbol toklist_eat();
-/*extern*/ symbol toklist_peek();
+struct token;
+
+// /*extern*/ struct token *toklist_eat();
+// /*extern*/ struct token *toklist_peek();
+
+symbol toklist_eat();
+symbol toklist_peek();
#endif