From e911e95f697b0eb48ed4e68cb2586ffb0dc11341 Mon Sep 17 00:00:00 2001 From: kartofen Date: Tue, 8 Jul 2025 19:14:02 +0300 Subject: move things around --- build.sh | 2 +- dict.c | 188 ----------------------------------------------------------- util/dict.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ util/dict.h | 33 +++++++++++ 4 files changed, 225 insertions(+), 189 deletions(-) delete mode 100644 dict.c create mode 100644 util/dict.c create mode 100644 util/dict.h diff --git a/build.sh b/build.sh index 9f1899b..8150542 100755 --- a/build.sh +++ b/build.sh @@ -27,7 +27,7 @@ function leak log valgrind --leak-check=full --show-leak-kinds=all -s bin/$1 $2 } -# cc dict -D_DICT_STANDALONE +# cc util/dict -D_DICT_STANDALONE # leak dict # cc demos/lexer -D_LEXER_STANDALONE diff --git a/dict.c b/dict.c deleted file mode 100644 index 45ddf38..0000000 --- a/dict.c +++ /dev/null @@ -1,188 +0,0 @@ -#include -#include -#include -#include - -extern const struct string_token { - char *s; - int t; -} strings[]; -extern const size_t nstrings; - -extern const uint8_t char_to_bit[]; - -struct level { - uint64_t bit_mask; - uint64_t *token_masks; -}; - -#define MAPPED_CHARS 32 -static struct level start_level = {0}; -static struct level *bit_to_ptr[MAPPED_CHARS] = {0}; -static size_t num_levels; - -#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]]) -#define popcount(x) (__builtin_popcount(x)) - -int dict_compile(void) -{ - // max number of levels - for(size_t i = 0; i < nstrings; i++) - if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s); - - // allocated levels - for(size_t i = 0; i < MAPPED_CHARS; i++) { - bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i])); - if(!bit_to_ptr[i]) return 1; - } - - // BUG: everything is repeated for the start_level - - // populate bit_masks - for(size_t i = 0; i < nstrings; i++) { - struct level *l = &start_level; - for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { - uint8_t bit = char_to_bit[strings[i].s[j]]; - - l->bit_mask |= 1 << bit; - l = &bit_to_ptr[bit][j]; - } - } - - // allocate token_masks - // NOTE: start_level alloc'd many times, so realloc is used - for(size_t i = 0; i < MAPPED_CHARS; i++) { - struct level *l = &start_level; - for(size_t j = 0; j < num_levels + 1; j++) { - l->token_masks = realloc(l->token_masks, popcount(l->bit_mask) - * sizeof(*l->token_masks)); - memset(l->token_masks, 0, popcount(l->bit_mask) - * sizeof(*l->token_masks)); - - l = &bit_to_ptr[i][j]; - } - } - - // populate token_masks - for(size_t i = 0; i < nstrings; i++) { - struct level *l = &start_level; - for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { - uint8_t bit = char_to_bit[strings[i].s[j]]; - uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); - - l->token_masks[idx] |= 1 << strings[i].t; - l = &bit_to_ptr[bit][j]; - } - } - - return 0; -} - -void dict_print(void) -{ - for(size_t i = 0; i < 256; i++) - for(size_t j = 0; j < num_levels; j++) - if(CHAR_TO_PTR(i)[j].bit_mask) { - printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); - - printf("{ "); - for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++) - printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]); - printf(" }\n"); - } - - - printf(" %32lb ", start_level.bit_mask); - printf("{ "); - for(size_t k = 0; k < popcount(start_level.bit_mask); k++) - printf("%lb ", start_level.token_masks[k]); - printf(" }\n"); - - printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE "); -} - -void dict_free(void) -{ - free(start_level.token_masks); - for(size_t i = 0; i < MAPPED_CHARS; i++) { - for(size_t j = 0; j < num_levels; j++) { - if(bit_to_ptr[i][j].token_masks) - free(bit_to_ptr[i][j].token_masks); - } - free(bit_to_ptr[i]); - } -} - -int dict_check(char *string) -{ - uint64_t token_mask = ~(uint64_t)0; - - for(size_t i = 0; i < strlen(string) + 1; i++) { - struct level *l = (i == 0) - ? &start_level - : &bit_to_ptr[char_to_bit[string[i-1]]][i-1]; - - uint8_t bit = char_to_bit[string[i]]; - - if((l->bit_mask & (1 << bit)) == 0) return -1; - - uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); - token_mask &= l->token_masks[idx]; - } - - if(token_mask) return __builtin_ctz(token_mask); - else return -1; -} - -#ifdef _DICT_STANDALONE - -#define TOKENS(X) \ - X(TOKEN_NONE) \ - X(TOKEN_TEST) \ - X(TOKEN_RETARDED) \ - X(TOKEN_WOW) \ - X(TOKEN_TITS) \ - X(TOKEN_RPAREN) \ - X(TOKEN_LPAREN) - -#define TOKEN_ENUM(a) a, -#define TOKEN_STRING(a) #a, - -enum token { - TOKENS(TOKEN_ENUM) -}; - -const char * const token_to_string[] = { - TOKENS(TOKEN_STRING) -}; -const struct string_token strings[] = { - {"test", TOKEN_TEST}, - {"retarded", TOKEN_RETARDED}, - {"wow", TOKEN_WOW}, - {"tits", TOKEN_TITS}, -}; - -const size_t nstrings = 4; - -const uint8_t char_to_bit[256] = { - ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7, - ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13, - ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19, - ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25, - ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1 -}; - -int main(void) -{ - dict_compile(); - - int t; - if((t = dict_check("tits")) >= 0) printf("%s\n", token_to_string[t]); - if((t = dict_check("retarded")) >= 0) printf("%s\n", token_to_string[t]); - if((t = dict_check("test2")) >= 0) printf("%s\n", token_to_string[t]); - if((t = dict_check("tes")) >= 0) printf("%s\n", token_to_string[t]); - - dict_free(); -} - -#endif diff --git a/util/dict.c b/util/dict.c new file mode 100644 index 0000000..56ed6c9 --- /dev/null +++ b/util/dict.c @@ -0,0 +1,191 @@ +#include +#include +#include +#include + +#include "util/dict.h" + +#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]]) +#define popcount(x) (__builtin_popcount(x)) + +#define start_level d->start_level +#define bit_to_ptr d->bit_to_ptr +#define num_levels d->num_levels +#define strings d->strings +#define nstrings d->nstrings +#define char_to_bit d->char_to_bit + +int dict_compile(struct dict *d) +{ + // max number of levels + for(size_t i = 0; i < nstrings; i++) + if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s); + + // allocated levels + for(size_t i = 0; i < MAPPED_CHARS; i++) { + bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i])); + if(!bit_to_ptr[i]) return 1; + } + + // BUG: everything is repeated for the start_level + + // populate bit_masks + for(size_t i = 0; i < nstrings; i++) { + struct level *l = &start_level; + for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { + uint8_t bit = char_to_bit[strings[i].s[j]]; + + l->bit_mask |= 1 << bit; + l = &bit_to_ptr[bit][j]; + } + } + + // allocate token_masks + // NOTE: start_level alloc'd many times, so realloc is used + for(size_t i = 0; i < MAPPED_CHARS; i++) { + struct level *l = &start_level; + for(size_t j = 0; j < num_levels + 1; j++) { + l->token_masks = realloc(l->token_masks, popcount(l->bit_mask) + * sizeof(*l->token_masks)); + memset(l->token_masks, 0, popcount(l->bit_mask) + * sizeof(*l->token_masks)); + + l = &bit_to_ptr[i][j]; + } + } + + // populate token_masks + for(size_t i = 0; i < nstrings; i++) { + struct level *l = &start_level; + for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { + uint8_t bit = char_to_bit[strings[i].s[j]]; + uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); + + l->token_masks[idx] |= 1 << strings[i].t; + l = &bit_to_ptr[bit][j]; + } + } + + return 0; +} + +void dict_print(struct dict *d) +{ + for(size_t i = 0; i < 256; i++) + for(size_t j = 0; j < num_levels; j++) + if(CHAR_TO_PTR(i)[j].bit_mask) { + printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); + + printf("{ "); + for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++) + printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]); + printf(" }\n"); + } + + + printf(" %32lb ", start_level.bit_mask); + printf("{ "); + for(size_t k = 0; k < popcount(start_level.bit_mask); k++) + printf("%lb ", start_level.token_masks[k]); + printf(" }\n"); + + printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE "); +} + +void dict_free(struct dict *d) +{ + free(start_level.token_masks); + for(size_t i = 0; i < MAPPED_CHARS; i++) { + for(size_t j = 0; j < num_levels; j++) { + if(bit_to_ptr[i][j].token_masks) + free(bit_to_ptr[i][j].token_masks); + } + free(bit_to_ptr[i]); + } +} + +int dict_check(struct dict *d, char *string) +{ + uint64_t token_mask = ~(uint64_t)0; + + for(size_t i = 0; i < strlen(string) + 1; i++) { + struct level *l = (i == 0) + ? &start_level + : &bit_to_ptr[char_to_bit[string[i-1]]][i-1]; + + uint8_t bit = char_to_bit[string[i]]; + + if((l->bit_mask & (1 << bit)) == 0) return -1; + + uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); + token_mask &= l->token_masks[idx]; + } + + if(token_mask) return __builtin_ctz(token_mask); + else return -1; +} + +#undef start_level +#undef bit_to_ptr +#undef num_levels +#undef strings +#undef nstrings +#undef char_to_bit + +#ifdef _DICT_STANDALONE + +#define TOKENS(X) \ + X(TOKEN_NONE) \ + X(TOKEN_TEST) \ + X(TOKEN_RETARDED) \ + X(TOKEN_WOW) \ + X(TOKEN_TITS) \ + X(TOKEN_RPAREN) \ + X(TOKEN_LPAREN) + +#define TOKEN_ENUM(a) a, +#define TOKEN_STRING(a) #a, + +enum token { + TOKENS(TOKEN_ENUM) +}; + +const char * const token_to_string[] = { + TOKENS(TOKEN_STRING) +}; +struct string_token *strings = (struct string_token[]){ + {"test", TOKEN_TEST}, + {"retarded", TOKEN_RETARDED}, + {"wow", TOKEN_WOW}, + {"tits", TOKEN_TITS}, +}; + +const size_t nstrings = 4; + +const uint8_t char_to_bit[256] = { + ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7, + ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13, + ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19, + ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25, + ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1 +}; + +int main(void) +{ + struct dict d = {0}; + d.strings = strings; + d.nstrings = nstrings; + d.char_to_bit = char_to_bit; + + dict_compile(&d); + + int t; + if((t = dict_check(&d, "tits")) >= 0) printf("%s\n", token_to_string[t]); + if((t = dict_check(&d, "retarded")) >= 0) printf("%s\n", token_to_string[t]); + if((t = dict_check(&d, "test2")) >= 0) printf("%s\n", token_to_string[t]); + if((t = dict_check(&d, "tes")) >= 0) printf("%s\n", token_to_string[t]); + + dict_free(&d); +} + +#endif diff --git a/util/dict.h b/util/dict.h new file mode 100644 index 0000000..ecaf53a --- /dev/null +++ b/util/dict.h @@ -0,0 +1,33 @@ +#ifndef DICT_H +#define DICT_H + +struct level { + uint64_t bit_mask; + uint64_t *token_masks; +}; + +#ifndef MAPPED_CHARS +#define MAPPED_CHARS 32 +#endif + +struct dict { + // parameters for compilation + struct string_token { + char *s; + int t; + } *strings; + size_t nstrings; + uint8_t *char_to_bit; + + // result of compilation + struct level start_level; + struct level *bit_to_ptr[MAPPED_CHARS]; + size_t num_levels; +}; + +int dict_compile(struct dict *d); +void dict_free(struct dict *d); +void dict_print(struct dict *d); +int dict_check(struct dict *d, char *string); + +#endif -- cgit v1.2.3