aboutsummaryrefslogtreecommitdiff
path: root/util/dict.c
diff options
context:
space:
mode:
Diffstat (limited to 'util/dict.c')
-rw-r--r--util/dict.c191
1 files changed, 191 insertions, 0 deletions
diff --git a/util/dict.c b/util/dict.c
new file mode 100644
index 0000000..56ed6c9
--- /dev/null
+++ b/util/dict.c
@@ -0,0 +1,191 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "util/dict.h"
+
+#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]])
+#define popcount(x) (__builtin_popcount(x))
+
+#define start_level d->start_level
+#define bit_to_ptr d->bit_to_ptr
+#define num_levels d->num_levels
+#define strings d->strings
+#define nstrings d->nstrings
+#define char_to_bit d->char_to_bit
+
+int dict_compile(struct dict *d)
+{
+ // max number of levels
+ for(size_t i = 0; i < nstrings; i++)
+ if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s);
+
+ // allocated levels
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i]));
+ if(!bit_to_ptr[i]) return 1;
+ }
+
+ // BUG: everything is repeated for the start_level
+
+ // populate bit_masks
+ for(size_t i = 0; i < nstrings; i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
+ uint8_t bit = char_to_bit[strings[i].s[j]];
+
+ l->bit_mask |= 1 << bit;
+ l = &bit_to_ptr[bit][j];
+ }
+ }
+
+ // allocate token_masks
+ // NOTE: start_level alloc'd many times, so realloc is used
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < num_levels + 1; j++) {
+ l->token_masks = realloc(l->token_masks, popcount(l->bit_mask)
+ * sizeof(*l->token_masks));
+ memset(l->token_masks, 0, popcount(l->bit_mask)
+ * sizeof(*l->token_masks));
+
+ l = &bit_to_ptr[i][j];
+ }
+ }
+
+ // populate token_masks
+ for(size_t i = 0; i < nstrings; i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
+ uint8_t bit = char_to_bit[strings[i].s[j]];
+ uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
+
+ l->token_masks[idx] |= 1 << strings[i].t;
+ l = &bit_to_ptr[bit][j];
+ }
+ }
+
+ return 0;
+}
+
+void dict_print(struct dict *d)
+{
+ for(size_t i = 0; i < 256; i++)
+ for(size_t j = 0; j < num_levels; j++)
+ if(CHAR_TO_PTR(i)[j].bit_mask) {
+ printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask);
+
+ printf("{ ");
+ for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++)
+ printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]);
+ printf(" }\n");
+ }
+
+
+ printf(" %32lb ", start_level.bit_mask);
+ printf("{ ");
+ for(size_t k = 0; k < popcount(start_level.bit_mask); k++)
+ printf("%lb ", start_level.token_masks[k]);
+ printf(" }\n");
+
+ printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE ");
+}
+
+void dict_free(struct dict *d)
+{
+ free(start_level.token_masks);
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ for(size_t j = 0; j < num_levels; j++) {
+ if(bit_to_ptr[i][j].token_masks)
+ free(bit_to_ptr[i][j].token_masks);
+ }
+ free(bit_to_ptr[i]);
+ }
+}
+
+int dict_check(struct dict *d, char *string)
+{
+ uint64_t token_mask = ~(uint64_t)0;
+
+ for(size_t i = 0; i < strlen(string) + 1; i++) {
+ struct level *l = (i == 0)
+ ? &start_level
+ : &bit_to_ptr[char_to_bit[string[i-1]]][i-1];
+
+ uint8_t bit = char_to_bit[string[i]];
+
+ if((l->bit_mask & (1 << bit)) == 0) return -1;
+
+ uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
+ token_mask &= l->token_masks[idx];
+ }
+
+ if(token_mask) return __builtin_ctz(token_mask);
+ else return -1;
+}
+
+#undef start_level
+#undef bit_to_ptr
+#undef num_levels
+#undef strings
+#undef nstrings
+#undef char_to_bit
+
+#ifdef _DICT_STANDALONE
+
+#define TOKENS(X) \
+ X(TOKEN_NONE) \
+ X(TOKEN_TEST) \
+ X(TOKEN_RETARDED) \
+ X(TOKEN_WOW) \
+ X(TOKEN_TITS) \
+ X(TOKEN_RPAREN) \
+ X(TOKEN_LPAREN)
+
+#define TOKEN_ENUM(a) a,
+#define TOKEN_STRING(a) #a,
+
+enum token {
+ TOKENS(TOKEN_ENUM)
+};
+
+const char * const token_to_string[] = {
+ TOKENS(TOKEN_STRING)
+};
+struct string_token *strings = (struct string_token[]){
+ {"test", TOKEN_TEST},
+ {"retarded", TOKEN_RETARDED},
+ {"wow", TOKEN_WOW},
+ {"tits", TOKEN_TITS},
+};
+
+const size_t nstrings = 4;
+
+const uint8_t char_to_bit[256] = {
+ ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7,
+ ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13,
+ ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19,
+ ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25,
+ ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1
+};
+
+int main(void)
+{
+ struct dict d = {0};
+ d.strings = strings;
+ d.nstrings = nstrings;
+ d.char_to_bit = char_to_bit;
+
+ dict_compile(&d);
+
+ int t;
+ if((t = dict_check(&d, "tits")) >= 0) printf("%s\n", token_to_string[t]);
+ if((t = dict_check(&d, "retarded")) >= 0) printf("%s\n", token_to_string[t]);
+ if((t = dict_check(&d, "test2")) >= 0) printf("%s\n", token_to_string[t]);
+ if((t = dict_check(&d, "tes")) >= 0) printf("%s\n", token_to_string[t]);
+
+ dict_free(&d);
+}
+
+#endif