aboutsummaryrefslogtreecommitdiff
path: root/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'lexer.c')
-rw-r--r--lexer.c225
1 files changed, 225 insertions, 0 deletions
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..6df77c6
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,225 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+// TODO: - make it more memory efficient by allocating only
+// the need amount of level for each letter
+// - add more than 64 bits "types" for more tokens
+// and more characters
+// - add easier way to write chars to bits (maybe a singe string)
+
+
+#define ARR_LENGTH(arr) (sizeof(arr)/sizeof(*arr))
+
+#define popcount(x) (__builtin_popcount(x))
+
+#define MAPPED_CHARS 32
+
+#define TOKENS(X) \
+ X(TOKEN_NONE) \
+ X(TOKEN_TEST) \
+ X(TOKEN_RETARDED) \
+ X(TOKEN_WOW) \
+ X(TOKEN_TITS) \
+ X(TOKEN_RPAREN) \
+ X(TOKEN_LPAREN)
+
+#define TOKEN_ENUM(a) a,
+#define TOKEN_STRING(a) #a,
+
+enum token {
+ TOKENS(TOKEN_ENUM)
+};
+
+const char * const to_string[] = {
+ TOKENS(TOKEN_STRING)
+};
+
+const struct {
+ char *s;
+ enum token t;
+} strings[] = {
+ {"test", TOKEN_TEST},
+ {"retarded", TOKEN_RETARDED},
+ {"wow", TOKEN_WOW},
+ {"tits", TOKEN_TITS},
+};
+
+const uint8_t char_to_bit[256] = {
+ ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7,
+ ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13,
+ ['m'] = 14, ['n'] = 15, ['o'] = 16, ['p'] = 17, ['q'] = 18, ['r'] = 19,
+ ['s'] = 20, ['t'] = 21, ['u'] = 22, ['v'] = 23, ['w'] = 24, ['x'] = 25,
+ ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1
+};
+
+const enum token separators[] = {
+ ['{'] = TOKEN_LPAREN,
+ ['}'] = TOKEN_RPAREN
+};
+
+struct level {
+ uint64_t bit_mask;
+ uint64_t *token_masks;
+};
+
+struct level start_level = {0};
+struct level *bit_to_ptr[MAPPED_CHARS] = {0};
+static size_t num_levels;
+
+#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]])
+
+int compile_tables(void)
+{
+ // max number of levels
+ for(size_t i = 0; i < ARR_LENGTH(strings); i++)
+ if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s);
+
+ // allocated levels
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ bit_to_ptr[i] = calloc(num_levels, sizeof(*bit_to_ptr[i]));
+ if(!bit_to_ptr[i]) return 1;
+ }
+
+ // BUG: everything is repeated for the start_level
+
+ // populate bit_masks
+ for(size_t i = 0; i < ARR_LENGTH(strings); i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
+ uint8_t bit = char_to_bit[strings[i].s[j]];
+
+ l->bit_mask |= 1 << bit;
+ l = &bit_to_ptr[bit][j];
+ }
+ }
+
+ // allocate token_masks
+ // NOTE: start_level alloc'd many times, so realloc is used
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < num_levels + 1; j++) {
+ l->token_masks = realloc(l->token_masks, popcount(l->bit_mask)
+ * sizeof(*l->token_masks));
+ memset(l->token_masks, 0, popcount(l->bit_mask)
+ * sizeof(*l->token_masks));
+
+ l = &bit_to_ptr[i][j];
+ }
+ }
+
+ // populate token_masks
+ for(size_t i = 0; i < ARR_LENGTH(strings); i++) {
+ struct level *l = &start_level;
+ for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
+ uint8_t bit = char_to_bit[strings[i].s[j]];
+ uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
+
+ l->token_masks[idx] |= 1 << strings[i].t;
+ l = &bit_to_ptr[bit][j];
+ }
+ }
+
+ return 0;
+}
+
+void print_tables(void)
+{
+ for(size_t i = 0; i < 256; i++)
+ for(size_t j = 0; j < num_levels; j++)
+ if(CHAR_TO_PTR(i)[j].bit_mask) {
+ printf("%c, %d, %32b ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask);
+
+ printf("{ ");
+ for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++)
+ printf("%b ", CHAR_TO_PTR(i)[j].token_masks[k]);
+ printf(" }\n");
+ }
+
+
+ printf(" %32b ", start_level.bit_mask);
+ printf("{ ");
+ for(size_t k = 0; k < popcount(start_level.bit_mask); k++)
+ printf("%b ", start_level.token_masks[k]);
+ printf(" }\n");
+
+ printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE ");
+}
+
+int tokenize_string(char *string, enum token *t, size_t t_len)
+{
+ size_t ntokens = 0;
+ size_t i = 0;
+ size_t off = 0;
+
+ while(i < strlen(string) + 1) {
+ uint64_t token_mask = ~(uint64_t)0;
+ while(1) {
+ struct level *l = (i-off == 0)
+ ? &start_level
+ : &bit_to_ptr[char_to_bit[string[i-1]]][i-1-off];
+
+ uint8_t bit = (separators[string[i]])
+ ? 1
+ : char_to_bit[string[i]];
+
+ if((l->bit_mask & (1 << bit)) == 0) {
+ token_mask = 0;
+ while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++;
+ break;
+ }
+
+ uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
+ token_mask &= l->token_masks[idx];
+
+ if(bit == 1) break;
+ i++;
+ }
+
+ // BUG: not checking of ntokens is in t_len
+ if(token_mask)
+ t[ntokens++] = __builtin_ctz(token_mask);
+ else if(off != i) t[ntokens++] = TOKEN_NONE;
+
+ if(separators[string[i]])
+ t[ntokens++] = separators[string[i]];
+
+ off = ++i;
+ }
+
+ return ntokens;
+}
+
+void free_tables(void)
+{
+ free(start_level.token_masks);
+ for(size_t i = 0; i < MAPPED_CHARS; i++) {
+ for(size_t j = 0; j < num_levels; j++) {
+ if(bit_to_ptr[i][j].token_masks)
+ free(bit_to_ptr[i][j].token_masks);
+ }
+ free(bit_to_ptr[i]);
+ }
+}
+
+// exit with a given code, default is 1
+#define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1)
+
+int main(void)
+{
+ compile_tables() && DIE();
+
+ enum token t[120] = {0};
+ size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120);
+
+ ntokens || DIE(10);
+
+ for(size_t i = 0; i < ntokens; i++)
+ printf("%s\n", to_string[t[i]]);
+
+ print_tables();
+ free_tables();
+ return 0;
+}