diff options
Diffstat (limited to 'lexer.c')
-rw-r--r-- | lexer.c | 88 |
1 files changed, 48 insertions, 40 deletions
@@ -10,12 +10,14 @@ // and more characters // - add easier way to write chars to bits (maybe a singe string) +#define ARR_LEN(arr) (sizeof(arr) / sizeof(*arr)) -#define ARR_LENGTH(arr) (sizeof(arr)/sizeof(*arr)) +typedef int token; +extern const char *const token_to_string[]; +extern const struct string_token { char *s; token t;} strings[]; +extern const token separators[]; -#define popcount(x) (__builtin_popcount(x)) - -#define MAPPED_CHARS 32 +#ifdef _LEXER_STANDALONE #define TOKENS(X) \ X(TOKEN_NONE) \ @@ -33,13 +35,13 @@ enum token { TOKENS(TOKEN_ENUM) }; -const char * const to_string[] = { +const char * const token_to_string[] = { TOKENS(TOKEN_STRING) }; -const struct { +const struct string_token { char *s; - enum token t; + token t; } strings[] = { {"test", TOKEN_TEST}, {"retarded", TOKEN_RETARDED}, @@ -47,6 +49,10 @@ const struct { {"tits", TOKEN_TITS}, }; +const token separators[] = {['{'] = TOKEN_LPAREN, ['}'] = TOKEN_RPAREN}; + +#endif + const uint8_t char_to_bit[256] = { ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7, ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13, @@ -55,26 +61,24 @@ const uint8_t char_to_bit[256] = { ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1 }; -const enum token separators[] = { - ['{'] = TOKEN_LPAREN, - ['}'] = TOKEN_RPAREN -}; - struct level { uint64_t bit_mask; uint64_t *token_masks; }; -struct level start_level = {0}; -struct level *bit_to_ptr[MAPPED_CHARS] = {0}; +#define MAPPED_CHARS 32 +static struct level start_level = {0}; +static struct level *bit_to_ptr[MAPPED_CHARS] = {0}; static size_t num_levels; #define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]]) -int compile_tables(void) +#define popcount(x) (__builtin_popcount(x)) + +int compile_lextables(void) { // max number of levels - for(size_t i = 0; i < ARR_LENGTH(strings); i++) + for(size_t i = 0; i < ARR_LEN(strings); i++) if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s); // allocated levels @@ -86,11 +90,11 @@ int compile_tables(void) // BUG: everything is repeated for the start_level // populate bit_masks - for(size_t i = 0; i < ARR_LENGTH(strings); i++) { + for(size_t i = 0; i < ARR_LEN(strings); i++) { struct level *l = &start_level; for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { uint8_t bit = char_to_bit[strings[i].s[j]]; - + l->bit_mask |= 1 << bit; l = &bit_to_ptr[bit][j]; } @@ -105,50 +109,50 @@ int compile_tables(void) * sizeof(*l->token_masks)); memset(l->token_masks, 0, popcount(l->bit_mask) * sizeof(*l->token_masks)); - + l = &bit_to_ptr[i][j]; } } // populate token_masks - for(size_t i = 0; i < ARR_LENGTH(strings); i++) { + for(size_t i = 0; i < ARR_LEN(strings); i++) { struct level *l = &start_level; for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { - uint8_t bit = char_to_bit[strings[i].s[j]]; + uint8_t bit = char_to_bit[strings[i].s[j]]; uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); l->token_masks[idx] |= 1 << strings[i].t; l = &bit_to_ptr[bit][j]; } } - + return 0; } -void print_tables(void) +void print_lextables(void) { - for(size_t i = 0; i < 256; i++) + for(size_t i = 0; i < 256; i++) for(size_t j = 0; j < num_levels; j++) if(CHAR_TO_PTR(i)[j].bit_mask) { - printf("%c, %d, %32b ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); + printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); printf("{ "); for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++) - printf("%b ", CHAR_TO_PTR(i)[j].token_masks[k]); + printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]); printf(" }\n"); } - - printf(" %32b ", start_level.bit_mask); + + printf(" %32lb ", start_level.bit_mask); printf("{ "); for(size_t k = 0; k < popcount(start_level.bit_mask); k++) - printf("%b ", start_level.token_masks[k]); + printf("%lb ", start_level.token_masks[k]); printf(" }\n"); printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE "); } -int tokenize_string(char *string, enum token *t, size_t t_len) +int tokenize_string(char *string, token *t, size_t t_len) { size_t ntokens = 0; size_t i = 0; @@ -170,14 +174,14 @@ int tokenize_string(char *string, enum token *t, size_t t_len) while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++; break; } - + uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); token_mask &= l->token_masks[idx]; if(bit == 1) break; i++; } - + // BUG: not checking of ntokens is in t_len if(token_mask) t[ntokens++] = __builtin_ctz(token_mask); @@ -188,11 +192,11 @@ int tokenize_string(char *string, enum token *t, size_t t_len) off = ++i; } - + return ntokens; } -void free_tables(void) +void free_lextables(void) { free(start_level.token_masks); for(size_t i = 0; i < MAPPED_CHARS; i++) { @@ -204,22 +208,26 @@ void free_tables(void) } } +#ifdef _LEXER_STANDALONE + // exit with a given code, default is 1 #define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1) int main(void) { - compile_tables() && DIE(); + compile_lextables() && DIE(); - enum token t[120] = {0}; + token t[120] = {0}; size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120); ntokens || DIE(10); - + for(size_t i = 0; i < ntokens; i++) - printf("%s\n", to_string[t[i]]); + printf("%s\n", token_to_string[t[i]]); - print_tables(); - free_tables(); + print_lextables(); + free_lextables(); return 0; } + +#endif |