about summary refs log tree commit diff
path: root/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'lexer.c')
-rw-r--r-- lexer.c 88
1 files changed, 48 insertions, 40 deletions
diff --git a/lexer.c b/lexer.c
index 6df77c6..2fca9a7 100644
--- a/lexer.c
+++ b/lexer.c
@@ -10,12 +10,14 @@
// and more characters
// - add easier way to write chars to bits (maybe a singe string)
+#define ARR_LEN(arr) (sizeof(arr) / sizeof(*arr))
-#define ARR_LENGTH(arr) (sizeof(arr)/sizeof(*arr))
+typedef int token;
+extern const char *const token_to_string[];
+extern const struct string_token { char *s; token t;} strings[];
+extern const token separators[];
-#define popcount(x) (__builtin_popcount(x))
-
-#define MAPPED_CHARS 32
+#ifdef _LEXER_STANDALONE
#define TOKENS(X) \
X(TOKEN_NONE) \
@@ -33,13 +35,13 @@ enum token {
TOKENS(TOKEN_ENUM)
};
-const char * const to_string[] = {
+const char * const token_to_string[] = {
TOKENS(TOKEN_STRING)
};
-const struct {
+const struct string_token {
char *s;
- enum token t;
+ token t;
} strings[] = {
{"test", TOKEN_TEST},
{"retarded", TOKEN_RETARDED},
@@ -47,6 +49,10 @@ const struct {
{"tits", TOKEN_TITS},
};
+const token separators[] = {['{'] = TOKEN_LPAREN, ['}'] = TOKEN_RPAREN};
+
+#endif
+
const uint8_t char_to_bit[256] = {
['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7,
['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13,
@@ -55,26 +61,24 @@ const uint8_t char_to_bit[256] = {
['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1
};
-const enum token separators[] = {
- ['{'] = TOKEN_LPAREN,
- ['}'] = TOKEN_RPAREN
-};
-
struct level {
uint64_t bit_mask;
uint64_t *token_masks;
};
-struct level start_level = {0};
-struct level *bit_to_ptr[MAPPED_CHARS] = {0};
+#define MAPPED_CHARS 32
+static struct level start_level = {0};
+static struct level *bit_to_ptr[MAPPED_CHARS] = {0};
static size_t num_levels;
#define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]])
-int compile_tables(void)
+#define popcount(x) (__builtin_popcount(x))
+
+int compile_lextables(void)
{
// max number of levels
- for(size_t i = 0; i < ARR_LENGTH(strings); i++)
+ for(size_t i = 0; i < ARR_LEN(strings); i++)
if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s);
// allocated levels
@@ -86,11 +90,11 @@ int compile_tables(void)
// BUG: everything is repeated for the start_level
// populate bit_masks
- for(size_t i = 0; i < ARR_LENGTH(strings); i++) {
+ for(size_t i = 0; i < ARR_LEN(strings); i++) {
struct level *l = &start_level;
for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
uint8_t bit = char_to_bit[strings[i].s[j]];
-
+
l->bit_mask |= 1 << bit;
l = &bit_to_ptr[bit][j];
}
@@ -105,50 +109,50 @@ int compile_tables(void)
* sizeof(*l->token_masks));
memset(l->token_masks, 0, popcount(l->bit_mask)
* sizeof(*l->token_masks));
-
+
l = &bit_to_ptr[i][j];
}
}
// populate token_masks
- for(size_t i = 0; i < ARR_LENGTH(strings); i++) {
+ for(size_t i = 0; i < ARR_LEN(strings); i++) {
struct level *l = &start_level;
for(size_t j = 0; j < strlen(strings[i].s)+1; j++) {
- uint8_t bit = char_to_bit[strings[i].s[j]];
+ uint8_t bit = char_to_bit[strings[i].s[j]];
uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
l->token_masks[idx] |= 1 << strings[i].t;
l = &bit_to_ptr[bit][j];
}
}
-
+
return 0;
}
-void print_tables(void)
+void print_lextables(void)
{
- for(size_t i = 0; i < 256; i++)
+ for(size_t i = 0; i < 256; i++)
for(size_t j = 0; j < num_levels; j++)
if(CHAR_TO_PTR(i)[j].bit_mask) {
- printf("%c, %d, %32b ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask);
+ printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask);
printf("{ ");
for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++)
- printf("%b ", CHAR_TO_PTR(i)[j].token_masks[k]);
+ printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]);
printf(" }\n");
}
-
- printf(" %32b ", start_level.bit_mask);
+
+ printf(" %32lb ", start_level.bit_mask);
printf("{ ");
for(size_t k = 0; k < popcount(start_level.bit_mask); k++)
- printf("%b ", start_level.token_masks[k]);
+ printf("%lb ", start_level.token_masks[k]);
printf(" }\n");
printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE ");
}
-int tokenize_string(char *string, enum token *t, size_t t_len)
+int tokenize_string(char *string, token *t, size_t t_len)
{
size_t ntokens = 0;
size_t i = 0;
@@ -170,14 +174,14 @@ int tokenize_string(char *string, enum token *t, size_t t_len)
while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++;
break;
}
-
+
uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1));
token_mask &= l->token_masks[idx];
if(bit == 1) break;
i++;
}
-
+
// BUG: not checking of ntokens is in t_len
if(token_mask)
t[ntokens++] = __builtin_ctz(token_mask);
@@ -188,11 +192,11 @@ int tokenize_string(char *string, enum token *t, size_t t_len)
off = ++i;
}
-
+
return ntokens;
}
-void free_tables(void)
+void free_lextables(void)
{
free(start_level.token_masks);
for(size_t i = 0; i < MAPPED_CHARS; i++) {
@@ -204,22 +208,26 @@ void free_tables(void)
}
}
+#ifdef _LEXER_STANDALONE
+
// exit with a given code, default is 1
#define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1)
int main(void)
{
- compile_tables() && DIE();
+ compile_lextables() && DIE();
- enum token t[120] = {0};
+ token t[120] = {0};
size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120);
ntokens || DIE(10);
-
+
for(size_t i = 0; i < ntokens; i++)
- printf("%s\n", to_string[t[i]]);
+ printf("%s\n", token_to_string[t[i]]);
- print_tables();
- free_tables();
+ print_lextables();
+ free_lextables();
return 0;
}
+
+#endif