diff options
-rwxr-xr-x | build.sh | 6 | ||||
-rw-r--r-- | lexer.c | 88 | ||||
-rw-r--r-- | lr0-table.c (renamed from lr0.c) | 284 |
3 files changed, 204 insertions, 174 deletions
@@ -2,12 +2,12 @@ set -xe -# gcc -Wall -Wextra -g lexer.c -o lexer +# gcc -Wall -Wextra -g -D_LEXER_STANDALONE lexer.c -o lexer # gcc -Wall -Wextra -g recursive-ascent.c -o recursive-ascent # gcc -Wall -Wextra -g recursive-ascent-descent.c -o recursive-ascent-descent -gcc -Wall -Wextra -g lr0.c -o lr0 +gcc -Wall -Wextra -g lr0-table.c -D_LR0_TABLE_STANDALONE -o lr0-table # valgrind --leak-check=full --show-leak-kinds=all -s ./lexer # valgrind --leak-check=full --show-leak-kinds=all -s ./recursive-ascent # valgrind --leak-check=full --show-leak-kinds=all -s ./recursive-ascent-descent -valgrind --leak-check=full --show-leak-kinds=all -s ./lr0 +valgrind --leak-check=full --show-leak-kinds=all -s ./lr0-table @@ -10,12 +10,14 @@ // and more characters // - add easier way to write chars to bits (maybe a singe string) +#define ARR_LEN(arr) (sizeof(arr) / sizeof(*arr)) -#define ARR_LENGTH(arr) (sizeof(arr)/sizeof(*arr)) +typedef int token; +extern const char *const token_to_string[]; +extern const struct string_token { char *s; token t;} strings[]; +extern const token separators[]; -#define popcount(x) (__builtin_popcount(x)) - -#define MAPPED_CHARS 32 +#ifdef _LEXER_STANDALONE #define TOKENS(X) \ X(TOKEN_NONE) \ @@ -33,13 +35,13 @@ enum token { TOKENS(TOKEN_ENUM) }; -const char * const to_string[] = { +const char * const token_to_string[] = { TOKENS(TOKEN_STRING) }; -const struct { +const struct string_token { char *s; - enum token t; + token t; } strings[] = { {"test", TOKEN_TEST}, {"retarded", TOKEN_RETARDED}, @@ -47,6 +49,10 @@ const struct { {"tits", TOKEN_TITS}, }; +const token separators[] = {['{'] = TOKEN_LPAREN, ['}'] = TOKEN_RPAREN}; + +#endif + const uint8_t char_to_bit[256] = { ['a'] = 2, ['b'] = 3, ['c'] = 4, ['d'] = 5, ['e'] = 6, ['f'] = 7, ['g'] = 8, ['h'] = 9, ['i'] = 10, ['j'] = 11, ['k'] = 12, ['l'] = 13, @@ -55,26 +61,24 @@ const uint8_t char_to_bit[256] = { ['y'] = 26, ['z'] = 27, [ 0 ] = 1, [' '] = 1 }; -const enum token separators[] = { - ['{'] = TOKEN_LPAREN, - ['}'] = TOKEN_RPAREN -}; - struct level { uint64_t bit_mask; uint64_t *token_masks; }; -struct level start_level = {0}; -struct level *bit_to_ptr[MAPPED_CHARS] = {0}; +#define MAPPED_CHARS 32 +static struct level start_level = {0}; +static struct level *bit_to_ptr[MAPPED_CHARS] = {0}; static size_t num_levels; #define CHAR_TO_PTR(c) (bit_to_ptr[char_to_bit[c]]) -int compile_tables(void) +#define popcount(x) (__builtin_popcount(x)) + +int compile_lextables(void) { // max number of levels - for(size_t i = 0; i < ARR_LENGTH(strings); i++) + for(size_t i = 0; i < ARR_LEN(strings); i++) if(strlen(strings[i].s) > num_levels) num_levels = strlen(strings[i].s); // allocated levels @@ -86,11 +90,11 @@ int compile_tables(void) // BUG: everything is repeated for the start_level // populate bit_masks - for(size_t i = 0; i < ARR_LENGTH(strings); i++) { + for(size_t i = 0; i < ARR_LEN(strings); i++) { struct level *l = &start_level; for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { uint8_t bit = char_to_bit[strings[i].s[j]]; - + l->bit_mask |= 1 << bit; l = &bit_to_ptr[bit][j]; } @@ -105,50 +109,50 @@ int compile_tables(void) * sizeof(*l->token_masks)); memset(l->token_masks, 0, popcount(l->bit_mask) * sizeof(*l->token_masks)); - + l = &bit_to_ptr[i][j]; } } // populate token_masks - for(size_t i = 0; i < ARR_LENGTH(strings); i++) { + for(size_t i = 0; i < ARR_LEN(strings); i++) { struct level *l = &start_level; for(size_t j = 0; j < strlen(strings[i].s)+1; j++) { - uint8_t bit = char_to_bit[strings[i].s[j]]; + uint8_t bit = char_to_bit[strings[i].s[j]]; uint8_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); l->token_masks[idx] |= 1 << strings[i].t; l = &bit_to_ptr[bit][j]; } } - + return 0; } -void print_tables(void) +void print_lextables(void) { - for(size_t i = 0; i < 256; i++) + for(size_t i = 0; i < 256; i++) for(size_t j = 0; j < num_levels; j++) if(CHAR_TO_PTR(i)[j].bit_mask) { - printf("%c, %d, %32b ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); + printf("%c, %zu, %32lb ", (char)i, j, CHAR_TO_PTR(i)[j].bit_mask); printf("{ "); for(size_t k = 0; k < popcount(CHAR_TO_PTR(i)[j].bit_mask); k++) - printf("%b ", CHAR_TO_PTR(i)[j].token_masks[k]); + printf("%lb ", CHAR_TO_PTR(i)[j].token_masks[k]); printf(" }\n"); } - - printf(" %32b ", start_level.bit_mask); + + printf(" %32lb ", start_level.bit_mask); printf("{ "); for(size_t k = 0; k < popcount(start_level.bit_mask); k++) - printf("%b ", start_level.token_masks[k]); + printf("%lb ", start_level.token_masks[k]); printf(" }\n"); printf(" %32s\n", "zyxwvutsrqponmlkjihgfedcbaE "); } -int tokenize_string(char *string, enum token *t, size_t t_len) +int tokenize_string(char *string, token *t, size_t t_len) { size_t ntokens = 0; size_t i = 0; @@ -170,14 +174,14 @@ int tokenize_string(char *string, enum token *t, size_t t_len) while(!separators[string[i]] && char_to_bit[string[i]] != 1) i++; break; } - + uint64_t idx = popcount(l->bit_mask & ((1 << bit) - 1)); token_mask &= l->token_masks[idx]; if(bit == 1) break; i++; } - + // BUG: not checking of ntokens is in t_len if(token_mask) t[ntokens++] = __builtin_ctz(token_mask); @@ -188,11 +192,11 @@ int tokenize_string(char *string, enum token *t, size_t t_len) off = ++i; } - + return ntokens; } -void free_tables(void) +void free_lextables(void) { free(start_level.token_masks); for(size_t i = 0; i < MAPPED_CHARS; i++) { @@ -204,22 +208,26 @@ void free_tables(void) } } +#ifdef _LEXER_STANDALONE + // exit with a given code, default is 1 #define DIE(...) (printf("ERROR %s:%d\n", __FILE__, __LINE__), __VA_OPT__(exit(__VA_ARGS__),) exit(1), 1) int main(void) { - compile_tables() && DIE(); + compile_lextables() && DIE(); - enum token t[120] = {0}; + token t[120] = {0}; size_t ntokens = tokenize_string("tits tits2 retarded wow{tits}test test }}{{test", t, 120); ntokens || DIE(10); - + for(size_t i = 0; i < ntokens; i++) - printf("%s\n", to_string[t[i]]); + printf("%s\n", token_to_string[t[i]]); - print_tables(); - free_tables(); + print_lextables(); + free_lextables(); return 0; } + +#endif @@ -1,45 +1,33 @@ #include <stdio.h> #include <stdlib.h> -#define ARR_LEN(...) (sizeof(__VA_ARGS__) / sizeof(*(__VA_ARGS__))) +#ifndef MAX_SYMBOLS +#define MAX_SYMBOLS 64 +#endif -enum symbol { - PLUS = 0, - MINUS, - LPAREN, - RPAREN, - N0, N1, - END_INPUT, - - EP, E, T, N, - SYMBOL_COUNT, -}; - -static inline int is_terminal(enum symbol s) { return s < E; } -static inline int is_nonterminal(enum symbol s) { return s >= E; } +#ifdef XCALLOC_IMPLEMENTED +extern void *xcalloc(size_t n, size_t size); +#else +void *xcalloc(size_t n, size_t size) { void *addr = calloc(n, size); return addr ? addr : (exit(1), NULL); } +#endif -struct production { - enum symbol LHS; - enum symbol *RHS; - size_t nRHS; -}; +typedef unsigned int symbol; -#define PROD(LHS, _, ...) {LHS, (enum symbol[]){__VA_ARGS__}, sizeof((enum symbol[]){__VA_ARGS__})/sizeof(enum symbol)} +extern const size_t total_symbols; +extern int symbol_is_terminal(symbol s); +extern int symbol_is_nonterminal(symbol s); +extern int symbol_is_input_end(symbol s); -static struct production grammar[] = { - PROD(EP, ->, E, END_INPUT), - PROD(E, -->, E, PLUS, T), - PROD(E, -->, E, MINUS, T), - PROD(E, -->, T), - PROD(T, -->, LPAREN, E, RPAREN), - PROD(T, -->, N), - PROD(N, -->, N0), - PROD(N, -->, N1), -}; +extern struct production { + symbol LHS; + symbol *RHS; + size_t nRHS; +} grammar[]; +extern const size_t total_productions; -void print_grammar() +void grammar_print() { - for(size_t i = 0; i < ARR_LEN(grammar); i++) { + for(size_t i = 0; i < total_productions; i++) { printf("%d --> ", grammar[i].LHS); for(size_t j = 0; j < grammar[i].nRHS; j++) printf("%d ", grammar[i].RHS[j]); @@ -47,21 +35,10 @@ void print_grammar() } } -static int follow[SYMBOL_COUNT][SYMBOL_COUNT]; -void fill_follow_table(); -void print_follow_table() -{ - printf(" "); - for(size_t i = 0; i < SYMBOL_COUNT; i++) printf("%2zu ", i); - printf("\n"); - - for(size_t i = 0; i < SYMBOL_COUNT; i++) { - printf("%2zu ", i); - for(size_t j = 0; j < SYMBOL_COUNT; j++) - printf(" %s ", follow[i][j] ? "x" : " "); - printf("\n"); - } -} +static int follow[MAX_SYMBOLS][MAX_SYMBOLS]; + +void follow_table_fill(); +void follow_table_print(); struct action { enum action_type { @@ -72,45 +49,19 @@ struct action { size_t arg; }; -// THIS IS BAD, DONT DO IT LIKE THAT #define TABLE_CAP 64 -static struct action table[TABLE_CAP][SYMBOL_COUNT]; -// DO IT LIKE THAT -// static struct action *(table[SYMBOL_COUNT]); -static size_t states; - -int table_insert(size_t state, enum symbol sym, struct action a); -void print_table() -{ - printf(" "); - for(size_t sym = 0; sym < SYMBOL_COUNT; sym++) printf("%2zu ", sym); - printf("\n"); - - char action_to_char[] = {[ACTION_SHIFT] = 's', [ACTION_REDUCE] = 'r', [ACTION_GOTO] = 'g'}; - for(size_t i = 0; i < states; i++) { - printf("%2zu ", i); - for(size_t sym = 0; sym < SYMBOL_COUNT; sym++) - if(table[i][sym].type == ACTION_ACCEPT) printf(" a "); - else if(table[i][sym].type) printf("%c%-2zu ", action_to_char[table[i][sym].type], table[i][sym].arg); - else printf(" "); - printf("\n"); - } -} +static struct action table[TABLE_CAP][MAX_SYMBOLS]; // *(table[total_symbols]); +static size_t table_states; +int table_insert(size_t state, symbol sym, struct action a); +void table_print(); struct item { size_t prod_idx; size_t dot; }; -int items_eq(struct item *i1, struct item *i2) { return (i1->dot == i2->dot && i1->prod_idx == i2->prod_idx) ? 1 : 0; } -void print_set(struct item *set, size_t nset) -{ - printf("{"); - for(size_t i = 0; i < nset; i++) - printf("{%zu, %zu}, ", set[i].prod_idx, set[i].dot); - printf("}\n"); -} +int item_eq(struct item *i1, struct item *i2) { return (i1->dot == i2->dot && i1->prod_idx == i2->prod_idx) ? 1 : 0; } #define SEEN_SETS_CAP 64 static struct { @@ -120,35 +71,30 @@ static struct { } seen_sets[SEEN_SETS_CAP]; static size_t nseen_sets; -void free_seen_sets() { for(size_t i = 0; i < nseen_sets; i++) free(seen_sets[i].items);} - -size_t handle_set(struct item *set, size_t nset); -int insert_set(size_t state, struct item *initial_set, size_t ninitial); -size_t closure(struct item *in_set, size_t nin, struct item *out_set, size_t nout); - -void *xcalloc(size_t n, size_t size) { void *addr = calloc(n, size); return addr ? addr : (exit(1), NULL); } +void seen_sets_free() { for(size_t i = 0; i < nseen_sets; i++) free(seen_sets[i].items);} -int main(void) +size_t itemset_handle(struct item *set, size_t nset); +int itemset_insert(size_t state, struct item *initial_set, size_t ninitial); +size_t itemset_closure(struct item *in_set, size_t nin, struct item *out_set, size_t nout); +void itemset_print(struct item *set, size_t nset) { - fill_follow_table(); - - handle_set((struct item[]){{0, 0}}, 1); - print_table(); - - free_seen_sets(); + printf("{"); + for(size_t i = 0; i < nset; i++) + printf("{%zu, %zu}, ", set[i].prod_idx, set[i].dot); + printf("}\n"); } -size_t handle_set(struct item *set, size_t nset) +size_t itemset_handle(struct item *set, size_t nset) { // 1. is set in seen_sets for(size_t i = 0; i < nseen_sets; i++) { if(seen_sets[i].nitems != nset) continue; - + int _seen = 0; for(size_t j = 0; j < nset; j++) { _seen = 0; for(size_t k = 0; k < nset; k++) - if(items_eq(&seen_sets[i].items[k], &set[j])) _seen = 1; + if(item_eq(&seen_sets[i].items[k], &set[j])) _seen = 1; if(!_seen) break; } if(_seen) return seen_sets[i].state; @@ -157,49 +103,49 @@ size_t handle_set(struct item *set, size_t nset) // 2. add set to seen_sets if(nseen_sets >= SEEN_SETS_CAP) { fprintf(stderr, "ERROR: SEEN_SET_CAP exceeded\n"); - exit(1); + exit(1); } - + seen_sets[nseen_sets].items = xcalloc(nset, sizeof(*set)); seen_sets[nseen_sets].nitems = nset; for(size_t i = 0; i < nset; i++) seen_sets[nseen_sets].items[i] = set[i]; // 3. insert new state - size_t new_state = seen_sets[nseen_sets++].state = states++; + size_t new_state = seen_sets[nseen_sets++].state = table_states++; if(new_state >= TABLE_CAP) { fprintf(stderr, "ERROR: TABLE_CAP exceeded\n"); exit(1); } - if(insert_set(new_state, set, nset)) { - fprintf(stderr, "ERROR: insert_set failed\n"); + if(itemset_insert(new_state, set, nset)) { + fprintf(stderr, "ERROR: itemset_insert failed\n"); exit(1); } - + return new_state; } #define CLOSURE_SET_CAP 64 #define GOTO_SET_CAP 32 -int insert_set(size_t state, struct item *initial_set, size_t ninitial) +int itemset_insert(size_t state, struct item *initial_set, size_t ninitial) { struct item closure_set[CLOSURE_SET_CAP]; - - size_t nclosure = closure(initial_set, ninitial, closure_set, CLOSURE_SET_CAP); + + size_t nclosure = itemset_closure(initial_set, ninitial, closure_set, CLOSURE_SET_CAP); if(nclosure > CLOSURE_SET_CAP) { fprintf(stderr, "ERROR: CLOSURE_SET_CAP exceeded\n"); return 1; } - - for(size_t sym = 0; sym < SYMBOL_COUNT; sym++) { + + for(size_t sym = 0; sym < total_symbols; sym++) { struct item goto_set[GOTO_SET_CAP]; - size_t ngoto = 0; - + size_t ngoto = 0; + for(size_t j = 0; j < nclosure; j++) { struct production *p = &grammar[closure_set[j].prod_idx]; size_t dot = closure_set[j].dot; - + if(dot == p->nRHS) { if(!follow[p->LHS][sym]) continue; if(table_insert(state, sym, (struct action){ @@ -207,50 +153,50 @@ int insert_set(size_t state, struct item *initial_set, size_t ninitial) return 1; continue; } - + if(p->RHS[dot] == sym) { if(ngoto >= GOTO_SET_CAP) { fprintf(stderr, "ERROR: GOTO_SET_CAP exceeded\n"); return 1; - } + } goto_set[ngoto] = closure_set[j]; goto_set[ngoto++].dot++; } } if(ngoto == 0) continue; - - if(sym == END_INPUT) { + + if(symbol_is_input_end(sym)) { if(table_insert(state, sym, (struct action){ACTION_ACCEPT, 0})) return 1; continue; } - - size_t new_state = handle_set(goto_set, ngoto); + + size_t new_state = itemset_handle(goto_set, ngoto); if(table_insert(state, sym, (struct action){ - is_terminal(sym) ? ACTION_SHIFT : ACTION_GOTO, + symbol_is_terminal(sym) ? ACTION_SHIFT : ACTION_GOTO, new_state})) return 1; } - + return 0; } -size_t closure(struct item *in_set, size_t nin, struct item *out_set, size_t nout_max) +size_t itemset_closure(struct item *in_set, size_t nin, struct item *out_set, size_t nout_max) { size_t nout = nin; - + if(nout > nout_max) return nout; for(size_t i = 0; i < nin; i++) out_set[i] = in_set[i]; - int *is_in_closure = xcalloc(ARR_LEN(grammar), sizeof(int)); + int *is_in_closure = xcalloc(total_productions, sizeof(int)); for(size_t i = 0; i < nout; i++) { struct production *p = &grammar[out_set[i].prod_idx]; if(out_set[i].dot == p->nRHS) continue; - enum symbol sym = p->RHS[out_set[i].dot]; - - for(size_t j = 0; j < ARR_LEN(grammar); j++) + symbol sym = p->RHS[out_set[i].dot]; + + for(size_t j = 0; j < total_productions; j++) if(grammar[j].LHS == sym) { if(is_in_closure[j]) continue; is_in_closure[j] = 1; @@ -265,21 +211,21 @@ cleanup: return nout; } -void fill_follow_table() +void follow_table_fill() { - for(size_t i = 0; i < ARR_LEN(grammar); i++) { + for(size_t i = 0; i < total_productions; i++) { struct production *p = &grammar[i]; for(size_t j = 1; j < p->nRHS; j++) follow[p->RHS[j-1]][p->RHS[j]] = 1; } #define set(e) if((e) != 1) changed = (e) = 1 - + int changed; do { changed = 0; - for(size_t i = 0; i < ARR_LEN(grammar); i++) - for(size_t sym = 0; sym < SYMBOL_COUNT; sym++) { + for(size_t i = 0; i < total_productions; i++) + for(size_t sym = 0; sym < total_symbols; sym++) { struct production *p = &grammar[i]; if(follow[p->LHS][sym]) set(follow[p->RHS[p->nRHS-1]][sym]); @@ -289,7 +235,21 @@ void fill_follow_table() } while(changed); } -int table_insert(size_t state, enum symbol sym, struct action a) +void follow_table_print() +{ + printf(" "); + for(size_t i = 0; i < total_symbols; i++) printf("%2zu ", i); + printf("\n"); + + for(size_t i = 0; i < total_symbols; i++) { + printf("%2zu ", i); + for(size_t j = 0; j < total_symbols; j++) + printf(" %s ", follow[i][j] ? "x" : " "); + printf("\n"); + } +} + +int table_insert(size_t state, symbol sym, struct action a) { if(table[state][sym].type != ACTION_NOT_SET) { fprintf(stderr, "TABLE COLLISION on state '%zu' sym '%d'\n", state, sym); @@ -298,7 +258,69 @@ int table_insert(size_t state, enum symbol sym, struct action a) a.type, a.arg); return 1; } - + table[state][sym] = a; return 0; } + +void table_print() +{ + printf(" "); + for(size_t sym = 0; sym < total_symbols; sym++) printf("%2zu ", sym); + printf("\n"); + + char action_to_char[] = {[ACTION_SHIFT] = 's', [ACTION_REDUCE] = 'r', [ACTION_GOTO] = 'g'}; + for(size_t i = 0; i < table_states; i++) { + printf("%2zu ", i); + for(size_t sym = 0; sym < total_symbols; sym++) + if(table[i][sym].type == ACTION_ACCEPT) printf(" a "); + else if(table[i][sym].type) printf("%c%-2zu ", action_to_char[table[i][sym].type], table[i][sym].arg); + else printf(" "); + printf("\n"); + } +} + +#ifdef _LR0_TABLE_STANDALONE + +enum symbol { + PLUS = 0, + MINUS, + LPAREN, + RPAREN, + N0, N1, + END_INPUT, + + EP, E, T, N, + SYMBOLS_END, +}; +const size_t total_symbols = SYMBOLS_END; + +int symbol_is_terminal(symbol s) { return s < E; } +int symbol_is_nonterminal(symbol s) { return s >= E; } +int symbol_is_input_end(symbol s) { return s == END_INPUT; } + +#define PROD(LHS, _, ...) {LHS, (symbol[]){__VA_ARGS__}, sizeof((symbol[]){__VA_ARGS__})/sizeof(symbol)} +struct production grammar[] = { + PROD(EP, ->, E, END_INPUT), + PROD(E, -->, E, PLUS, T), + PROD(E, -->, E, MINUS, T), + PROD(E, -->, T), + PROD(T, -->, LPAREN, E, RPAREN), + PROD(T, -->, N), + PROD(N, -->, N0), + PROD(N, -->, N1), +}; + +const size_t total_productions = sizeof(grammar)/sizeof(*grammar); + +int main(void) +{ + follow_table_fill(); + + itemset_handle((struct item[]){{0, 0}}, 1); + table_print(); + + seen_sets_free(); +} + +#endif |