From bcac686c1bf6a5c1dec2324269e2766babdc0fde Mon Sep 17 00:00:00 2001
From: kartofen
Date: Sat, 17 Jun 2023 23:42:31 +0300
Subject: lexer - done

---
 src/lexer.c | 330 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 330 insertions(+)
 create mode 100644 src/lexer.c

diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..1acfd6d
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,330 @@
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <ctype.h>
+
+// TODO: handle escaped quotes
+#include "common.h"
+#include "lexer.h"
+
+// saves a token with no data
+// returns the index of the saved token; < 0 on fail
+static int save_empty_token(lexer_t lexer, enum token_enum type);
+
+// saves a token whose data is the current identifier (lexer->iden)
+// returns 0 on success
+static int save_current_identifier(lexer_t lexer);
+
+// used for tokens that separate things, type is optional (TOKEN_TOKENS for default)
+// returns 0 on success, < 0 on fail, and > 0 to skip the separator (add it to iden)
+static int on_generic_separator(lexer_t lexer, enum token_enum type);
+static int on_quote(lexer_t lexer);
+static int on_dot(lexer_t lexer);
+
+// try to convert the identifier (lexer->iden) to a given type
+// returns > 0 on success, 0 on fail (iden isn't the given type),
+// and < 0 on error
+static int try_str(lexer_t lexer);
+static int try_int(lexer_t lexer);
+static int try_float(lexer_t lexer);
+static int try_symbol(lexer_t lexer);
+
+#define SEPARATOR_CALLBACK_TBL(X, lexer)                              \
+    X(EQ('('),     on_generic_separator(lexer, TOKEN_PARENTHS_OPEN))  \
+    X(EQ(')'),     on_generic_separator(lexer, TOKEN_PARENTHS_CLOSE)) \
+    X(EQ('\''),    on_generic_separator(lexer, TOKEN_SPECIAL_QUOTE))  \
+    X(EQ('.'),     on_dot(lexer))                                     \
+    X(EQ('"'),     on_quote(lexer))                                   \
+    X(FN(isspace), on_generic_separator(lexer, TOKEN_TOKENS))
+
+#define IDENTIFY_IDENTIFIER_LIST(X) \
+    X(try_str)                      \
+    X(try_int)                      \
+    X(try_float)                    \
+    X(try_symbol)
+
+// X(token type, what to free, how to print on screen)
+#define TOKEN_TYPES_INFO(X, token)                                    \
+    X(TOKEN_PARENTHS_OPEN,     NULL,          "'('")                  \
+    X(TOKEN_PARENTHS_CLOSE,    NULL,          "')'")                  \
+    X(TOKEN_LITERAL_STRING,    token->string, "%s", token->string)    \
+    X(TOKEN_LITERAL_NUM_INT,   NULL,          "%ld", token->num_int)  \
+    X(TOKEN_LITERAL_NUM_FLOAT, NULL,          "%f", token->num_float) \
+    X(TOKEN_SYMBOL,            token->symbol, "%s", token->symbol)
+
+#define EQ(ch) ch ==
+#define FN(f) f
+
+// makes an if-else chain to test the character
+// against the separator callback table
+#define CHECK_SEPERATOR_AND_CALLBACK(test_func, callback) \
+    if(test_func(str[i])) {                               \
+        callback_ret = callback;                          \
+        if(callback_ret == 0) {                           \
+            continue;                                     \
+        } else if(callback_ret < 0) {                     \
+            err(#callback ": failed");                    \
+            return 1;                                     \
+        }                                                 \
+    } else
+
+int lexer_tokenize(lexer_t lexer, char *str, size_t len)
+{
+    int callback_ret = 0;
+
+    for(size_t i = 0; i < len; i++)
+    {
+        SEPARATOR_CALLBACK_TBL(CHECK_SEPERATOR_AND_CALLBACK, lexer) {}
+
+        if(lexer->iden_sz >= LEXER_IDEN_CAP - 1) { // -1 to be null-terminated
+            err("LEXER_IDEN_CAP of %ld reached", lexer->iden_sz);
+            return 1;
+        }
+
+        // add character to identifier
+        lexer->iden[lexer->iden_sz++] = str[i];
+    }
+
+    return 0;
+}
+
+lexer_t lexer_create(size_t tokens_cap)
+{
+    lexer_t lexer = malloc(sizeof(struct lexer));
+    if(!lexer) {
+        err("malloc: %s", strerror(errno));
+        goto fail;
+    }
+
+    lexer->tokens = calloc(tokens_cap, sizeof(struct token));
+    if(!lexer->tokens) {
+        err("calloc: %s", strerror(errno));
+        goto fail;
+    }
+
+    for(size_t i = 0; i < tokens_cap; i++) {
+        lexer->tokens[i].symbol = NULL;
+    }
+
+    lexer->tokens_cap = tokens_cap;
+    lexer->ntokens = 0;
+
+    memset(lexer->iden, 0, LEXER_IDEN_CAP);
+    lexer->iden_sz = 0;
+
+    lexer->inside_string = 0;
+
+    return lexer;
+fail:
+    lexer_destroy(lexer);
+    return NULL;
+}
+
+#define CASE_FREE_TOKEN(type, data, ...) \
+    case type: if(data != NULL) { free(data); } break;
+
+void lexer_destroy(lexer_t lexer)
+{
+    if(!lexer) return;
+
+    if(lexer->tokens) {
+        for(size_t i = 0; i < lexer->ntokens; i++) {
+            struct token *token = &lexer->tokens[i];
+            switch(token->type) {
+                TOKEN_TYPES_INFO(CASE_FREE_TOKEN, token)
+                default: break;
+            }
+        }
+        free(lexer->tokens);
+    }
+
+    free(lexer);
+}
+
+// ------------------------------------------------- //
+
+static int on_quote(lexer_t lexer)
+{
+    int ret = on_generic_separator(lexer, TOKEN_TOKENS);
+    if(ret == 0) {
+        // opening quote: the identifier so far was flushed, start a string
+        lexer->inside_string = 1;
+        return ret;
+    } else if(ret > 0) {
+        // closing quote: save the accumulated string while inside_string
+        // is still set, so try_str() recognizes it as a string literal
+        if(save_current_identifier(lexer)) {
+            err("save_current_identifier: failed");
+            return -1;
+        }
+        lexer->inside_string = 0;
+        return 0;
+    }
+
+    return ret;
+}
+
+static int on_dot(lexer_t lexer)
+{
+    // a dot in the middle of an identifier (e.g. a float literal) is kept
+    if(lexer->iden_sz != 0) return 1;
+    return on_generic_separator(lexer, TOKEN_SPECIAL_DOT);
+}
+
+static int on_generic_separator(lexer_t lexer, enum token_enum type)
+{
+    if(lexer->inside_string) {
+        return 1;
+    }
+
+    if(save_current_identifier(lexer)) {
+        err("save_current_identifier: failed");
+        return -1;
+    }
+
+    if(type != TOKEN_TOKENS) {
+        if(save_empty_token(lexer, type) < 0) {
+            err("save_empty_token: failed");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static int save_empty_token(lexer_t lexer, enum token_enum type)
+{
+    if(lexer->ntokens >= lexer->tokens_cap) {
+        err("tokens_cap of %ld has been reached", lexer->tokens_cap);
+        return -1;
+    }
+
+    lexer->tokens[lexer->ntokens++].type = type;
+    return lexer->ntokens - 1;
+}
+
+#define CHECK_IDEN(func)           \
+    if((ret = func(lexer))) {      \
+        if(ret < 0) {              \
+            err(#func ": failed"); \
+            goto exit;             \
+        }                          \
+    } else
+
+static int save_current_identifier(lexer_t lexer)
+{
+    int ret = 1;
+
+    if(lexer->iden_sz != 0) {
+        IDENTIFY_IDENTIFIER_LIST(CHECK_IDEN) {}
+    }
+
+    ret = 0;
+exit:
+    memset(lexer->iden, 0, lexer->iden_sz);
+    lexer->iden_sz = 0;
+    return ret;
+}
+
+
+// ------------------------------------------------- //
+
+static int try_str(lexer_t lexer)
+{
+    if(!lexer->inside_string) return 0;
+
+    int i = save_empty_token(lexer, TOKEN_LITERAL_STRING);
+    if(i < 0) {
+        err("save_empty_token: failed");
+        return -1;
+    }
+
+    lexer->tokens[i].string = malloc(lexer->iden_sz+1);
+    if(!lexer->tokens[i].string) {
+        err("malloc: %s", strerror(errno));
+        return -1;
+    }
+
+    memcpy(lexer->tokens[i].string, lexer->iden, lexer->iden_sz+1);
+    return 1;
+}
+
+static int try_int(lexer_t lexer)
+{
+    errno = 0; // reset so a strtol() range error can be detected
+
+    char *endptr;
+    long num = strtol(lexer->iden, &endptr, 10);
+
+    if(*endptr != '\0') { // the whole string isn't a number
+        return 0;
+    }
+
+    if(errno == ERANGE) {
+        warn("Given integer literal %s is outside the possible range", lexer->iden);
+    }
+
+    int i = save_empty_token(lexer, TOKEN_LITERAL_NUM_INT);
+    if(i < 0) {
+        err("save_empty_token: failed");
+        return -1;
+    }
+
+    lexer->tokens[i].num_int = num;
+    return 1;
+}
+
+static int try_float(lexer_t lexer)
+{
+    errno = 0; // reset so a strtof() range error can be detected
+
+    char *endptr;
+    float num = strtof(lexer->iden, &endptr);
+
+    if(*endptr != '\0') { // the whole string isn't a number
+        return 0;
+    }
+
+    if(errno == ERANGE) {
+        warn("Given float literal %s is outside the possible range", lexer->iden);
+    }
+
+    int i = save_empty_token(lexer, TOKEN_LITERAL_NUM_FLOAT);
+    if(i < 0) {
err("save_empty_token: failed"); + return -1; + } + + lexer->tokens[i].num_float = num; + return 1; +} + +static int try_symbol(lexer_t lexer) +{ + int i = save_empty_token(lexer, TOKEN_SYMBOL); + if(i < 0) { + err("save_empty_token: failed"); + return -1; + } + + lexer->tokens[i].symbol = malloc(lexer->iden_sz+1); + if(!lexer->tokens[i].symbol) { + err("malloc: %s", strerror(errno)); + return -1; + } + + memcpy(lexer->tokens[i].symbol, lexer->iden, lexer->iden_sz+1); + return 1; +} + +// ------------------------------------------------- // + +#ifdef DEBUG +#define CASE_PRINT(type, data, ...) case type: info("\t" __VA_ARGS__); break; + +void lexer_print_tokens(lexer_t lexer) +{ + for(size_t i = 0; i < lexer->ntokens; i++) { + struct token *token = &lexer->tokens[i]; + + info("Token %zu: %d", i, token->type); + + switch(token->type) { + TOKEN_TYPES_INFO(CASE_PRINT, token); + default: break; + } + } +} +#endif -- cgit v1.2.3