aboutsummaryrefslogtreecommitdiff
path: root/src/lexer.c
diff options
context:
space:
mode:
authorkartofen <mladenovnasko0@gmail.com>2023-06-17 23:42:31 +0300
committerkartofen <mladenovnasko0@gmail.com>2023-06-17 23:42:31 +0300
commitbcac686c1bf6a5c1dec2324269e2766babdc0fde (patch)
tree6483461015705efa8290a1ab05482a641739c1dd /src/lexer.c
lexer - done
Diffstat (limited to 'src/lexer.c')
-rw-r--r--src/lexer.c330
1 files changed, 330 insertions, 0 deletions
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..1acfd6d
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,330 @@
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+
+// TODO: handle escaped quotes
+#include "common.h"
+#include "lexer.h"
+
// Saves a token that carries no payload data.
// Returns the index of the saved token; < 0 on failure (capacity reached).
static int save_empty_token(lexer_t lexer, enum token_enum type);

// Saves a token whose data is the current identifier buffer (lexer->iden),
// classifying it via the try_* functions below, then resets the buffer.
// Returns 0 on success.
static int save_current_identifier(lexer_t lexer);

// Callbacks for characters that separate tokens. type is optional
// (pass TOKEN_TOKENS for "no extra token").
// Return 0 on success (character consumed), < 0 on failure, and > 0 to
// skip the callback and append the character to the identifier instead.
static int on_generic_separator(lexer_t lexer, enum token_enum type);
static int on_quote(lexer_t lexer);
static int on_dot(lexer_t lexer);

// Try to convert the identifier (lexer->iden) to a given token type.
// Return > 0 on success, 0 on failure (iden isn't the given type),
// and < 0 on error.
static int try_str(lexer_t lexer);
static int try_int(lexer_t lexer);
static int try_float(lexer_t lexer);
static int try_symbol(lexer_t lexer);
+
// X-macro table mapping separator characters to their handler calls.
// X(test, callback): test is either EQ(ch) (equality against the current
// character) or FN(f) (a character-classification function like isspace).
#define SEPARATOR_CALLBACK_TBL(X, lexer) \
    X(EQ('('),    on_generic_separator(lexer, TOKEN_PARENTHS_OPEN)) \
    X(EQ(')'),    on_generic_separator(lexer, TOKEN_PARENTHS_CLOSE)) \
    X(EQ('\''),   on_generic_separator(lexer, TOKEN_SPECIAL_QUOTE)) \
    X(EQ('.'),    on_dot(lexer)) \
    X(EQ('"'),    on_quote(lexer)) \
    X(FN(isspace), on_generic_separator(lexer, TOKEN_TOKENS))

// X-macro list of classifiers tried in order when an identifier is
// flushed; the first one to return non-zero wins (try_symbol is the
// catch-all, so it goes last).
#define IDENTIFY_IDENTIFIER_LIST(X) \
    X(try_str) \
    X(try_int) \
    X(try_float) \
    X(try_symbol)

// X(token type, what to free, how to print on screen)
#define TOKEN_TYPES_INFO(X, token) \
    X(TOKEN_PARENTHS_OPEN,      NULL,          "'('")  \
    X(TOKEN_PARENTHS_CLOSE,     NULL,          "')'")  \
    X(TOKEN_LITERAL_STRING,     token->string, "%s", token->string)    \
    X(TOKEN_LITERAL_NUM_INT,    NULL,          "%ld", token->num_int)  \
    X(TOKEN_LITERAL_NUM_FLOAT,  NULL,          "%f", token->num_float) \
    X(TOKEN_SYMBOL,             token->symbol, "%s", token->symbol)    \

// EQ('x') expands so that EQ('x')(str[i]) becomes 'x' == (str[i]);
// FN(f) leaves the function name to be applied directly.
#define EQ(ch) ch ==
#define FN(f) f

// Makes an if-else chain testing the current character against the
// separator callback table. A callback returning 0 consumes the
// character (continue), < 0 aborts tokenizing, and > 0 falls through
// so the character is appended to the identifier buffer.
#define CHECK_SEPERATOR_AND_CALLBACK(test_func, callback)  \
    if(test_func(str[i])) {                                \
        callback_ret = callback;                           \
        if(callback_ret == 0) {                            \
            continue;                                      \
        } else if(callback_ret < 0) {                      \
            err(#callback ": failed");                     \
            return 1;                                      \
        }                                                  \
    } else
+
+int lexer_tokenize(lexer_t lexer, char *str, size_t len)
+{
+ int callback_ret = 0;
+
+ for(size_t i = 0; i < len; i++)
+ {
+ SEPARATOR_CALLBACK_TBL(CHECK_SEPERATOR_AND_CALLBACK, lexer) {}
+
+ if(lexer->iden_sz >= LEXER_IDEN_CAP - 1) { // -1 to be null-terminated
+ err("LEXER_IDEN_CAP of %ld reached", lexer->iden_sz);
+ return 1;
+ }
+
+ // add charater to identifier
+ lexer->iden[lexer->iden_sz++] = str[i];
+ }
+
+ return 0;
+}
+
+lexer_t lexer_create(size_t tokens_cap)
+{
+ lexer_t lexer = malloc(sizeof(struct lexer));
+ if(!lexer) {
+ err("malloc: %s", strerror(errno));
+ goto fail;
+ }
+
+ lexer->tokens = calloc(tokens_cap, sizeof(struct token));
+ if(!lexer->tokens) {
+ err("malloc %s", strerror(errno));
+ goto fail;
+ }
+
+ for(size_t i = 0; i < tokens_cap; i++) {
+ lexer->tokens[i].symbol = NULL;
+ }
+
+ lexer->tokens_cap = tokens_cap;
+ lexer->ntokens = 0;
+
+ memset(lexer->iden, 0, LEXER_IDEN_CAP);
+ lexer->iden_sz = 0;
+
+ lexer->inside_string = 0;
+
+ return lexer;
+fail:
+ lexer_destroy(lexer);
+ return NULL;
+}
+
+#define CASE_FREE_TOKEN(type, data, ...) \
+ case type: if(data != NULL) { free(data); } break;
+
+void lexer_destroy(lexer_t lexer)
+{
+ if(!lexer) return;
+
+ if(lexer->tokens) {
+ for(size_t i = 0; i < lexer->ntokens; i++) {
+ struct token *token = &lexer->tokens[i];
+ switch(token->type) {
+ TOKEN_TYPES_INFO(CASE_FREE_TOKEN, token)
+ default: break;
+ }
+ }
+ free(lexer->tokens);
+ }
+
+ free(lexer);
+}
+
+// ------------------------------------------------- //
+
+static int on_quote(lexer_t lexer)
+{
+ int ret = on_generic_separator(lexer, TOKEN_TOKENS);
+ if(ret == 0) {
+ lexer->inside_string = 1;
+ return ret;
+ } else if(ret > 0) {
+ lexer->inside_string = 0;
+ return 0;
+ }
+
+ return ret;
+}
+
+static int on_dot(lexer_t lexer)
+{
+ if(lexer->iden_sz != 0) return 1;
+ on_generic_separator(lexer, TOKEN_SPECIAL_DOT);
+}
+
+static int on_generic_separator(lexer_t lexer, enum token_enum type)
+{
+ if(lexer->inside_string) {
+ return 1;
+ }
+
+ if(save_current_identifier(lexer)) {
+ err("save_current_identifier: failed");
+ return -1;
+ }
+
+ if(type != TOKEN_TOKENS) {
+ if(save_empty_token(lexer, type) < 0) {
+ err("save_empty_token: failed");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int save_empty_token(lexer_t lexer, enum token_enum type)
+{
+ if(lexer->ntokens >= lexer->tokens_cap) {
+ err("tokens_cap of %ld has been reached", lexer->tokens_cap);
+ return -1;
+ }
+
+ lexer->tokens[lexer->ntokens++].type = type;
+ return lexer->ntokens - 1;
+}
+
// X-macro step: run one classifier. A 0 return (not this type) takes
// the dangling else and falls to the next classifier; < 0 aborts;
// > 0 (claimed) skips the rest of the chain.
#define CHECK_IDEN(func)            \
    if((ret = func(lexer))) {       \
        if(ret < 0) {               \
            err(#func ": failed");  \
            goto exit;              \
        }                           \
    } else

// Flushes the identifier buffer: runs the try_* classifiers in order
// (try_symbol is the catch-all) and resets the buffer regardless of
// outcome. Returns 0 on success, non-zero when a classifier errored.
static int save_current_identifier(lexer_t lexer)
{
    int ret = 1;

    // empty buffer: nothing to classify, still report success
    if(lexer->iden_sz != 0) {
        IDENTIFY_IDENTIFIER_LIST(CHECK_IDEN) {}
    }

    // reached on any non-error path; a classifier's > 0 "claimed"
    // result also lands here and is normalized to 0 (success)
    ret = 0;
exit:
    // always clear the buffer so the next token starts fresh
    memset(lexer->iden, 0, lexer->iden_sz);
    lexer->iden_sz = 0;
    return ret;
}
+
+
+// ------------------------------------------------- //
+
+static int try_str(lexer_t lexer)
+{
+ if(!lexer->inside_string) return 0;
+
+ int i = save_empty_token(lexer, TOKEN_LITERAL_STRING);
+ if(i < 0) {
+ err("save_empty_token: failed");
+ return -1;
+ }
+
+ lexer->tokens[i].string = malloc(lexer->iden_sz+1);
+ if(!lexer->tokens[i].string) {
+ err("malloc: %s", strerror(errno));
+ return -1;
+ }
+
+ memcpy(lexer->tokens[i].string, lexer->iden, lexer->iden_sz+1);
+ return 1;
+}
+
+static int try_int(lexer_t lexer)
+{
+ errno = ERANGE + 1; // set errno to not ERANGE
+
+ char *endptr;
+ long num = strtol(lexer->iden, &endptr, 10);
+
+ if(*endptr != '\0') { // the whole string isn't a number
+ return 0;
+ }
+
+ if(errno == ERANGE) {
+ warn("Given integer literal %s is outside the possible range", lexer->iden);
+ }
+
+ int i = save_empty_token(lexer, TOKEN_LITERAL_NUM_INT);
+ if(i < 0) {
+ err("save_empty_token: failed");
+ return -1;
+ }
+
+ lexer->tokens[i].num_int = num;
+ return 1;
+}
+
+static int try_float(lexer_t lexer)
+{
+ errno = ERANGE + 1; // set errno to not ERANGE
+
+ char *endptr;
+ float num = strtof(lexer->iden, &endptr);
+
+ if(*endptr != '\0') { // the whole string isn't a number
+ return 0;
+ }
+
+ if(errno == ERANGE) {
+ warn("Given float literal %s is outside the possible range", lexer->iden);
+ }
+
+ int i = save_empty_token(lexer, TOKEN_LITERAL_NUM_FLOAT);
+ if(i < 0) {
+ err("save_empty_token: failed");
+ return -1;
+ }
+
+ lexer->tokens[i].num_float = num;
+ return 1;
+}
+
+static int try_symbol(lexer_t lexer)
+{
+ int i = save_empty_token(lexer, TOKEN_SYMBOL);
+ if(i < 0) {
+ err("save_empty_token: failed");
+ return -1;
+ }
+
+ lexer->tokens[i].symbol = malloc(lexer->iden_sz+1);
+ if(!lexer->tokens[i].symbol) {
+ err("malloc: %s", strerror(errno));
+ return -1;
+ }
+
+ memcpy(lexer->tokens[i].symbol, lexer->iden, lexer->iden_sz+1);
+ return 1;
+}
+
+// ------------------------------------------------- //
+
#ifdef DEBUG
// X-macro case: print a token's payload using the format string and
// arguments from TOKEN_TYPES_INFO's "how to print" columns.
#define CASE_PRINT(type, data, ...) case type: info("\t" __VA_ARGS__); break;

// Debug helper: dumps every stored token's index, numeric type, and
// payload. Compiled only when DEBUG is defined.
void lexer_print_tokens(lexer_t lexer)
{
    for(size_t i = 0; i < lexer->ntokens; i++) {
        struct token *token = &lexer->tokens[i];

        info("Token %zu: %d", i, token->type);

        switch(token->type) {
            TOKEN_TYPES_INFO(CASE_PRINT, token);
            default: break;
        }
    }
}
#endif