diff options
author | kartofen <mladenovnasko0@gmail.com> | 2023-06-17 23:42:31 +0300 |
---|---|---|
committer | kartofen <mladenovnasko0@gmail.com> | 2023-06-17 23:42:31 +0300 |
commit | bcac686c1bf6a5c1dec2324269e2766babdc0fde (patch) | |
tree | 6483461015705efa8290a1ab05482a641739c1dd |
lexer - done
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Makefile | 48 | ||||
-rw-r--r-- | files/test1.lisp | 3 | ||||
-rw-r--r-- | src/ast.h | 38 | ||||
-rw-r--r-- | src/common.h | 20 | ||||
-rw-r--r-- | src/eval.h | 22 | ||||
-rw-r--r-- | src/lexer.c | 330 | ||||
-rw-r--r-- | src/lexer.h | 55 | ||||
-rw-r--r-- | src/main.c | 77 |
9 files changed, 595 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cbbd0b5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +bin/ +obj/
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0cbcaeb
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,48 @@
+CC := gcc
+
+ifeq ($(PROD),1)
+CFLAGS := -std=c99 -O2 # production flags
+else
+CFLAGS := -std=c99 -Wall -Wextra -Wpedantic -g -DDEBUG # debug flags
+endif
+
+SRCD := src
+OBJD := obj
+BIND := bin
+TESTD := tests
+
+FILES = $(shell find $(SRCD)/ -type f 2> /dev/null)
+CSRCS = $(filter %.c, $(FILES))
+COBJS = $(CSRCS:$(SRCD)/%.c=$(OBJD)/%.o)
+
+CDEPS = $(COBJS:%.o=%.d)
+-include $(CDEPS)
+
+# FIX: analyze and leak are recipe-only targets too; without .PHONY a
+# stray file named "analyze" or "leak" would silently disable them
+.PHONY: all clean lispy analyze leak
+
+all: lispy
+lispy: $(BIND)/lispy
+
+clean:
+	rm -rf $(BIND)
+	rm -rf $(OBJD)
+
+$(OBJD)/%.o: $(SRCD)/%.c
+	mkdir -p $(dir $@)
+	$(CC) $(CFLAGS) -MMD -MF $(@:%.o=%.d) -c $< -o $@
+
+$(BIND)/%: $(COBJS)
+	mkdir -p $(dir $@)
+	$(CC) $(CFLAGS) $^ -o $@
+
+analyze: clean
+	scan-build \
+	-enable-checker alpha \
+	-enable-checker core \
+	-enable-checker deadcode \
+	-enable-checker security \
+	-enable-checker unix \
+	make
+
+leak: lispy
+	valgrind -s --leak-check=full $(BIND)/lispy
diff --git a/files/test1.lisp b/files/test1.lisp
new file mode 100644
index 0000000..f0d6ffe
--- /dev/null
+++ b/files/test1.lisp
@@ -0,0 +1,3 @@
+(define 'a 1)
+(+ a 1)
+(+ a 0.1)
diff --git a/src/ast.h b/src/ast.h
new file mode 100644
index 0000000..bd2e628
--- /dev/null
+++ b/src/ast.h
@@ -0,0 +1,38 @@
+#ifndef AST_H
+#define AST_H
+
+#include <stddef.h> /* size_t, used by nchildren below */
+
+#include "lexer.h"
+
+// Abstract-syntax-tree handle.
+// FIX: this previously read "typedef struct node_t *ast_t;" — struct
+// node_t is never declared anywhere, so ast_t was incompatible with the
+// struct ast_node the parser actually builds.
+typedef struct ast_node *ast_t;
+struct ast_node {
+    enum {
+        NODE_SEXP,
+        NODE_SYMBOL,
+        NODE_LITERAL,
+    } type;
+
+    union {
+        // an s-expression: a list of child nodes
+        struct sexp {
+            struct ast_node **children;
+            size_t nchildren;
+        } sexp;
+
+        char *symbol;
+
+        // FIX: "literal" was itself a union, so the "type" tag shared
+        // storage with the value and writing one clobbered the other;
+        // a struct keeps the tag alongside the value.
+        struct {
+            enum {
+                NODE_LITERAL_NUM,
+                NODE_LITERAL_STR,
+            } type;
+
+            union {
+                int number;
+                char *string;
+            };
+        } literal;
+    };
+};
+
+ast_t ast_create(void);
+void ast_destroy(ast_t ast);
+int ast_parse_lexer(ast_t ast, lexer_t lex);
+
+#endif
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..a1daa03
--- /dev/null
+++ b/src/common.h
@@ -0,0
+1,20 @@ +#ifndef COMMON_H +#define COMMON_H + +#include <stdio.h> + +#define __RED__ "\033[0;31m" +#define __GREEN__ "\033[0;32m" +#define __YELLOW__ "\033[0;33m" +#define __RESET__ "\033[0m" + +#define STR(x) #x +#define XSTR(x) STR(x) + +#define info(...) do { fprintf(stdout, __GREEN__"[INFO]"__RESET__" "__VA_ARGS__); fprintf(stdout, "\n"); } while(0) +#define err(...) do { fprintf(stderr, __RED__"[ERROR]"__RESET__" "__FILE__":"XSTR(__LINE__)": "__VA_ARGS__); fprintf(stderr, "\n"); }while(0) +#define warn(...) do { fprintf(stderr, __YELLOW__"[WARN]"__RESET__" "__FILE__":"XSTR(__LINE__)": "__VA_ARGS__); fprintf(stderr, "\n"); }while(0) +// #define info(...) printf(__VA_ARGS__); +// #define err(...) printf(__VA_ARGS__); + +#endif diff --git a/src/eval.h b/src/eval.h new file mode 100644 index 0000000..01ed827 --- /dev/null +++ b/src/eval.h @@ -0,0 +1,22 @@ +#ifndef EVAL_H +#define EVAL_H + +#include "ast.h" + +typedef struct eval *eval_t; + +// RunTime Tree +struct rtt { + +}; + +struct eval { + struct rtt *root; +}; + +// TODO: add options for the evaluation +eval_t evaluator_create(); +void evaluator_destroy(eval_t evaluator); +int evaluator_eval_ast(eval_t evaluator, ast_t ast) + +#endif diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..1acfd6d --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,330 @@ +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <errno.h> + +// TODO: handle escaped quotes +#include "common.h" +#include "lexer.h" + +// saves a token with no data +// returns the index of the saved token; < 0 on fail +static int save_empty_token(lexer_t lexer, enum token_enum type); + +// saves a token with data which is the current identifier (lexer->iden) +// returns 0 on success +static int save_current_identifier(lexer_t lexer); + +// used for tokens that separate things, type is optional (TOKEN_TOKENS for default) +// returns 0 on success, < 0 on fail, and > 0 to skip the token (add it in iden) +static int 
on_generic_separator(lexer_t lexer, enum token_enum type); +static int on_quote(lexer_t lexer); +static int on_dot(lexer_t lexer); + +// try to convert the identifier (lexer->iden) to a given type +// returns > 0 on sucess, 0 on fail (iden isnt the given type), +// and < 0 on error +static int try_str(lexer_t lexer); +static int try_int(lexer_t lexer); +static int try_float(lexer_t lexer); +static int try_symbol(lexer_t lexer); + +#define SEPARATOR_CALLBACK_TBL(X, lexer) \ + X(EQ('('), on_generic_separator(lexer, TOKEN_PARENTHS_OPEN)) \ + X(EQ(')'), on_generic_separator(lexer, TOKEN_PARENTHS_CLOSE)) \ + X(EQ('\''), on_generic_separator(lexer, TOKEN_SPECIAL_QUOTE)) \ + X(EQ('.'), on_dot(lexer)) \ + X(EQ('"'), on_quote(lexer)) \ + X(FN(isspace), on_generic_separator(lexer, TOKEN_TOKENS)) + +#define IDENTIFY_IDENTIFIER_LIST(X) \ + X(try_str) \ + X(try_int) \ + X(try_float) \ + X(try_symbol) + +// X(token type, what to free, how to print on screen) +#define TOKEN_TYPES_INFO(X, token) \ + X(TOKEN_PARENTHS_OPEN, NULL, "'('") \ + X(TOKEN_PARENTHS_CLOSE, NULL, "')'") \ + X(TOKEN_LITERAL_STRING, token->string, "%s", token->string) \ + X(TOKEN_LITERAL_NUM_INT, NULL, "%ld", token->num_int) \ + X(TOKEN_LITERAL_NUM_FLOAT, NULL, "%f", token->num_float) \ + X(TOKEN_SYMBOL, token->symbol, "%s", token->symbol) \ + +#define EQ(ch) ch == +#define FN(f) f + +// makes an if-else chain to test the character +// agains the seperator callback table +#define CHECK_SEPERATOR_AND_CALLBACK(test_func, callback) \ + if(test_func(str[i])) { \ + callback_ret = callback; \ + if(callback_ret == 0) { \ + continue; \ + } else if(callback_ret < 0) { \ + err(#callback ": failed"); \ + return 1; \ + } \ + } else + +int lexer_tokenize(lexer_t lexer, char *str, size_t len) +{ + int callback_ret = 0; + + for(size_t i = 0; i < len; i++) + { + SEPARATOR_CALLBACK_TBL(CHECK_SEPERATOR_AND_CALLBACK, lexer) {} + + if(lexer->iden_sz >= LEXER_IDEN_CAP - 1) { // -1 to be null-terminated + err("LEXER_IDEN_CAP of %ld 
reached", lexer->iden_sz); + return 1; + } + + // add charater to identifier + lexer->iden[lexer->iden_sz++] = str[i]; + } + + return 0; +} + +lexer_t lexer_create(size_t tokens_cap) +{ + lexer_t lexer = malloc(sizeof(struct lexer)); + if(!lexer) { + err("malloc: %s", strerror(errno)); + goto fail; + } + + lexer->tokens = calloc(tokens_cap, sizeof(struct token)); + if(!lexer->tokens) { + err("malloc %s", strerror(errno)); + goto fail; + } + + for(size_t i = 0; i < tokens_cap; i++) { + lexer->tokens[i].symbol = NULL; + } + + lexer->tokens_cap = tokens_cap; + lexer->ntokens = 0; + + memset(lexer->iden, 0, LEXER_IDEN_CAP); + lexer->iden_sz = 0; + + lexer->inside_string = 0; + + return lexer; +fail: + lexer_destroy(lexer); + return NULL; +} + +#define CASE_FREE_TOKEN(type, data, ...) \ + case type: if(data != NULL) { free(data); } break; + +void lexer_destroy(lexer_t lexer) +{ + if(!lexer) return; + + if(lexer->tokens) { + for(size_t i = 0; i < lexer->ntokens; i++) { + struct token *token = &lexer->tokens[i]; + switch(token->type) { + TOKEN_TYPES_INFO(CASE_FREE_TOKEN, token) + default: break; + } + } + free(lexer->tokens); + } + + free(lexer); +} + +// ------------------------------------------------- // + +static int on_quote(lexer_t lexer) +{ + int ret = on_generic_separator(lexer, TOKEN_TOKENS); + if(ret == 0) { + lexer->inside_string = 1; + return ret; + } else if(ret > 0) { + lexer->inside_string = 0; + return 0; + } + + return ret; +} + +static int on_dot(lexer_t lexer) +{ + if(lexer->iden_sz != 0) return 1; + on_generic_separator(lexer, TOKEN_SPECIAL_DOT); +} + +static int on_generic_separator(lexer_t lexer, enum token_enum type) +{ + if(lexer->inside_string) { + return 1; + } + + if(save_current_identifier(lexer)) { + err("save_current_identifier: failed"); + return -1; + } + + if(type != TOKEN_TOKENS) { + if(save_empty_token(lexer, type) < 0) { + err("save_empty_token: failed"); + return -1; + } + } + + return 0; +} + +static int save_empty_token(lexer_t 
lexer, enum token_enum type) +{ + if(lexer->ntokens >= lexer->tokens_cap) { + err("tokens_cap of %ld has been reached", lexer->tokens_cap); + return -1; + } + + lexer->tokens[lexer->ntokens++].type = type; + return lexer->ntokens - 1; +} + +#define CHECK_IDEN(func) \ + if((ret = func(lexer))) { \ + if(ret < 0) { \ + err(#func ": failed"); \ + goto exit; \ + } \ + } else + +static int save_current_identifier(lexer_t lexer) +{ + int ret = 1; + + if(lexer->iden_sz != 0) { + IDENTIFY_IDENTIFIER_LIST(CHECK_IDEN) {} + } + + ret = 0; +exit: + memset(lexer->iden, 0, lexer->iden_sz); + lexer->iden_sz = 0; + return ret; +} + + +// ------------------------------------------------- // + +static int try_str(lexer_t lexer) +{ + if(!lexer->inside_string) return 0; + + int i = save_empty_token(lexer, TOKEN_LITERAL_STRING); + if(i < 0) { + err("save_empty_token: failed"); + return -1; + } + + lexer->tokens[i].string = malloc(lexer->iden_sz+1); + if(!lexer->tokens[i].string) { + err("malloc: %s", strerror(errno)); + return -1; + } + + memcpy(lexer->tokens[i].string, lexer->iden, lexer->iden_sz+1); + return 1; +} + +static int try_int(lexer_t lexer) +{ + errno = ERANGE + 1; // set errno to not ERANGE + + char *endptr; + long num = strtol(lexer->iden, &endptr, 10); + + if(*endptr != '\0') { // the whole string isn't a number + return 0; + } + + if(errno == ERANGE) { + warn("Given integer literal %s is outside the possible range", lexer->iden); + } + + int i = save_empty_token(lexer, TOKEN_LITERAL_NUM_INT); + if(i < 0) { + err("save_empty_token: failed"); + return -1; + } + + lexer->tokens[i].num_int = num; + return 1; +} + +static int try_float(lexer_t lexer) +{ + errno = ERANGE + 1; // set errno to not ERANGE + + char *endptr; + float num = strtof(lexer->iden, &endptr); + + if(*endptr != '\0') { // the whole string isn't a number + return 0; + } + + if(errno == ERANGE) { + warn("Given float literal %s is outside the possible range", lexer->iden); + } + + int i = 
save_empty_token(lexer, TOKEN_LITERAL_NUM_FLOAT); + if(i < 0) { + err("save_empty_token: failed"); + return -1; + } + + lexer->tokens[i].num_float = num; + return 1; +} + +static int try_symbol(lexer_t lexer) +{ + int i = save_empty_token(lexer, TOKEN_SYMBOL); + if(i < 0) { + err("save_empty_token: failed"); + return -1; + } + + lexer->tokens[i].symbol = malloc(lexer->iden_sz+1); + if(!lexer->tokens[i].symbol) { + err("malloc: %s", strerror(errno)); + return -1; + } + + memcpy(lexer->tokens[i].symbol, lexer->iden, lexer->iden_sz+1); + return 1; +} + +// ------------------------------------------------- // + +#ifdef DEBUG +#define CASE_PRINT(type, data, ...) case type: info("\t" __VA_ARGS__); break; + +void lexer_print_tokens(lexer_t lexer) +{ + for(size_t i = 0; i < lexer->ntokens; i++) { + struct token *token = &lexer->tokens[i]; + + info("Token %zu: %d", i, token->type); + + switch(token->type) { + TOKEN_TYPES_INFO(CASE_PRINT, token); + default: break; + } + } +} +#endif diff --git a/src/lexer.h b/src/lexer.h new file mode 100644 index 0000000..942be54 --- /dev/null +++ b/src/lexer.h @@ -0,0 +1,55 @@ +#ifndef LEXER_H +#define LEXER_H + +#ifndef LEXER_IDEN_CAP +#define LEXER_IDEN_CAP 512 +#endif + +typedef struct lexer *lexer_t; + +struct token { + enum token_enum { + TOKEN_PARENTHS_OPEN, TOKEN_PARENTHS_CLOSE, + TOKEN_SPECIAL_DOT, TOKEN_SPECIAL_QUOTE, + TOKEN_LITERAL_NUM_INT, TOKEN_LITERAL_STRING, + TOKEN_LITERAL_NUM_FLOAT, TOKEN_SYMBOL, + TOKEN_TOKENS // number of token types + } type; + + union { + char *symbol; + char *string; + long num_int; + float num_float + }; +}; + +struct lexer { + struct token *tokens; + size_t tokens_cap; + size_t ntokens; + + // identifier + char iden[LEXER_IDEN_CAP]; + size_t iden_sz; + + int inside_string; +}; + +// allocate a lexer with a maximum number of tokens_cap tokens +// returns a lexer on success, NULL on fail +lexer_t lexer_create(size_t tokens_cap); + +// destroy a lexer +void lexer_destroy(lexer_t lexer); + +// turn the 
given non-null-terminated string str of lenght len +// into into tokens +// returns 0 on success +int lexer_tokenize(lexer_t lexer, char *str, size_t len); + +#ifdef DEBUG +void lexer_print_tokens(lexer_t lexer); +#endif + +#endif diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..992e594 --- /dev/null +++ b/src/main.c @@ -0,0 +1,77 @@ +#include <stdio.h> +#include <string.h> +#include <errno.h> + +#include "common.h" +#include "lexer.h" +// #include "ast.h" +// #include "eval.h" + +#define READ_BUF_CAP 512 +#define DEFAULT_TOKENS_CAP 8192 // make it a command line arg + +lexer_t lexer = NULL; +// ast_t root = NULL; +// eval_t evaluator = NULL; + +int main(void) +{ + int ret = 1; + + char *filename = "files/test1.lisp"; + + lexer = lexer_create(DEFAULT_TOKENS_CAP); + if(!lexer) { + err("lexer_create: failed"); + goto fail; + } + + // tokenize input + FILE *fp = fopen(filename, "r"); + if(!fp) { + err("fopen: %s: %s", filename, strerror(errno)); + goto fail; + } + + char buf[READ_BUF_CAP]; size_t bytes = 0; + while((bytes = fread(buf, sizeof(char), READ_BUF_CAP, fp))) { + if(lexer_tokenize(lexer, buf, bytes)) { + fclose(fp); goto fail; + } + + if(bytes < READ_BUF_CAP) break; + } + + fclose(fp); + lexer_print_tokens(lexer); + // -------------- + +// ast = ast_create(); +// if(!ast) { +// err("ast_create: failed"); +// goto fail; +// } + +// if(ast_parse_lexer(ast, lexer)) { +// err("ast_parse_lexer: failed"); +// goto fail; +// } + +// evaluator = evaluator_create(); +// if(!evaluator) { +// err("evaluator_create: failed"); +// goto fail; +// } + +// if(evaluator_eval_ast(evaluator, ast)) { +// err("evaluator_eval_ast: failed"); +// goto fail; +// } + + ret = 0; +fail: +// evaluator_destroy(eval); +// ast_destroy(ast); + lexer_destroy(lexer); + return ret; +} |