commit 0fd66745c872749b992df0c9452af9543cb873da Author: sam Date: Fri Nov 15 19:37:01 2024 +1300 first commit diff --git a/.cache/clangd/index/codegen.h.B28082C1C1D042BA.idx b/.cache/clangd/index/codegen.h.B28082C1C1D042BA.idx new file mode 100644 index 0000000..83e81db Binary files /dev/null and b/.cache/clangd/index/codegen.h.B28082C1C1D042BA.idx differ diff --git a/.cache/clangd/index/compiler.h.D712E1B2AB94B380.idx b/.cache/clangd/index/compiler.h.D712E1B2AB94B380.idx new file mode 100644 index 0000000..1b46bdb Binary files /dev/null and b/.cache/clangd/index/compiler.h.D712E1B2AB94B380.idx differ diff --git a/.cache/clangd/index/helpers.h.F9575AB77341F585.idx b/.cache/clangd/index/helpers.h.F9575AB77341F585.idx new file mode 100644 index 0000000..85f6de1 Binary files /dev/null and b/.cache/clangd/index/helpers.h.F9575AB77341F585.idx differ diff --git a/.cache/clangd/index/lexer.h.4F141419C0AC5007.idx b/.cache/clangd/index/lexer.h.4F141419C0AC5007.idx new file mode 100644 index 0000000..9d36e6f Binary files /dev/null and b/.cache/clangd/index/lexer.h.4F141419C0AC5007.idx differ diff --git a/.cache/clangd/index/main.c.2393E5B60B02EEC0.idx b/.cache/clangd/index/main.c.2393E5B60B02EEC0.idx new file mode 100644 index 0000000..934b9c5 Binary files /dev/null and b/.cache/clangd/index/main.c.2393E5B60B02EEC0.idx differ diff --git a/.cache/clangd/index/node.h.714EF12F6C2AA1A8.idx b/.cache/clangd/index/node.h.714EF12F6C2AA1A8.idx new file mode 100644 index 0000000..2af2f28 Binary files /dev/null and b/.cache/clangd/index/node.h.714EF12F6C2AA1A8.idx differ diff --git a/.cache/clangd/index/parser.h.AB8FD292E884B1D7.idx b/.cache/clangd/index/parser.h.AB8FD292E884B1D7.idx new file mode 100644 index 0000000..d837d0a Binary files /dev/null and b/.cache/clangd/index/parser.h.AB8FD292E884B1D7.idx differ diff --git a/.cache/clangd/index/regex_helpers.h.468BB76668FECF71.idx b/.cache/clangd/index/regex_helpers.h.468BB76668FECF71.idx new file mode 100644 index 0000000..05f53b7 
Binary files /dev/null and b/.cache/clangd/index/regex_helpers.h.468BB76668FECF71.idx differ diff --git a/.cache/clangd/index/token.h.06BA855CD31E4C24.idx b/.cache/clangd/index/token.h.06BA855CD31E4C24.idx new file mode 100644 index 0000000..cb65ff2 Binary files /dev/null and b/.cache/clangd/index/token.h.06BA855CD31E4C24.idx differ diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..c17e64a --- /dev/null +++ b/.clang-format @@ -0,0 +1,25 @@ +BasedOnStyle: WebKit +IndentWidth: 4 +TabWidth: 4 +UseTab: Never +AlignConsecutiveDeclarations: false +AlignConsecutiveAssignments: false +AlignTrailingComments: true +ColumnLimit: 105 +BreakBeforeBraces: Attach +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AllowShortFunctionsOnASingleLine: false +AllowShortLambdasOnASingleLine: false +PointerAlignment: Left +SpaceBeforeParens: Never +SpacesInParentheses: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: false +SpaceAfterCStyleCast: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeSquareBrackets: false +SpacesBeforeTrailingComments: 2 +PenaltyBreakAssignment: 1000 +NamespaceIndentation: All + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1521057 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +# Xmake cache +.xmake/ +build/ + +# MacOS Cache +.DS_Store + + diff --git a/compile_commands.json b/compile_commands.json new file mode 100644 index 0000000..9b90e80 --- /dev/null +++ b/compile_commands.json @@ -0,0 +1,6 @@ +[ +{ + "directory": "/home/sam/Documents/Projects/compiler-c", + "arguments": ["/usr/bin/gcc", "-c", "-fvisibility=hidden", "-O3", "-I", "/home/sam/.xmake/packages/s/stc/v4.2/bfec6d3335d54b48969cc50946a9b5ac/include", "-DNDEBUG", "-o", "build/.objs/compiler-c/linux/arm64/release/src/main.c.o", "src/main.c"], + "file": "src/main.c" +}] diff --git a/src/codegen.c b/src/codegen.c new file mode 100644 index 0000000..1ba2247 --- /dev/null +++ b/src/codegen.c 
@@ -0,0 +1,93 @@ +#include "codegen.h" +#include "helpers.h" +#include + +/* Dispatch on node.type to the matching codegen_* printer; any other type is reported through syntax_error. */ +void codegen(Node node) { + switch(node.type) { + case NODE_FUNCTION_CALL: + codegen_function_call(node); + break; + case NODE_FUNCTION_DECL: + codegen_function_decl(node); + break; + case NODE_FUNCTION_IMPL: + codegen_function_impl(node); + break; + case NODE_ARG_DECL: + codegen_arg_decl(node); + break; + case NODE_NUMBER: + codegen_number(node); + break; + default: + syntax_error("unexpected node %c", node.type); + } +} + +/* Print "FunctionCall(name[, arg, ...])" followed by a newline. */ +void codegen_function_call(Node node) { + printf("FunctionCall(%s", node.function_call.name); + if(Nodes_size(&node.function_call.args) > 0) + printf(", "); + for(size_t i = 0; i < Nodes_size(&node.function_call.args); i++) { + const Node* arg = Nodes_at(&node.function_call.args, i); + codegen(*arg); + + if(arg != Nodes_back(&node.function_call.args)) { + printf(", "); + } + } + printf(")\n"); +} + +/* Print "FunctionDecl(type, name[, arg, ...])" followed by a newline. */ +void codegen_function_decl(Node node) { + printf("FunctionDecl(%s, %s", node.function_decl.type, node.function_decl.name); + if(Nodes_size(&node.function_decl.args) > 0) /* no dangling ", " when the declaration has no arguments */ + printf(", "); + for(size_t i = 0; i < Nodes_size(&node.function_decl.args); i++) { + const Node* arg = Nodes_at(&node.function_decl.args, i); + codegen(*arg); + + if(arg != Nodes_back(&node.function_decl.args)) { + printf(", "); + } + } + printf(")\n"); +} + +/* Print "FunctionImpl(type, name[, arg, ...]) {", then every body node, then "}". */ +void codegen_function_impl(Node node) { + printf("FunctionImpl(%s, %s", node.function_impl.type, node.function_impl.name); + if(Nodes_size(&node.function_impl.args) > 0) + printf(", "); + for(size_t i = 0; i < Nodes_size(&node.function_impl.args); i++) { + const Node* arg = Nodes_at(&node.function_impl.args, i); + codegen(*arg); + + if(arg != Nodes_back(&node.function_impl.args)) { + printf(", "); + } + } + printf(") {\n\t"); + + for(size_t i = 0; i < Nodes_size(&node.function_impl.body); i++) { + const Node* n = Nodes_at(&node.function_impl.body, i); + codegen(*n); + + if(n != Nodes_back(&node.function_impl.body)) { + printf("\t"); + } + } + + printf("}\n"); +} + +void codegen_arg_decl(Node node) { +
printf("ArgDecl(%s, %s)", node.arg_decl.type, node.arg_decl.name); +} + +void codegen_number(Node node) { + printf("Number(%lld)", node.number.value); /* value is a signed long long int, so %lld rather than %llu */ +} diff --git a/src/codegen.h b/src/codegen.h new file mode 100644 index 0000000..d133082 --- /dev/null +++ b/src/codegen.h @@ -0,0 +1,13 @@ +#ifndef CODEGEN_H +#define CODEGEN_H + +#include "node.h" + +void codegen(Node node); +void codegen_function_call(Node node); +void codegen_function_decl(Node node); +void codegen_arg_decl(Node node); +void codegen_function_impl(Node node); +void codegen_number(Node node); + +#endif diff --git a/src/compiler.c b/src/compiler.c new file mode 100644 index 0000000..561a55c --- /dev/null +++ b/src/compiler.c @@ -0,0 +1,24 @@ +#include "compiler.h" + +/* Fetch the next non-whitespace token into *token, retagging identifiers found in compiler->types as TOKEN_TYPEIDENTIFIER; returns false when lexer_next yields no token. */ +bool next(Compiler* compiler, Token* token) { + if(!lexer_next(compiler->lexer, token)) { + return false; + } + + switch(token->type) { + case TOKEN_WHITESPACE: + return next(compiler, token); + case TOKEN_IDENTIFIER: + if(cmap_str_contains(&compiler->types, token->value)) { + token->type = TOKEN_TYPEIDENTIFIER; + } + break; + default: /* every other token type passes through unchanged */ + break; + } + + printf("tok: %c, val: %s\n", token->type, token->value); + + return true; +} diff --git a/src/compiler.h b/src/compiler.h new file mode 100644 index 0000000..d385f7b --- /dev/null +++ b/src/compiler.h @@ -0,0 +1,39 @@ +#ifndef COMPILER_H +#define COMPILER_H + +#include "lexer.h" + +typedef enum { + TOKEN_IDENTIFIER = 'I', + TOKEN_STRING = 'S', + TOKEN_NUMBER = 'N', + TOKEN_TYPEIDENTIFIER = 'T', + TOKEN_WHITESPACE = 'W', + TOKEN_OPAREN = '(', + TOKEN_CPAREN = ')', + TOKEN_OBRACE = '{', + TOKEN_CBRACE = '}', + TOKEN_SEMICOLON = ';', + TOKEN_COMMA = ',' +} TokenType; + +#include +#include + +#define i_key_str +#define i_val_str +#include + +#define i_type Tokens +#define i_val Token +#define i_opt c_no_cmp +#include + +typedef struct { + Lexer* lexer; + cmap_str types; +} Compiler; + +bool next(Compiler* compiler, Token* token); + +#endif diff --git a/src/helpers.c b/src/helpers.c new file mode 100644 index 
0000000..05f6f9c --- /dev/null +++ b/src/helpers.c @@ -0,0 +1,60 @@ +#include "helpers.h" +#include +#include + +/* If status is greater than REG_NOMATCH, print the regerror() text for it to stderr and exit. */ +void regex_error(const regex_t* regex, int status) { + if(status > REG_NOMATCH) { + char error_msg[100]; + regerror(status, regex, error_msg, sizeof(error_msg)); + fprintf(stderr, "Regex compilation failed: %s\n", error_msg); + exit(EXIT_FAILURE); + } +} + +/* Compile pattern as a POSIX extended regex; a regcomp() failure is handed to regex_error. */ +regex_t regex(const char* pattern) { + regex_t regex; + regex_error(&regex, regcomp(&regex, pattern, REG_EXTENDED)); + + return regex; +} + +/* Run regex against string, storing the overall match in *match; returns true when regexec() reports a match. */ +bool regex_search(regmatch_t* match, const regex_t* regex, const char* string) { + int status = regexec(regex, string, 1, match, 0); + regex_error(regex, status); + + return !status; +} + +/* Read the whole file into a malloc'd, NUL-terminated buffer; any failure prints a message and exits. */ +const char* read_file(const char* filename) { + FILE* file = fopen(filename, "r"); + if(file == NULL) { + perror("Failed to open file"); + exit(EXIT_FAILURE); + } + + long size = -1; /* ftell() reports failure as -1, so stay signed until it has been checked */ + if(fseek(file, 0, SEEK_END) == 0) { + size = ftell(file); + } + if(size < 0) { + perror("Failed to determine file size"); + exit(EXIT_FAILURE); + } + rewind(file); + + char* buffer = malloc((size_t)size + 1); + if(buffer == NULL) { + perror("Failed to allocate file buffer"); + exit(EXIT_FAILURE); + } + + size_t length = fread(buffer, 1, (size_t)size, file); /* number of bytes actually read */ + buffer[length] = '\0'; /* the lexer calls strlen() on this text, so it must be terminated */ + fclose(file); + + return buffer; +} diff --git a/src/helpers.h b/src/helpers.h new file mode 100644 index 0000000..08c0133 --- /dev/null +++ b/src/helpers.h @@ -0,0 +1,20 @@ +#ifndef HELPERS_H +#define HELPERS_H + +#include +#include + +#define syntax_error(...) 
\ + { \ + fprintf(stderr, "Syntax error: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(EXIT_FAILURE); \ + } + +void regex_error(const regex_t* regex, int status); +regex_t regex(const char* pattern); +bool regex_search(regmatch_t* match, const regex_t* regex, const char* string); + +const char* read_file(const char* filename); + +#endif diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..5a627df --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,51 @@ +#include "lexer.h" +#include "helpers.h" +#include +#include +#include +#include + +Lexer lexer_create(const TokenRule* rules, size_t num_rules) { + return (Lexer){ + .text = NULL, + .offset = 0, + .rules = rules, + .num_rules = num_rules, + }; +} + +void lexer_feed(Lexer* lexer, const char* text) { + assert(lexer != NULL); + lexer->text = text; +} + +bool lexer_next(Lexer* lexer, Token* token) { + assert(lexer != NULL); + assert(lexer->text != NULL); + + if(lexer->offset >= strlen(lexer->text)) { + return false; + } + + regmatch_t match; + for(int i = 0; i < lexer->num_rules; i++) { + TokenRule rule = lexer->rules[i]; + + if(regex_search(&match, &rule.regex, lexer->text + lexer->offset)) { + int length = match.rm_eo; + + char* slice = malloc(length); + strncpy(slice, lexer->text + lexer->offset, length); + slice[length] = '\0'; + + lexer->offset += length; + + token->type = rule.token_type; + token->value = slice; + return true; + } + } + + fprintf(stderr, "Unrecognized character: %c\n", *(lexer->text + lexer->offset)); + exit(EXIT_FAILURE); +} diff --git a/src/lexer.h b/src/lexer.h new file mode 100644 index 0000000..fec0d8f --- /dev/null +++ b/src/lexer.h @@ -0,0 +1,24 @@ +#ifndef LEXER_H +#define LEXER_H + +#include "token.h" +#include +#include + +typedef struct { + regex_t regex; + int token_type; +} TokenRule; + +typedef struct { + int offset; + const char* text; + const TokenRule* rules; + size_t num_rules; +} Lexer; + +Lexer lexer_create(const TokenRule* rules, size_t num_rules); +void 
lexer_feed(Lexer* lexer, const char* text); +bool lexer_next(Lexer* lexer, Token* token); + +#endif diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..a016488 --- /dev/null +++ b/src/main.c @@ -0,0 +1,56 @@ +#include "codegen.h" +#include "compiler.h" +#include "helpers.h" +#include "lexer.h" +#include "node.h" +#include "parser.h" +#include "token.h" +#include +#include +#include + +/* Lex the file named by argv[1], then parse and print one top-level node at a time until the tokens run out. */ +int main(int argc, char** argv) { + if(argc < 2) { /* explicit check: assert() is compiled out when NDEBUG is defined */ + fprintf(stderr, "usage: %s FILE\n", argc > 0 ? argv[0] : "compiler-c"); + return 1; + } + const char* text = read_file(argv[1]); + + TokenRule rules[] = { + { regex("^[A-Za-z][A-Za-z0-9]*"), TOKEN_IDENTIFIER }, + { regex("^[ \t\r\n]+"), TOKEN_WHITESPACE }, + { regex("^\"([^\"\\\\]|\\\\.)*\""), TOKEN_STRING }, /* POSIX ERE has no lazy quantifier, so escapes are matched explicitly */ + { regex("^[0-9]+"), TOKEN_NUMBER }, + { regex("^;"), TOKEN_SEMICOLON }, + { regex("^,"), TOKEN_COMMA }, + { regex("^\\("), TOKEN_OPAREN }, + { regex("^\\{"), TOKEN_OBRACE }, + { regex("^\\)"), TOKEN_CPAREN }, + { regex("^\\}"), TOKEN_CBRACE }, + }; + + Lexer lexer = lexer_create(rules, c_arraylen(rules)); + lexer_feed(&lexer, text); + + Compiler compiler = { + .lexer = &lexer, + .types = c_make(cmap_str, + { + { "i8", "b" }, + { "i16", "h" }, + { "i32", "w" }, + { "i64", "l" }, + { "void", "" }, + }), + }; + + Token token; + while(next(&compiler, &token)) { + // printf("%c: %s\n", token.type, token.value); + Node node = parse_token(&compiler, token); + codegen(node); + } + + return 0; +} diff --git a/src/node.h b/src/node.h new file mode 100644 index 0000000..d040b27 --- /dev/null +++ b/src/node.h @@ -0,0 +1,63 @@ +#ifndef NODE_H +#define NODE_H + +#include + +#include +typedef struct Node Node; +forward_cvec(Nodes, struct Node); + +typedef enum { + NODE_FUNCTION_CALL = 'C', + NODE_FUNCTION_DECL = 'D', + NODE_FUNCTION_IMPL = 'I', + NODE_ARG_DECL = 'A', + NODE_NUMBER = 'N', +} NodeType; + +typedef struct { + const char* name; + Nodes args; +} FunctionCall; + +typedef struct { + const char* type; + const char* name; + Nodes args; +} FunctionDecl; + +typedef struct { + const char* type; + const char* name; + Nodes args; + Nodes body; +} 
FunctionImpl; + +typedef struct { + const char* type; + const char* name; +} ArgDecl; + +typedef struct { + long long int value; +} Number; + +struct Nodes; +typedef struct Node { + NodeType type; + union { + FunctionCall function_call; + FunctionDecl function_decl; + FunctionImpl function_impl; + ArgDecl arg_decl; + Number number; + }; +} Node; + +#define i_type Nodes +#define i_is_forward +#define i_val Node +#define i_opt c_no_cmp +#include + +#endif diff --git a/src/parser.c b/src/parser.c new file mode 100644 index 0000000..df0eee6 --- /dev/null +++ b/src/parser.c @@ -0,0 +1,164 @@ +#include "parser.h" +#include "helpers.h" + +Node parse_token(Compiler* self, Token token) { + switch(token.type) { + case TOKEN_TYPEIDENTIFIER: + return parse_type(self, token.value); + case TOKEN_IDENTIFIER: + return parse_identifier(self, token.value); + case TOKEN_NUMBER: + return parse_number(self, token.value); + default: + syntax_error("unexpected token \"%s\"", token.value); + } +} + +Nodes parse_until(Compiler* self, TokenType end_token) { + Nodes collected = { 0 }; + + Token token; + while(next(self, &token)) { + if(token.type == end_token) { + break; + } + + Nodes_push(&collected, parse_token(self, token)); + } + + return collected; +} + +Node parse_type(Compiler* self, const char* type) { + printf("parse type %s\n", type); + Token token; + next(self, &token); + + switch(token.type) { + case TOKEN_IDENTIFIER: + return parse_type_ident_pair(self, type, token.value); + break; + default: + syntax_error("unexpected token \"%s\" after type \"%s\"", token.value, type); + } +} + +Node parse_identifier(Compiler* self, const char* name) { + Token token; + next(self, &token); + + switch(token.type) { + case TOKEN_OPAREN: + return parse_function_call(self, name); + break; + default: + syntax_error("unexpected token \"%s\" after identifier \"%s\"", token.value, name); + } +} + +Node parse_number(Compiler* self, const char* value) { + return (Node){ + .type = NODE_NUMBER, + .number = 
atoll(value), + }; +} + +Node parse_type_ident_pair(Compiler* self, const char* type, const char* name) { + printf("parse type ident pair %s %s\n", type, name); + Token token; + next(self, &token); + + switch(token.type) { + case TOKEN_OPAREN: + return parse_function(self, type, name); + default: + syntax_error("unexpected token \"%s\" after \"%s %s\"", token.value, type, name); + } +} + +Node parse_function(Compiler* self, const char* type, const char* name) { + printf("parse function %s %s\n", type, name); + Nodes args = { 0 }; + while(true) { + Token arg_type; + next(self, &arg_type); + if(arg_type.type == TOKEN_CPAREN) { + break; + } + + Token arg_name; + next(self, &arg_name); + + Nodes_push(&args, + (Node){ + .type = NODE_ARG_DECL, + .arg_decl = { .type = arg_type.value, .name = arg_name.value }, + }); + + Token token; + next(self, &token); + switch(token.type) { + case TOKEN_COMMA: + continue; + case TOKEN_CPAREN: + break; + default: + syntax_error("expected comma or closing parenthesis, found %s", token.value); + } + + break; // only reached if didnt continue or error + } + + Token token; + if(next(self, &token)) { + switch(token.type) { + case TOKEN_SEMICOLON: + return parse_function_decl(self, type, name, args); + case TOKEN_OBRACE: + return parse_function_impl(self, type, name, args); + default: + syntax_error("expected semicolon or opening brace found, %s", token.value); + } + } else { + syntax_error("expected token, found eof"); + } +} + +Node parse_function_decl(Compiler* self, const char* type, const char* name, Nodes args) { + return (Node){ + .type = NODE_FUNCTION_DECL, + .function_decl = { .type = type, .name = name, .args = args }, + }; +} + +Node parse_function_impl(Compiler* self, const char* type, const char* name, Nodes args) { + Nodes body = parse_until(self, TOKEN_CBRACE); + + return (Node){ + .type = NODE_FUNCTION_IMPL, + .function_impl = { .type = type, .name = name, .args = args, .body = body }, + }; +} + +Node 
parse_function_call(Compiler* self, const char* name) { + printf("parse function call %s\n", name); + Nodes args = parse_until(self, TOKEN_CPAREN); + + parse_semicolon(self); + return (Node){ + .type = NODE_FUNCTION_CALL, + .function_call = { .name = name, .args = args }, + }; +} + +void parse_semicolon(Compiler* self) { + Token token; + next(self, &token); + + switch(token.type) { + case TOKEN_SEMICOLON: + return; + default: + syntax_error("expected semicolon, found \"%s\"", token.value); + } +} diff --git a/src/parser.h b/src/parser.h new file mode 100644 index 0000000..5828ea9 --- /dev/null +++ b/src/parser.h @@ -0,0 +1,21 @@ +#ifndef PARSER_H +#define PARSER_H + +#include "compiler.h" +#include "node.h" +#include "token.h" + +Node parse_token(Compiler* self, Token token); +Nodes parse_until(Compiler* self, TokenType end_token); +Node parse_type(Compiler* self, const char* type); +Node parse_identifier(Compiler* self, const char* name); +Node parse_number(Compiler* self, const char* value); +Node parse_type_ident_pair(Compiler* self, const char* type, const char* name); +Node parse_arg_decl(Compiler* self, const char* type, const char* name); +Node parse_function(Compiler* self, const char* type, const char* name); +Node parse_function_decl(Compiler* self, const char* type, const char* name, Nodes args); +Node parse_function_impl(Compiler* self, const char* type, const char* name, Nodes args); +Node parse_function_call(Compiler* self, const char* name); +void parse_semicolon(Compiler* self); + +#endif diff --git a/src/token.h b/src/token.h new file mode 100644 index 0000000..bd5ee4e --- /dev/null +++ b/src/token.h @@ -0,0 +1,9 @@ +#ifndef TOKEN_H +#define TOKEN_H + +typedef struct { + int type; + const char* value; +} Token; + +#endif diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..6afcd8a --- /dev/null +++ b/test.txt @@ -0,0 +1,10 @@ +hey "bob \"mr goonman\" gooner" + +i32 putchar(i32 char); + +void sayhi() { + putchar(72); + putchar(105); 
+} + +sayhi(); diff --git a/xmake.lua b/xmake.lua new file mode 100644 index 0000000..eae5da5 --- /dev/null +++ b/xmake.lua @@ -0,0 +1,79 @@ +add_rules("mode.debug", "mode.release") + +add_requires("stc") + +target("compiler-c") + set_kind("binary") + add_files("src/*.c") + add_packages("stc") + set_rundir(".") + +-- +-- If you want to know more about using xmake, please see https://xmake.io +-- +-- ## FAQ +-- +-- Enter the project directory before building the project. +-- +-- $ cd projectdir +-- +-- 1. How to build the project? +-- +-- $ xmake +-- +-- 2. How to configure the project? +-- +-- $ xmake f -p [macosx|linux|iphoneos ..] -a [x86_64|i386|arm64 ..] -m [debug|release] +-- +-- 3. Where is the build output directory? +-- +-- The default output directory is `./build` and you can configure the output directory. +-- +-- $ xmake f -o outputdir +-- $ xmake +-- +-- 4. How to run and debug the target after building the project? +-- +-- $ xmake run [targetname] +-- $ xmake run -d [targetname] +-- +-- 5. How to install the target to the system directory or another output directory? +-- +-- $ xmake install +-- $ xmake install -o installdir +-- +-- 6. 
Add some frequently-used compilation flags in xmake.lua +-- +-- @code +-- -- add debug and release modes +-- add_rules("mode.debug", "mode.release") +-- +-- -- add macro definition +-- add_defines("NDEBUG", "_GNU_SOURCE=1") +-- +-- -- set warning all as error +-- set_warnings("all", "error") +-- +-- -- set language: c99, c++11 +-- set_languages("c99", "c++11") +-- +-- -- set optimization: none, faster, fastest, smallest +-- set_optimize("fastest") +-- +-- -- add include search directories +-- add_includedirs("/usr/include", "/usr/local/include") +-- +-- -- add link libraries and search directories +-- add_links("tbox") +-- add_linkdirs("/usr/local/lib", "/usr/lib") +-- +-- -- add system link libraries +-- add_syslinks("z", "pthread") +-- +-- -- add compilation and link flags +-- add_cxflags("-stdnolib", "-fno-strict-aliasing") +-- add_ldflags("-L/usr/local/lib", "-lpthread", {force = true}) +-- +-- @endcode +-- +