From 5b87ec6ef2b84f1d319e5376bcf9eedea9829d79 Mon Sep 17 00:00:00 2001 From: Mustafa Quraish Date: Sun, 30 Jan 2022 01:19:54 -0500 Subject: Rename `cup` directory to `src` --- compile.sh | 2 +- cup/ast.c | 161 ---------------- cup/ast.h | 150 --------------- cup/common.h | 6 - cup/generator.c | 355 ----------------------------------- cup/generator.h | 6 - cup/lexer.c | 238 ------------------------ cup/lexer.h | 19 -- cup/main.c | 82 --------- cup/parser.c | 562 -------------------------------------------------------- cup/parser.h | 7 - cup/tokens.c | 70 ------- cup/tokens.h | 83 --------- cup/utils.c | 30 --- cup/utils.h | 12 -- src/ast.c | 161 ++++++++++++++++ src/ast.h | 150 +++++++++++++++ src/common.h | 6 + src/generator.c | 355 +++++++++++++++++++++++++++++++++++ src/generator.h | 6 + src/lexer.c | 238 ++++++++++++++++++++++++ src/lexer.h | 19 ++ src/main.c | 82 +++++++++ src/parser.c | 562 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/parser.h | 7 + src/tokens.c | 70 +++++++ src/tokens.h | 83 +++++++++ src/utils.c | 30 +++ src/utils.h | 12 ++ 29 files changed, 1782 insertions(+), 1782 deletions(-) delete mode 100644 cup/ast.c delete mode 100644 cup/ast.h delete mode 100644 cup/common.h delete mode 100644 cup/generator.c delete mode 100644 cup/generator.h delete mode 100644 cup/lexer.c delete mode 100644 cup/lexer.h delete mode 100644 cup/main.c delete mode 100644 cup/parser.c delete mode 100644 cup/parser.h delete mode 100644 cup/tokens.c delete mode 100644 cup/tokens.h delete mode 100644 cup/utils.c delete mode 100644 cup/utils.h create mode 100644 src/ast.c create mode 100644 src/ast.h create mode 100644 src/common.h create mode 100644 src/generator.c create mode 100644 src/generator.h create mode 100644 src/lexer.c create mode 100644 src/lexer.h create mode 100644 src/main.c create mode 100644 src/parser.c create mode 100644 src/parser.h create mode 100644 src/tokens.c create mode 100644 src/tokens.h create mode 100644 src/utils.c create mode 100644 src/utils.h diff --git a/compile.sh b/compile.sh index 9568907..fa24474 100755 --- a/compile.sh +++ b/compile.sh @@ -2,7 +2,7 @@ CC=gcc CFLAGS="-Wall -Wextra -Werror -ggdb3" -SRCS=cup/*.c +SRCS=src/*.c set -xe diff --git a/cup/ast.c b/cup/ast.c deleted file mode 100644 index 6e33de1..0000000 --- a/cup/ast.c +++ /dev/null @@ -1,161 +0,0 @@ -#include "ast.h" -#include -#include - -char *data_type_to_str(DataType type) -{ - switch (type) - { - case TYPE_NONE: return "void"; - case TYPE_INT: return "int"; - default: assert(false && "Unreachable"); - } -} - -void print_type_to_file(FILE *out, Type type) -{ - fprintf(out, "%s", data_type_to_str(type.type)); - for (int i = 0; i < type.indirection; i++) { - fprintf(out, "*"); - } -} - -char *node_type_to_str(NodeType type) -{ - switch (type) - { - #define ENUM_TOKEN(t, name) case t: return name; - ENUM_AST_TYPES(ENUM_TOKEN) - #undef ENUM_TOKEN - default: assert(false && "Unreachable"); - } -} - -bool is_binary_op(NodeType type) -{ - switch (type) - { - case OP_PLUS: - case OP_MINUS: - case OP_MUL: - case OP_DIV: - case OP_MOD: - case OP_LSHIFT: - case OP_RSHIFT: - case OP_AND: - case OP_OR: - case OP_XOR: - case OP_EQ: - case OP_NEQ: - case OP_LT: - case OP_LEQ: - case OP_GT: - case OP_GEQ: - return true; - default: return false; - } -} - -bool is_unary_op(NodeType type) -{ - switch (type) - { - case OP_NEG: - case OP_NOT: - case OP_BWINV: - return true; - default: return false; - } -} - -bool is_expression(NodeType type) -{ - if (is_unary_op(type) || is_binary_op(type)) - return true; - return type == AST_LITERAL; -} - -void dump_func(Node *, int); - -static void do_print_ast(Node *node, int depth) -{ - for (int i = 0; i < depth; i++) { - printf(" "); - } - if (node->type == AST_PROGRAM) { - for (int i = 0; i < node->block.num_children; i++) { - do_print_ast(node->block.children[i], depth); - } - } else if (node->type == AST_BLOCK) { - printf("{\n"); - for (int i = 0; i < node->block.num_children; i++) { - do_print_ast(node->block.children[i], depth + 1); - } - for (int i = 0; i < depth; i++) { - printf(" "); - } - printf("}\n"); - } else if (node->type == AST_FUNC) { - dump_func(node, depth); - } else if (node->type == AST_LITERAL) { - printf("%d\n", node->literal.as_int); - } else if (node->type == AST_RETURN) { - printf("return\n"); - do_print_ast(node->unary_expr, depth + 1); - } else if (is_unary_op(node->type)) { - printf("%s\n", node_type_to_str(node->type)); - do_print_ast(node->unary_expr, depth + 1); - } else if (is_binary_op(node->type)) { - printf("%s\n", node_type_to_str(node->type)); - do_print_ast(node->binary.left, depth + 1); - do_print_ast(node->binary.right, depth + 1); - } else if (node->type == AST_VAR) { - assert(node->variable && node->variable->name); - printf("%s\n", node->variable->name); - } else if (node->type == AST_VARDECL) { - printf("var %s (", node->var_decl.var.name); - print_type_to_file(stdout, node->var_decl.var.type); - printf(")"); - if (node->var_decl.value != NULL) { - printf(" = \n"); - do_print_ast(node->var_decl.value, depth + 1); - } else { - printf("\n"); - } - } else if (node->type == AST_FUNCCALL) { - printf("CALL %s(\n", node->call.func->func.name); - for (int i = 0; i < node->call.num_args; i++) { - do_print_ast(node->call.args[i], depth + 1); - } - for (int i = 0; i < depth; i++) { - printf(" "); - } - printf(")\n"); - } else { - printf("{{ %s }}\n", node_type_to_str(node->type)); - } -} - -void dump_func(Node *node, int depth) -{ - printf("fn %s(", node->func.name); - for (int i = 0; i < node->func.num_args; i++) { - if (i > 0) printf(", "); - printf("%s: ", node->func.args[i].name); - print_type_to_file(stdout, node->func.args[i].type); - printf("[[%lld]]", node->func.args[i].offset); - } - printf(")"); - if (node->func.return_type.type != TYPE_NONE) { - // FIXME: Print return type properly - printf(" -> "); - print_type_to_file(stdout, node->func.return_type); - } - - do_print_ast(node->func.body, depth + 1); -} - -void print_ast(Node *node) -{ - do_print_ast(node, 0); -} \ No newline at end of file diff --git a/cup/ast.h b/cup/ast.h deleted file mode 100644 index 22a7283..0000000 --- a/cup/ast.h +++ /dev/null @@ -1,150 +0,0 @@ -#pragma once - -#include "common.h" - -#define ENUM_AST_TYPES(F) \ - F(OP_NEG, "neg") \ - F(OP_NOT, "!") \ - F(OP_BWINV, "~") \ - F(OP_PLUS, "+") \ - F(OP_MINUS, "-") \ - F(OP_MUL, "*") \ - F(OP_DIV, "/") \ - F(OP_MOD, "%") \ - F(OP_LSHIFT, "<<") \ - F(OP_RSHIFT, ">>") \ - F(OP_AND, "&&") \ - F(OP_OR, "||") \ - F(OP_XOR, "^") \ - F(OP_EQ, "==") \ - F(OP_NEQ, "!=") \ - F(OP_LT, "<") \ - F(OP_LEQ, "<=") \ - F(OP_GT, ">") \ - F(OP_GEQ, ">=") \ - F(OP_ASSIGN, "=") \ - F(AST_LITERAL, "literal") \ - F(AST_FUNCCALL, "Function call") \ - F(AST_CONDITIONAL, "conditional expression") \ - F(AST_IF, "if statement") \ - F(AST_WHILE, "while statement") \ - F(AST_FOR, "for statement") \ - F(AST_VARDECL, "variable decl") \ - F(AST_VAR, "variable") \ - F(AST_RETURN, "return") \ - F(AST_FUNC, "func") \ - F(AST_PROGRAM, "program") \ - F(AST_BLOCK, "block statements") - -typedef enum { -#define DEFINE_ENUM(name, str) name, - ENUM_AST_TYPES(DEFINE_ENUM) -#undef DEFINE_ENUM -} NodeType; - -char *node_type_to_str(NodeType type); - -bool is_binary_op(NodeType type); -bool is_unary_op(NodeType type); -bool is_expression(NodeType type); - -typedef enum { - TYPE_NONE, - TYPE_INT, -} DataType; - -char *data_type_to_str(DataType type); - -typedef struct { - DataType type; - // 0 = value, 1 = pointer, 2 = double pointer, ... - int indirection; -} Type; - -typedef struct { - char *name; - Type type; - i64 offset; -} Variable; - -typedef struct ast_node Node; -typedef struct ast_node { - NodeType type; - - union { - // Binary expr - struct { - Node *left; - Node *right; - } binary; - - // Unary expr - Node *unary_expr; - - // Function definition - struct { - char *name; - Type return_type; - Node *body; - - // TODO: Should we just dynamically allocate space on the - // stack for each block instead of storing this? - i64 max_locals_size; - - // TODO: Arguments / etc? - Variable *args; - int num_args; - } func; - - // Block of statements - struct { - Node **children; - int num_children; - - Variable **locals; - int num_locals; - i64 locals_size; - } block; - - struct { - Type type; - union { - int as_int; - }; - } literal; - - struct { - Variable var; - Node *value; - } var_decl; - - struct { - Variable *var; - Node *value; - } assign; - - struct { - Node *cond; - Node *do_then; - Node *do_else; - } conditional; - - // Used for all loops - struct { - Node *cond; - Node *init; - Node *step; - Node *body; - } loop; - - Variable *variable; - - struct { - Node *func; - Node **args; - int num_args; - } call; - }; -} Node; - -void print_ast(Node *node); \ No newline at end of file diff --git a/cup/common.h b/cup/common.h deleted file mode 100644 index 20d057f..0000000 --- a/cup/common.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include -#include - -typedef long long int i64; diff --git a/cup/generator.c b/cup/generator.c deleted file mode 100644 index 6a95d0c..0000000 --- a/cup/generator.c +++ /dev/null @@ -1,355 +0,0 @@ -/** - * This file generates x86-64 assembly code in NASM format from the parsed AST. - */ - -#include "generator.h" -#include -#include -#include - -#include - -static int label_counter = 0; -static Node *current_function = NULL; - -void generate_expr_into_rax(Node *expr, FILE *out); - -void generate_func_call(Node *node, FILE *out) -{ - assert(node->type == AST_FUNCCALL); - // FIXME: This seems like a big hack - i64 total_size = 0; - for (int i = node->call.num_args - 1; i >= 0; i--) { - Node *arg = node->call.args[i]; - generate_expr_into_rax(arg, out); - fprintf(out, " push rax\n"); - // TODO: Compute this for different types - // TODO: Also make sure of padding and stuff? - total_size += 8; - } - fprintf(out, " call %s\n", node->call.func->func.name); - fprintf(out, " add rsp, %lld\n", total_size); -} - -// The evaluated expression is stored into `rax` -void generate_expr_into_rax(Node *expr, FILE *out) -{ - // TODO: Different sized output for different types? - if (expr->type == AST_LITERAL) { - // TODO: More literal types - assert(expr->literal.type.type == TYPE_INT); - fprintf(out, " mov rax, %d\n", expr->literal.as_int); - - } else if (expr->type == AST_FUNCCALL) { - generate_func_call(expr, out); - - } else if (expr->type == AST_VAR) { - i64 offset = expr->variable->offset; - if (offset > 0) - fprintf(out, " mov rax, [rbp-%lld]\n", offset); - else - fprintf(out, " mov rax, [rbp+%lld]\n", -offset); - - } else if (expr->type == OP_ASSIGN) { - i64 offset = expr->assign.var->offset; - generate_expr_into_rax(expr->assign.value, out); - fprintf(out, " mov [rbp-%lld], rax\n", offset); - - } else if (expr->type == OP_NEG) { - generate_expr_into_rax(expr->unary_expr, out); - fprintf(out, " neg rax\n"); - - } else if (expr->type == OP_NOT) { - generate_expr_into_rax(expr->unary_expr, out); - // Booleanize - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " sete al\n"); - fprintf(out, " movzx rax, al\n"); - - } else if (expr->type == OP_BWINV) { - generate_expr_into_rax(expr->unary_expr, out); - fprintf(out, " not rax\n"); - - } else if (expr->type == OP_PLUS) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " add rax, rbx\n"); - - } else if (expr->type == OP_MINUS) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " sub rax, rbx\n"); - - } else if (expr->type == OP_DIV) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " cqo\n"); - fprintf(out, " idiv rbx\n"); - - } else if (expr->type == OP_MUL) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " imul rbx\n"); - - // TODO: Compress these, there's barely any differences - } else if (expr->type == OP_EQ) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " cmp rax, rbx\n"); - fprintf(out, " sete al\n"); - fprintf(out, " movzx rax, al\n"); - - } else if (expr->type == OP_NEQ) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " cmp rax, rbx\n"); - fprintf(out, " setne al\n"); - fprintf(out, " movzx rax, al\n"); - - } else if (expr->type == OP_LT) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " cmp rax, rbx\n"); - fprintf(out, " setl al\n"); - fprintf(out, " movzx rax, al\n"); - - } else if (expr->type == OP_LEQ) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " cmp rax, rbx\n"); - fprintf(out, " setle al\n"); - fprintf(out, " movzx rax, al\n"); - - } else if (expr->type == OP_GT) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " cmp rax, rbx\n"); - fprintf(out, " setg al\n"); - fprintf(out, " movzx rax, al\n"); - - } else if (expr->type == OP_GEQ) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " cmp rax, rbx\n"); - fprintf(out, " setge al\n"); - fprintf(out, " movzx rax, al\n"); - - } else if (expr->type == OP_GEQ) { - generate_expr_into_rax(expr->binary.right, out); - fprintf(out, " push rax\n"); - generate_expr_into_rax(expr->binary.left, out); - fprintf(out, " pop rbx\n"); - fprintf(out, " cmp rax, rbx\n"); - fprintf(out, " setge al\n"); - fprintf(out, " movzx rax, al\n"); - - // Note: These are different because of short-circuit evaluation! - } else if (expr->type == OP_OR) { - generate_expr_into_rax(expr->binary.left, out); - // If left is true, we can short-circuit - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " je .or_right_%d\n", label_counter); - fprintf(out, " mov rax, 1\n"); - fprintf(out, " jmp .or_end_%d\n", label_counter); - fprintf(out, ".or_right_%d:\n", label_counter); - generate_expr_into_rax(expr->binary.right, out); - // Booleanize the result - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " setne al\n"); - fprintf(out, ".or_end_%d:\n", label_counter); - label_counter++; - - } else if (expr->type == OP_AND) { - generate_expr_into_rax(expr->binary.left, out); - // If left is false, we can short-circuit - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " jne .and_right_%d\n", label_counter); - fprintf(out, " mov rax, 0\n"); - fprintf(out, " jmp .and_end_%d\n", label_counter); - fprintf(out, ".and_right_%d:\n", label_counter); - generate_expr_into_rax(expr->binary.right, out); - // Booleanize the result - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " setne al\n"); - fprintf(out, ".and_end_%d:\n", label_counter); - label_counter++; - - } else if (expr->type == AST_CONDITIONAL) { - generate_expr_into_rax(expr->conditional.cond, out); - // If left is false, we can short-circuit - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " je .cond_else_%d\n", label_counter); - generate_expr_into_rax(expr->conditional.do_then, out); - fprintf(out, " jmp .cond_end_%d\n", label_counter); - fprintf(out, ".cond_else_%d:\n", label_counter); - generate_expr_into_rax(expr->binary.right, out); - // Booleanize the result - generate_expr_into_rax(expr->conditional.do_else, out); - fprintf(out, ".cond_end_%d:\n", label_counter); - label_counter++; - - } else { - fprintf(stderr, "Unsupported expression type in generate_expr: `%s`\n", node_type_to_str(expr->type)); - exit(1); - } -} - -void generate_block(Node *block, FILE *out); - -void generate_statement(Node *stmt, FILE *out) -{ - if (stmt->type == AST_RETURN) { - generate_expr_into_rax(stmt->unary_expr, out); - // TODO: Only do this if we have local variables - fprintf(out, " mov rsp, rbp\n"); - fprintf(out, " pop rbp\n"); - fprintf(out, " ret\n"); - } else if (stmt->type == AST_VARDECL) { - if (stmt->var_decl.value) { - generate_expr_into_rax(stmt->var_decl.value, out); - i64 offset = stmt->var_decl.var.offset; - fprintf(out, " mov [rbp-%lld], rax\n", offset); - } else { - // Initialize to 0 - i64 offset = stmt->var_decl.var.offset; - // TODO: Use correct size for the type - fprintf(out, " mov qword [rbp-%lld], 0\n", offset); - } - } else if (stmt->type == AST_IF) { - assert(stmt->conditional.cond); - assert(stmt->conditional.do_then); - int cur_label = label_counter++; - - generate_expr_into_rax(stmt->conditional.cond, out); - // If we don't have an `else` clause, we can simplify - if (!stmt->conditional.do_else) { - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " je .if_end_%d\n", cur_label); - generate_statement(stmt->conditional.do_then, out); - fprintf(out, ".if_end_%d:\n", cur_label); - } else { - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " je .if_else_%d\n", cur_label); - generate_statement(stmt->conditional.do_then, out); - fprintf(out, " jmp .if_end_%d\n", cur_label); - fprintf(out, ".if_else_%d:\n", cur_label); - generate_statement(stmt->conditional.do_else, out); - fprintf(out, ".if_end_%d:\n", cur_label); - } - } else if (stmt->type == AST_WHILE) { - int cur_label = label_counter++; - fprintf(out, ".loop_start_%d:\n", cur_label); - fprintf(out, ".loop_continue_%d:\n", cur_label); - generate_expr_into_rax(stmt->loop.cond, out); - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " je .loop_end_%d\n", cur_label); - generate_statement(stmt->loop.body, out); - fprintf(out, " jmp .loop_start_%d\n", cur_label); - fprintf(out, ".loop_end_%d:\n", cur_label); - - } else if (stmt->type == AST_FOR) { - int cur_label = label_counter++; - if (stmt->loop.init) { - generate_statement(stmt->loop.init, out); - } - fprintf(out, ".loop_start_%d:\n", cur_label); - if (stmt->loop.cond) { - generate_expr_into_rax(stmt->loop.cond, out); - fprintf(out, " cmp rax, 0\n"); - fprintf(out, " je .loop_end_%d\n", cur_label); - } - generate_statement(stmt->loop.body, out); - fprintf(out, ".loop_continue_%d:\n", cur_label); - if (stmt->loop.step) { - generate_expr_into_rax(stmt->loop.step, out); - } - fprintf(out, " jmp .loop_start_%d\n", cur_label); - fprintf(out, ".loop_end_%d:\n", cur_label); - - } else if (stmt->type == AST_BLOCK) { - generate_block(stmt, out); - } else { - // Once again, default to an expression here... - generate_expr_into_rax(stmt, out); - } -} - -void generate_block(Node *block, FILE *out) -{ - assert(block->type == AST_BLOCK); - for (int i = 0; i < block->block.num_children; i++) - generate_statement(block->block.children[i], out); -} - -void generate_function_header(Node *func, FILE *out) -{ - assert(func->type == AST_FUNC); - fprintf(out, "global %s\n", func->func.name); - fprintf(out, "%s:\n", func->func.name); - // TODO: Only do this if we have local variables - fprintf(out, " push rbp\n"); - fprintf(out, " mov rbp, rsp\n"); - // FIXME: Also account for arguments - fprintf(out, " sub rsp, %lld\n", func->func.max_locals_size); -} - -void generate_function(Node *func, FILE *out) -{ - assert(func->type == AST_FUNC); - current_function = func; - generate_function_header(func, out); - generate_block(func->func.body, out); -} - -void generate_asm(Node *root, FILE *out) -{ - assert(root->type == AST_PROGRAM); - for (int i = 0; i < root->block.num_children; i++) { - if (root->block.children[i]->type == AST_FUNC) { - generate_function(root->block.children[i], out); - } else { - fprintf(stderr, "Unsupported node type in generate_asm: %s\n", node_type_to_str(root->block.children[i]->type)); - exit(1); - } - } - - // Call `main` from `_main` and return -#if __APPLE__ - fprintf(out, "global _main\n"); - fprintf(out, "_main:\n"); -#else - fprintf(out, "global _start\n"); - fprintf(out, "_start:\n"); -#endif - fprintf(out, " call main\n"); - -#if __APPLE__ - fprintf(out, " ret\n"); -#else - fprintf(out, " mov rdi, rax\n"); - fprintf(out, " mov rax, %d\n", SYS_exit); - fprintf(out, " syscall\n"); -#endif - - // TODO: Add implementations of some primitives? -} \ No newline at end of file diff --git a/cup/generator.h b/cup/generator.h deleted file mode 100644 index 602cf3a..0000000 --- a/cup/generator.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include "ast.h" -#include - -void generate_asm(Node *root, FILE *out); \ No newline at end of file diff --git a/cup/lexer.c b/cup/lexer.c deleted file mode 100644 index ea095ba..0000000 --- a/cup/lexer.c +++ /dev/null @@ -1,238 +0,0 @@ -#include "lexer.h" -#include -#include -#include -#include -#include "utils.h" - -Lexer Lexer_new(char *filename, char *src, i64 len) -{ - Lexer self = {0}; - self.src = src; - self.len = len; - self.filename = filename; - return self; -} - -static Location Lexer_loc(Lexer *lexer) -{ - Location loc = {0}; - loc.filename = lexer->filename; - loc.line = lexer->line; - loc.col = lexer->col; - return loc; -} - -static void Lexer_skip_whitespace(Lexer *lexer) -{ - while (lexer->pos < lexer->len && isspace(lexer->src[lexer->pos])) { - if (lexer->src[lexer->pos] == '\n') { - lexer->line++; - lexer->col = 0; - } else { - lexer->col++; - } - lexer->pos++; - } -} - -bool Lexer_has_more(Lexer *lexer) -{ - Lexer_skip_whitespace(lexer); - return lexer->pos < lexer->len; -} - -static bool Lexer_starts_with(Lexer *lexer, char *str) -{ - i64 len = strlen(str); - if (lexer->len - lexer->pos < len) - return false; - for (i64 i = 0; i < len; i++) - if (lexer->src[lexer->pos + i] != str[i]) - return false; - i64 end_pos = lexer->pos + len; - if (end_pos == lexer->len) - return true; - char end_char = lexer->src[end_pos]; - return !(isdigit(end_char) || isalpha(end_char) || end_char == '_'); -} - -static void advance(Lexer *lexer, i64 amount) -{ - lexer->pos += amount; - lexer->col += amount; -} - -static char peek(Lexer *lexer, int amount) -{ - if (lexer->pos + amount >= lexer->len) - return '\0'; - return lexer->src[lexer->pos + amount]; -} - -static Token Lexer_make_token(Lexer *lexer, TokenType type, int inc_amount) -{ - Token token = {0}; - token.type = type; - token.loc = Lexer_loc(lexer); - advance(lexer, inc_amount); - return token; -} - -#define LEX_KEYWORD(str, token_type) \ - if (Lexer_starts_with(lexer, str)) return Lexer_make_token(lexer, token_type, strlen(str)); - -Token Lexer_next(Lexer *lexer) -{ - while (lexer->pos < lexer->len) { - switch (lexer->src[lexer->pos]) - { - case ' ': case '\t': case '\r': advance(lexer, 1); continue; - case '\n': lexer->line++; lexer->col = 0; lexer->pos++; continue; - case '(': return Lexer_make_token(lexer, TOKEN_OPEN_PAREN, 1); - case ')': return Lexer_make_token(lexer, TOKEN_CLOSE_PAREN, 1); - case '{': return Lexer_make_token(lexer, TOKEN_OPEN_BRACE, 1); - case '}': return Lexer_make_token(lexer, TOKEN_CLOSE_BRACE, 1); - case ';': return Lexer_make_token(lexer, TOKEN_SEMICOLON, 1); - case ':': return Lexer_make_token(lexer, TOKEN_COLON, 1); - case '~': return Lexer_make_token(lexer, TOKEN_TILDE, 1); - case '?': return Lexer_make_token(lexer, TOKEN_QUESTION, 1); - case ',': return Lexer_make_token(lexer, TOKEN_COMMA, 1); - - case '&': { - if (peek(lexer, 1) == '&') - return Lexer_make_token(lexer, TOKEN_AND, 2); - return Lexer_make_token(lexer, TOKEN_AMPERSAND, 1); - } - - case '!': { - if (peek(lexer, 1) == '=') - return Lexer_make_token(lexer, TOKEN_NEQ, 2); - return Lexer_make_token(lexer, TOKEN_EXCLAMATION, 1); - } - - case '<': { - if (peek(lexer, 1) == '=') - return Lexer_make_token(lexer, TOKEN_LEQ, 2); - return Lexer_make_token(lexer, TOKEN_LT, 1); - } - - case '>': { - if (peek(lexer, 1) == '=') - return Lexer_make_token(lexer, TOKEN_GEQ, 2); - return Lexer_make_token(lexer, TOKEN_GT, 1); - } - - case '=': { - if (peek(lexer, 1) == '=') - return Lexer_make_token(lexer, TOKEN_EQ, 2); - return Lexer_make_token(lexer, TOKEN_ASSIGN, 1); - } - - case '|': { - if (peek(lexer, 1) == '|') - return Lexer_make_token(lexer, TOKEN_OR, 2); - return Lexer_make_token(lexer, TOKEN_BAR, 1); - } - - - case '+': { - if (peek(lexer, 1) == '+') - return Lexer_make_token(lexer, TOKEN_PLUSPLUS, 2); - if (peek(lexer, 1) == '=') - return Lexer_make_token(lexer, TOKEN_PLUSEQUALS, 2); - return Lexer_make_token(lexer, TOKEN_PLUS, 1); - } - - case '-': { - if (peek(lexer, 1) == '-') - return Lexer_make_token(lexer, TOKEN_MINUSMINUS, 2); - if (peek(lexer, 1) == '=') - return Lexer_make_token(lexer, TOKEN_MINUSEQUALS, 2); - return Lexer_make_token(lexer, TOKEN_MINUS, 1); - } - - case '/': { - if (peek(lexer, 1) == '/') { - lexer->pos += 2; // skip the '//' - while (lexer->pos < lexer->len && lexer->src[lexer->pos] != '\n') - lexer->pos++; - continue; - } - return Lexer_make_token(lexer, TOKEN_SLASH, 1); - } - - case '*': return Lexer_make_token(lexer, TOKEN_STAR, 1); - case '%': return Lexer_make_token(lexer, TOKEN_PERCENT, 1); - - - default: { - // Handle keywords explicitly - LEX_KEYWORD("fn", TOKEN_FN); - LEX_KEYWORD("if", TOKEN_IF); - LEX_KEYWORD("int", TOKEN_INT); - LEX_KEYWORD("let", TOKEN_LET); - LEX_KEYWORD("for", TOKEN_FOR); - LEX_KEYWORD("else", TOKEN_ELSE); - LEX_KEYWORD("while", TOKEN_WHILE); - LEX_KEYWORD("return", TOKEN_RETURN); - - if (isdigit(lexer->src[lexer->pos])) { - // TODO: Parse hex and octal numbers - i64 pos = lexer->pos; - while (pos < lexer->len && isdigit(lexer->src[pos])) - pos++; - Token token = Token_from_int(atoi(lexer->src + lexer->pos), Lexer_loc(lexer)); - advance(lexer, pos - lexer->pos); - return token; - } - - if (isalpha(lexer->src[lexer->pos]) || lexer->src[lexer->pos] == '_') { - i64 pos = lexer->pos; - while (pos < lexer->len && (isalnum(lexer->src[pos]) || lexer->src[pos] == '_')) - pos++; - int str_len = pos - lexer->pos; - char *str = calloc(str_len + 1, 1); - strncpy(str, lexer->src + lexer->pos, str_len); - Token token = Token_from_identifier(str, Lexer_loc(lexer)); - advance(lexer, str_len); - return token; - } - - // TODO: Handle escapes - if (lexer->src[lexer->pos] == '"') { - i64 pos = lexer->pos + 1; - while (pos < lexer->len && lexer->src[pos] != '"') - pos++; - if (pos == lexer->len) { - die_location(Lexer_loc(lexer), ": ERROR: Reached end-of-file while parsing string literal beginning here.\n"); - } - // Careful with indexing here, because we want to skip opening and closing quotes - char *str = calloc(pos - lexer->pos, 1); - strncpy(str, lexer->src + lexer->pos + 1, pos - lexer->pos - 1); - Token token = Token_from_identifier(str, Lexer_loc(lexer)); - advance(lexer, pos - lexer->pos + 1); - return token; - } - - - die_location(Lexer_loc(lexer), ": ERROR: Unexpected character '%c'\n", lexer->src[lexer->pos]); - advance(lexer, 1); - } - } - } - - return Token_from_type(TOKEN_EOF, Lexer_loc(lexer)); -} - -Token Lexer_peek(Lexer *lexer) -{ - i64 pos = lexer->pos; - i64 col = lexer->col; - i64 line = lexer->line; - Token token = Lexer_next(lexer); - lexer->pos = pos; - lexer->col = col; - lexer->line = line; - return token; -} \ No newline at end of file diff --git a/cup/lexer.h b/cup/lexer.h deleted file mode 100644 index f710d3d..0000000 --- a/cup/lexer.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "tokens.h" -#include - -typedef struct { - char *src; - i64 len; - i64 pos; - - char *filename; - i64 line; - i64 col; -} Lexer; - -Lexer Lexer_new(char *filename, char *src, i64 len); - -Token Lexer_next(Lexer *lexer); -Token Lexer_peek(Lexer *lexer); \ No newline at end of file diff --git a/cup/main.c b/cup/main.c deleted file mode 100644 index 15460ff..0000000 --- a/cup/main.c +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include -#include -#include "lexer.h" -#include "parser.h" -#include "generator.h" -#include "unistd.h" - -char *filename = NULL; -char *source = NULL; -i64 source_len = 0; -bool dump_ast = false; - -#define MAX_STDIN_SOURCE_LEN 2048 - -void usage(int exit_status) -{ - printf("Usage: cup [options] \n"); - printf("Options:\n"); - printf(" -c Code to compile\n"); - printf(" -h Show this help\n"); - printf(" -d Dump AST to stdout\n"); - printf("Output file will be named `output.nasm`\n"); - exit(exit_status); -} - -void parse_args(int argc, char **argv) -{ - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-c") == 0) { - source_len = strlen(argv[i+1]); - source = calloc(source_len + 1, 1); - strcpy(source, argv[i+1]); - i++; - filename = "CLI"; - } else if (strcmp(argv[i], "-h") == 0) { - usage(0); - } else if (strcmp(argv[i], "-d") == 0) { - dump_ast = true; - } else if (filename == NULL) { - if (strcmp(argv[i], "-") == 0) { - filename = "stdin"; - source = calloc(MAX_STDIN_SOURCE_LEN, 1); - source_len = fread(source, 1, MAX_STDIN_SOURCE_LEN, stdin); - if (source_len == MAX_STDIN_SOURCE_LEN) { - fprintf(stderr, "Source too long to use through stdin\n"); - exit(1); - } - } else { - filename = argv[i]; - // Read entire file into memory - FILE *fp = fopen(filename, "r"); - fseek(fp, 0, SEEK_END); - source_len = ftell(fp); - fseek(fp, 0, SEEK_SET); - source = malloc(source_len + 1); - fread(source, source_len, 1, fp); - source[source_len] = 0; - fclose(fp); - } - } else { - usage(1); - } - } -} - -int main(int argc, char**argv) { - parse_args(argc, argv); - - Lexer lexer = Lexer_new(filename, source, source_len); - Node *ast = parse_program(&lexer); - - if (dump_ast) - print_ast(ast); - - FILE *f = fopen("output.nasm", "w"); - generate_asm(ast, f); - - - free(source); - return 0; -} diff --git a/cup/parser.c b/cup/parser.c deleted file mode 100644 index cfb5d97..0000000 --- a/cup/parser.c +++ /dev/null @@ -1,562 +0,0 @@ -#include "parser.h" -#include "utils.h" -#include -#include -#include - -#define MAX_FUNCTION_COUNT 1024 -static Node *all_functions[MAX_FUNCTION_COUNT]; -static i64 function_count = 0; - -static Node *current_function = NULL; - -#define BLOCK_STACK_SIZE 64 -static Node *block_stack[BLOCK_STACK_SIZE]; -static i64 block_stack_count = 0; -static i64 cur_stack_offset = 0; - - -Token do_assert_token(Token token, TokenType type, char *filename, int line) -{ - if (token.type != type) { - Location_print(stderr, token.loc); - fprintf(stderr, ": Expected token of type `%s` but got `%s`\n", token_type_to_str(type), token_type_to_str(token.type)); - fprintf(stderr, "Relevant location in compiler: %s:%d\n", filename, line); - exit(1); - } - return token; -} - -#define assert_token(token, type) do_assert_token(token, type, __FILE__, __LINE__) - -/****** - * Some helpers - */ - -NodeType binary_token_to_op(TokenType type) -{ - switch (type) - { - case TOKEN_PLUS: return OP_PLUS; - case TOKEN_MINUS: return OP_MINUS; - case TOKEN_STAR: return OP_MUL; - case TOKEN_SLASH: return OP_DIV; - case TOKEN_PERCENT: return OP_MOD; - case TOKEN_LSHIFT: return OP_LSHIFT; - case TOKEN_RSHIFT: return OP_RSHIFT; - case TOKEN_AND: return OP_AND; - case TOKEN_OR: return OP_OR; - case TOKEN_XOR: return OP_XOR; - case TOKEN_EQ: return OP_EQ; - case TOKEN_NEQ: return OP_NEQ; - case TOKEN_LT: return OP_LT; - case TOKEN_LEQ: return OP_LEQ; - case TOKEN_GT: return OP_GT; - case TOKEN_GEQ: return OP_GEQ; - - default: assert(false && "binary_token_to_op called with invalid token type"); - } -} - -void block_stack_push(Node *block) -{ - assert(block_stack_count < BLOCK_STACK_SIZE); - assert(current_function); - block_stack[block_stack_count++] = block; -} - -void block_stack_pop() -{ - assert(block_stack_count > 0); - assert(current_function); - Node *block = block_stack[--block_stack_count]; - cur_stack_offset -= block->block.locals_size; - assert(cur_stack_offset >= 0); -} - -void dump_block_stack() -{ - for (i64 i = 0; i < block_stack_count; i++) { - Node *block = block_stack[i]; - for (int i = 0; i < block->block.num_locals; i++) { - printf("%s: offset: %lld\n", block->block.locals[i]->name, block->block.locals[i]->offset); - } - printf("\n"); - } -} - -Variable *find_local_variable(Token *token) -{ - assert_token(*token, TOKEN_IDENTIFIER); - for (i64 i = block_stack_count - 1; i >= 0; --i) { - Node *block = block_stack[i]; - for (int i = 0; i < block->block.num_locals; i++) { - if (strcmp(block->block.locals[i]->name, token->value.as_string) == 0) { - return block->block.locals[i]; - } - } - } - Node *func = current_function; - for (int i = 0; i < func->func.num_args; i++) { - if (strcmp(func->func.args[i].name, token->value.as_string) == 0) { - return &func->func.args[i]; - } - } - return NULL; -} - -Node *find_function_definition(Token *token) -{ - assert_token(*token, TOKEN_IDENTIFIER); - for (i64 i = 0; i < function_count; i++) { - Node *function = all_functions[i]; - if (strcmp(function->func.name, token->value.as_string) == 0) { - return function; - } - } - return NULL; -} - -// TODO: rename this, it's ugly -void add_variable_to_current_block(Variable *var) -{ - // Set offset for variable - Node *cur_block = block_stack[block_stack_count - 1]; - var->offset = cur_stack_offset; - - int new_len = (cur_block->block.num_locals + 1); - int var_size = 8; // TODO: Compute sizes based on different types - - // Add to the block - // FIXME: Use a map here - cur_block->block.locals = realloc(cur_block->block.locals, sizeof(Variable *) * new_len); - cur_block->block.locals[cur_block->block.num_locals] = var; - cur_block->block.num_locals++; - - assert(current_function); - // Update current stack offset (w.r.t function stack frame) and block size - cur_stack_offset += var_size; - block_stack[block_stack_count-1]->block.locals_size += var_size; - - // Update function's max locals size - i64 max_offset = i64max(current_function->func.max_locals_size, cur_stack_offset); - current_function->func.max_locals_size = max_offset; - - assert(cur_stack_offset >= 0); - assert(block_stack_count > 0); -} - -void Node_add_child(Node *parent, Node *child) -{ - // TODO, use a vector - parent->block.children = realloc(parent->block.children, sizeof(Node *) * (parent->block.num_children + 1)); - parent->block.children[parent->block.num_children] = child; - parent->block.num_children++; -} - -Node *Node_new(NodeType type) -{ - Node *self = calloc(sizeof(Node), 1); - self->type = type; - return self; -} - -Type parse_type(Lexer *lexer) -{ - Type type = {0}; - Token token = Lexer_peek(lexer); - if (token.type == TOKEN_INT) { - type.type = TYPE_INT; - Lexer_next(lexer); - } else { - type.type = TYPE_NONE; - } - - while (Lexer_peek(lexer).type == TOKEN_AMPERSAND) { - Lexer_next(lexer); - type.indirection++; - } - - return type; -} - -Node *parse_literal(Lexer *lexer) -{ - Node *node = Node_new(AST_LITERAL); - Token token = assert_token(Lexer_next(lexer), TOKEN_INTLIT); - node->literal.type = (Type) {.type = TYPE_INT}; - node->literal.as_int = token.value.as_int; - return node; -} - -Node *parse_expression(Lexer *); - -Node *parse_var_declaration(Lexer *lexer) -{ - Token token = assert_token(Lexer_next(lexer), TOKEN_LET); - // TODO: Reuse this for globals? Or maybe just make a new function? - if (!current_function || current_function->type != AST_FUNC) - die_location(token.loc, "Variable declaration outside of function"); - - token = assert_token(Lexer_next(lexer), TOKEN_IDENTIFIER); - // NOTE: We don't allow shadowing of variables in the any blocks, - // this is by design since it's a common mistake. - if (find_local_variable(&token) != NULL) - die_location(token.loc, "Variable `%s` already declared", token.value.as_string); - - Node *node = Node_new(AST_VARDECL); - node->var_decl.var.name = token.value.as_string; - - token = Lexer_next(lexer); - if (token.type != TOKEN_COLON) - die_location(token.loc, "Missing type specifier for variable `%s`", node->var_decl.var.name); - node->var_decl.var.type = parse_type(lexer); - - add_variable_to_current_block(&node->var_decl.var); - - token = Lexer_next(lexer); - if (token.type == TOKEN_ASSIGN) { - node->var_decl.value = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); - } else { - assert_token(token, TOKEN_SEMICOLON); - } - - return node; -} - -Node *parse_function_call_args(Lexer *lexer, Node *func) -{ - Token identifier = assert_token(Lexer_next(lexer), TOKEN_IDENTIFIER); - Node *call = Node_new(AST_FUNCCALL); - call->call.func = func; - assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); - Token token = Lexer_peek(lexer); - - while (token.type != TOKEN_CLOSE_PAREN) { - Node *arg = parse_expression(lexer); - - int new_size = call->call.num_args + 1; - call->call.args = realloc(call->call.args, sizeof(Node *) * new_size); - call->call.args[call->call.num_args++] = arg; - - if (new_size > func->func.num_args) - die_location(identifier.loc, "Too many arguments to function `%s`", func->func.name); - - token = Lexer_peek(lexer); - if (token.type == TOKEN_COMMA) { - Lexer_next(lexer); - token = Lexer_peek(lexer); - } - } - - if (call->call.num_args != func->func.num_args) - die_location(identifier.loc, "Too few arguments to function `%s`", func->func.name); - - assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); - return call; -} - -Node *parse_factor(Lexer *lexer) -{ - // TODO: Parse more complicated things - Token token = Lexer_peek(lexer); - Node *expr; - if (token.type == TOKEN_MINUS) { - Lexer_next(lexer); - expr = Node_new(OP_NEG); - expr->unary_expr = parse_factor(lexer); - } else if (token.type == TOKEN_TILDE) { - Lexer_next(lexer); - expr = Node_new(OP_BWINV); - expr->unary_expr = parse_factor(lexer); - } else if (token.type == TOKEN_EXCLAMATION) { - Lexer_next(lexer); - expr = Node_new(OP_NOT); - expr->unary_expr = parse_factor(lexer); - } else if (token.type == TOKEN_OPEN_PAREN) { - Lexer_next(lexer); - expr = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); - } else if (token.type == TOKEN_INTLIT) { - expr = parse_literal(lexer); - } else if (token.type == TOKEN_IDENTIFIER) { - // TODO: Check for global variables when added - - Variable *var = find_local_variable(&token); - if (var != NULL) { - Lexer_next(lexer); - expr = Node_new(AST_VAR); - expr->variable = var; - return expr; - } - - Node *func = find_function_definition(&token); - if (func != NULL) { - return parse_function_call_args(lexer, func); - } - - die_location(token.loc, "Unknown identifier `%s`", token.value.as_string); - expr = NULL; - } else { - die_location(token.loc, ": Expected token found in parse_factor: `%s`", token_type_to_str(token.type)); - exit(1); - } - return expr; -} - -#define BINOP_PARSER(next_parser, predicate) \ - Node *expr = next_parser(lexer); \ - Token token = Lexer_peek(lexer); \ - while (predicate(token.type)) { \ - Lexer_next(lexer); \ - Node *op = Node_new(binary_token_to_op(token.type)); \ - Node *right = next_parser(lexer); \ - op->binary.left = expr; \ - op->binary.right = right; \ - expr = op; \ - token = Lexer_peek(lexer); \ - } \ - return expr; - -bool is_term_token(TokenType type) { return type == TOKEN_STAR || type == TOKEN_SLASH || type == TOKEN_PERCENT; } -Node *parse_term(Lexer *lexer) { BINOP_PARSER(parse_factor, is_term_token); } - -bool is_additive_token(TokenType type) { return type == TOKEN_PLUS || type == TOKEN_MINUS; } -Node *parse_additive(Lexer *lexer) { BINOP_PARSER(parse_term, is_additive_token); } - -bool is_relational_token(TokenType type) { return type == TOKEN_LT || type == TOKEN_LEQ || type == TOKEN_GT || type == TOKEN_GEQ; } -Node *parse_relational(Lexer *lexer) { BINOP_PARSER(parse_additive, is_relational_token); } - -bool is_equality_token(TokenType type) { return type == TOKEN_EQ || type == TOKEN_NEQ; } -Node *parse_equality(Lexer *lexer) { BINOP_PARSER(parse_relational, is_equality_token); } - -bool is_logical_and_token(TokenType type) { return type == TOKEN_AND; } -Node *parse_logical_and(Lexer *lexer) { BINOP_PARSER(parse_equality, is_logical_and_token); } - -bool is_logical_or_token(TokenType type) { return type == TOKEN_OR; } -Node *parse_logical_or(Lexer *lexer) { BINOP_PARSER(parse_logical_and, is_logical_or_token); } - -Node *parse_conditional_exp(Lexer *lexer) -{ - Node *expr = parse_logical_or(lexer); - Token token = Lexer_peek(lexer); - if (token.type == TOKEN_QUESTION) { - Lexer_next(lexer); - Node *then_expr = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_COLON); - Node *else_expr = parse_expression(lexer); - Node *conditional = Node_new(AST_CONDITIONAL); - conditional->conditional.cond = expr; - conditional->conditional.do_then = then_expr; - conditional->conditional.do_else = else_expr; - expr = conditional; - } - return expr; -} - -Node *parse_expression(Lexer *lexer) -{ - Node *node = parse_conditional_exp(lexer); - // FIXME: This is a hack to handle assignment expressions - // and can probably be done properly. - if (node->type == AST_VAR) { - Token token = Lexer_peek(lexer); - if (token.type == TOKEN_ASSIGN) { - Lexer_next(lexer); - Variable *var = node->variable; - node->type = OP_ASSIGN; - node->assign.var = var; - node->assign.value = parse_expression(lexer); - } - } - return node; -} - -Node *parse_block(Lexer *lexer); - -Node *parse_statement(Lexer *lexer) -{ - Node *node; - Token token = Lexer_peek(lexer); - - if (token.type == TOKEN_RETURN) { - assert_token(Lexer_next(lexer), TOKEN_RETURN); - node = Node_new(AST_RETURN); - node->unary_expr = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); - } else if (token.type == TOKEN_IF) { - Lexer_next(lexer); - node = Node_new(AST_IF); - assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); - node->conditional.cond = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); - // TODO: Allow blocks in here once implemented, currently - // we can onle have a single statement in the if/else - node->conditional.do_then = parse_statement(lexer); - token = Lexer_peek(lexer); - if (token.type == TOKEN_ELSE) { - Lexer_next(lexer); - node->conditional.do_else = parse_statement(lexer); - } - } else if (token.type == TOKEN_WHILE) { - Lexer_next(lexer); - node = Node_new(AST_WHILE); - assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); - node->loop.cond = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); - node->loop.body = parse_statement(lexer); - } else if (token.type == TOKEN_FOR) { - Lexer_next(lexer); - node = Node_new(AST_FOR); - assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); - - // All of the expressions in the for loop are optional - - // TODO: Allow this to be a declaration, need to inject - // the variable into the symbol table for the block - if (Lexer_peek(lexer).type != TOKEN_SEMICOLON) - node->loop.init = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); - - if (Lexer_peek(lexer).type != TOKEN_SEMICOLON) - node->loop.cond = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); - - if (Lexer_peek(lexer).type != TOKEN_CLOSE_PAREN) - node->loop.step = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); - - node->loop.body = parse_statement(lexer); - } else if (token.type == TOKEN_OPEN_BRACE) { - node = parse_block(lexer); - } else { - // Default to trying to handle it as an expression - node = parse_expression(lexer); - assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); - } - - return node; -} - -Node *parse_block(Lexer *lexer) -{ - assert_token(Lexer_next(lexer), TOKEN_OPEN_BRACE); - - Node *block = Node_new(AST_BLOCK); - Token token = Lexer_peek(lexer); - - block_stack_push(block); - - while (token.type != TOKEN_CLOSE_BRACE) { - Node *block_item; - if (token.type == TOKEN_LET) { - block_item = parse_var_declaration(lexer); - } else { - // Default to a statement - block_item = parse_statement(lexer); - } - Node_add_child(block, block_item); - token = Lexer_peek(lexer); - } - - block_stack_pop(); - - assert_token(Lexer_next(lexer), TOKEN_CLOSE_BRACE); - return block; -} - -void push_new_function(Node *func) -{ - assert(func->type == AST_FUNC); - assert(function_count < MAX_FUNCTION_COUNT); - all_functions[function_count++] = func; - current_function = func; -} - -void parse_func_args(Lexer *lexer, Node *func) -{ - assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); - Token token = Lexer_peek(lexer); - while (token.type != TOKEN_CLOSE_PAREN) { - token = assert_token(Lexer_next(lexer), TOKEN_IDENTIFIER); - // TODO: Check for shadowing with globals - assert_token(Lexer_next(lexer), TOKEN_COLON); - Type type = parse_type(lexer); - - i64 new_count = func->func.num_args + 1; - func->func.args = realloc(func->func.args, sizeof(Variable) * new_count); - Variable *var = &func->func.args[func->func.num_args++]; - var->name = token.value.as_string; - var->type = type; - - token = Lexer_peek(lexer); - if (token.type == TOKEN_COMMA) { - Lexer_next(lexer); - token = Lexer_peek(lexer); - } - } - assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); - - // Set the offsets for the arguments - - // IMPORTANT: We want to skip the saved ret_addr+old_rbp that we - // pushed on the stack. Each of these is 8 bytes. - int offset = -16; - for (int i = 0; i < func->func.num_args; i++) { - Variable *var = &func->func.args[i]; - var->offset = offset; - // TODO: Compute this for different types - int var_size = 8; - offset -= var_size; - } -} - -Node *parse_func(Lexer *lexer) -{ - Token token; - token = assert_token(Lexer_next(lexer), TOKEN_FN); - - Node *func = Node_new(AST_FUNC); - push_new_function(func); - - token = assert_token(Lexer_next(lexer), TOKEN_IDENTIFIER); - func->func.name = token.value.as_string; - parse_func_args(lexer, func); - - token = Lexer_peek(lexer); - if (token.type == TOKEN_COLON) { - // TODO: Parse all return types - assert_token(Lexer_next(lexer), TOKEN_COLON); - func->func.return_type = parse_type(lexer); - } else { - // No return type, void fn. - func->func.return_type = (Type){.type = TYPE_NONE}; - } - - // Make sure there's no funny business with the stack offset - assert(cur_stack_offset == 0); - assert(block_stack_count == 0); - func->func.body = parse_block(lexer); - assert(block_stack_count == 0); - assert(cur_stack_offset == 0); - - return func; -} - -Node *parse_program(Lexer *lexer) -{ - Node *program = Node_new(AST_PROGRAM); - Token token; - while ((token = Lexer_peek(lexer)).type != TOKEN_EOF) { - if (token.type == TOKEN_FN) { - Node *func = parse_func(lexer); - Node_add_child(program, func); - } else { - die_location(token.loc, "Unexpected token in parse_program: `%s`\n", token_type_to_str(token.type)); - exit(1); - break; - } - } - return program; -} \ No newline at end of file diff --git a/cup/parser.h b/cup/parser.h deleted file mode 100644 index 7f7dacb..0000000 --- a/cup/parser.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include "ast.h" -#include "lexer.h" - -Node *parse_program(Lexer *lexer); -void print_ast(Node *node); \ No newline at end of file diff --git a/cup/tokens.c b/cup/tokens.c deleted file mode 100644 index e562e16..0000000 --- a/cup/tokens.c +++ /dev/null @@ -1,70 +0,0 @@ -#include "tokens.h" -#include -#include - -Token Token_from_type(TokenType type, Location loc) -{ - Token token = {0}; - token.type = type; - token.loc = loc; - return token; -} - -Token Token_from_int(i64 value, Location loc) -{ - Token token = {0}; - token.type = TOKEN_INTLIT; - token.value.as_int = value; - token.loc = loc; - return token; -} - -Token Token_from_string(char *value, Location loc) -{ - Token token = {0}; - token.type = TOKEN_STRINGLIT; - token.value.as_string = value; - token.loc = loc; - return token; -} - -Token Token_from_identifier(char *value, Location loc) -{ - Token token = {0}; - token.type = TOKEN_IDENTIFIER; - token.value.as_string = value; - token.loc = loc; - return token; -} - -void Location_print(FILE *f, Location loc) -{ - fprintf(f, "%s:%d:%d", loc.filename, loc.line+1, loc.col+1); -} - -char *token_type_to_str(TokenType type) -{ - // Otherwise, just print the token type - switch (type) - { - #define ENUM_TOKEN(name, str) case name: return str; - ENUM_TOKENS(ENUM_TOKEN) - #undef ENUM_TOKEN - default: assert(false && "Unreachable"); - } -} - - -void Token_print(FILE *f, Token *token) -{ - // Handle the different literals manually - switch (token->type) - { - case TOKEN_INTLIT: fprintf(f, "%lld", token->value.as_int); return; - case TOKEN_STRINGLIT: fprintf(f, "\"%s\"", token->value.as_string); return; - case TOKEN_IDENTIFIER: fprintf(f, "%s", token->value.as_string); return; - default: break; - } - - fprintf(f, "%s", token_type_to_str(token->type)); -} \ No newline at end of file diff --git a/cup/tokens.h b/cup/tokens.h deleted file mode 100644 index 44f5e6e..0000000 --- a/cup/tokens.h +++ /dev/null @@ -1,83 +0,0 @@ -#pragma once - -#include "common.h" -#include - -#define ENUM_TOKENS(F) \ - F(TOKEN_AMPERSAND, "&") \ - F(TOKEN_AND, "&&") \ - F(TOKEN_ASSIGN, "=") \ - F(TOKEN_BAR, "|") \ - F(TOKEN_CLOSE_BRACE, "}") \ - F(TOKEN_CLOSE_PAREN, ")") \ - F(TOKEN_COLON, ":") \ - F(TOKEN_COMMA, ",") \ - F(TOKEN_ELSE, "else") \ - F(TOKEN_EOF, "EOF") \ - F(TOKEN_EQ, "==") \ - F(TOKEN_EXCLAMATION, "!") \ - F(TOKEN_FN, "fn") \ - F(TOKEN_FOR, "for") \ - F(TOKEN_GEQ, ">=") \ - F(TOKEN_GT, ">") \ - F(TOKEN_IDENTIFIER, "identifier") \ - F(TOKEN_IF, "if") \ - F(TOKEN_INT, "int") \ - F(TOKEN_INTLIT, "integer literal") \ - F(TOKEN_LEQ, "<=") \ - F(TOKEN_LSHIFT, "<<") \ - F(TOKEN_LET, "let") \ - F(TOKEN_LT, "<") \ - F(TOKEN_MINUS, "-") \ - F(TOKEN_MINUSEQUALS, "-=") \ - F(TOKEN_MINUSMINUS, "--") \ - F(TOKEN_NEQ, "!=") \ - F(TOKEN_OPEN_BRACE, "{") \ - F(TOKEN_OPEN_PAREN, "(") \ - F(TOKEN_OR, "||") \ - F(TOKEN_PERCENT, "%") \ - F(TOKEN_PLUS, "+") \ - F(TOKEN_PLUSEQUALS, "+=") \ - F(TOKEN_PLUSPLUS, "++") \ - F(TOKEN_QUESTION, "?") \ - F(TOKEN_RETURN, "return") \ - F(TOKEN_RSHIFT, ">>") \ - F(TOKEN_SEMICOLON, ";") \ - F(TOKEN_SLASH, "/") \ - F(TOKEN_STAR, "*") \ - F(TOKEN_STRINGLIT, "string literal") \ - F(TOKEN_TILDE, "~") \ - F(TOKEN_WHILE, "while") \ - F(TOKEN_XOR, "^") - -typedef enum { -#define ENUM_TOKEN(name, str) name, - ENUM_TOKENS(ENUM_TOKEN) -#undef ENUM_TOKEN -} TokenType; - -typedef struct { - char *filename; - int line, col; -} Location; - -void Location_print(FILE *f, Location loc); - -typedef struct { - TokenType type; - Location loc; - union tokens { - i64 as_int; - char *as_string; - char as_char; - } value; -} Token; - -char *token_type_to_str(TokenType type); - -Token Token_from_type(TokenType type, Location loc); -Token Token_from_int(i64 value, Location loc); -Token Token_from_string(char *value, Location loc); -Token Token_from_identifier(char *value, Location loc); - -void Token_print(FILE *f, Token *token); \ No newline at end of file diff --git a/cup/utils.c b/cup/utils.c deleted file mode 100644 index 2335981..0000000 --- a/cup/utils.c +++ /dev/null @@ -1,30 +0,0 @@ -#include "utils.h" - -#include -#include -#include - -void die(const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - vfprintf(stderr, fmt, args); - va_end(args); - exit(1); -} - -void _die_location(char *file, int line, Location loc, const char *fmt, ...) -{ - Location_print(stderr, loc); - fprintf(stderr, ": "); - va_list args; - va_start(args, fmt); - vfprintf(stderr, fmt, args); - va_end(args); - fprintf(stderr, "\n"); - fprintf(stderr, "NOTE: Error occurred in %s:%d\n", file, line); - exit(1); -} - -i64 i64max(i64 a, i64 b) { return a > b ? a : b; } -i64 i64min(i64 a, i64 b) { return a < b ? a : b; } \ No newline at end of file diff --git a/cup/utils.h b/cup/utils.h deleted file mode 100644 index dfc018c..0000000 --- a/cup/utils.h +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include "common.h" -#include "tokens.h" - -void die(const char *fmt, ...); -void _die_location(char *file, int line, Location loc, const char *fmt, ...); - -i64 i64max(i64 a, i64 b); -i64 i64min(i64 a, i64 b); - -#define die_location(loc, ...) _die_location(__FILE__, __LINE__, loc, __VA_ARGS__) \ No newline at end of file diff --git a/src/ast.c b/src/ast.c new file mode 100644 index 0000000..6e33de1 --- /dev/null +++ b/src/ast.c @@ -0,0 +1,161 @@ +#include "ast.h" +#include +#include + +char *data_type_to_str(DataType type) +{ + switch (type) + { + case TYPE_NONE: return "void"; + case TYPE_INT: return "int"; + default: assert(false && "Unreachable"); + } +} + +void print_type_to_file(FILE *out, Type type) +{ + fprintf(out, "%s", data_type_to_str(type.type)); + for (int i = 0; i < type.indirection; i++) { + fprintf(out, "*"); + } +} + +char *node_type_to_str(NodeType type) +{ + switch (type) + { + #define ENUM_TOKEN(t, name) case t: return name; + ENUM_AST_TYPES(ENUM_TOKEN) + #undef ENUM_TOKEN + default: assert(false && "Unreachable"); + } +} + +bool is_binary_op(NodeType type) +{ + switch (type) + { + case OP_PLUS: + case OP_MINUS: + case OP_MUL: + case OP_DIV: + case OP_MOD: + case OP_LSHIFT: + case OP_RSHIFT: + case OP_AND: + case OP_OR: + case OP_XOR: + case OP_EQ: + case OP_NEQ: + case OP_LT: + case OP_LEQ: + case OP_GT: + case OP_GEQ: + return true; + default: return false; + } +} + +bool is_unary_op(NodeType type) +{ + switch (type) + { + case OP_NEG: + case OP_NOT: + case OP_BWINV: + return true; + default: return false; + } +} + +bool is_expression(NodeType type) +{ + if (is_unary_op(type) || is_binary_op(type)) + return true; + return type == AST_LITERAL; +} + +void dump_func(Node *, int); + +static void do_print_ast(Node *node, int depth) +{ + for (int i = 0; i < depth; i++) { + printf(" "); + } + if (node->type == AST_PROGRAM) { + for (int i = 0; i < node->block.num_children; i++) { + do_print_ast(node->block.children[i], depth); + } + } else if (node->type == AST_BLOCK) { + printf("{\n"); + for (int i = 0; i < node->block.num_children; i++) { + do_print_ast(node->block.children[i], depth + 1); + } + for (int i = 0; i < depth; i++) { + printf(" "); + } + printf("}\n"); + } else if (node->type == AST_FUNC) { + dump_func(node, depth); + } else if (node->type == AST_LITERAL) { + printf("%d\n", node->literal.as_int); + } else if (node->type == AST_RETURN) { + printf("return\n"); + do_print_ast(node->unary_expr, depth + 1); + } else if (is_unary_op(node->type)) { + printf("%s\n", node_type_to_str(node->type)); + do_print_ast(node->unary_expr, depth + 1); + } else if (is_binary_op(node->type)) { + printf("%s\n", node_type_to_str(node->type)); + do_print_ast(node->binary.left, depth + 1); + do_print_ast(node->binary.right, depth + 1); + } else if (node->type == AST_VAR) { + assert(node->variable && node->variable->name); + printf("%s\n", node->variable->name); + } else if (node->type == AST_VARDECL) { + printf("var %s (", node->var_decl.var.name); + print_type_to_file(stdout, node->var_decl.var.type); + printf(")"); + if (node->var_decl.value != NULL) { + printf(" = \n"); + do_print_ast(node->var_decl.value, depth + 1); + } else { + printf("\n"); + } + } else if (node->type == AST_FUNCCALL) { + printf("CALL %s(\n", node->call.func->func.name); + for (int i = 0; i < node->call.num_args; i++) { + do_print_ast(node->call.args[i], depth + 1); + } + for (int i = 0; i < depth; i++) { + printf(" "); + } + printf(")\n"); + } else { + printf("{{ %s }}\n", node_type_to_str(node->type)); + } +} + +void dump_func(Node *node, int depth) +{ + printf("fn %s(", node->func.name); + for (int i = 0; i < node->func.num_args; i++) { + if (i > 0) printf(", "); + printf("%s: ", node->func.args[i].name); + print_type_to_file(stdout, node->func.args[i].type); + printf("[[%lld]]", node->func.args[i].offset); + } + printf(")"); + if (node->func.return_type.type != TYPE_NONE) { + // FIXME: Print return type properly + printf(" -> "); + print_type_to_file(stdout, node->func.return_type); + } + + do_print_ast(node->func.body, depth + 1); +} + +void print_ast(Node *node) +{ + do_print_ast(node, 0); +} \ No newline at end of file diff --git a/src/ast.h b/src/ast.h new file mode 100644 index 0000000..22a7283 --- /dev/null +++ b/src/ast.h @@ -0,0 +1,150 @@ +#pragma once + +#include "common.h" + +#define ENUM_AST_TYPES(F) \ + F(OP_NEG, "neg") \ + F(OP_NOT, "!") \ + F(OP_BWINV, "~") \ + F(OP_PLUS, "+") \ + F(OP_MINUS, "-") \ + F(OP_MUL, "*") \ + F(OP_DIV, "/") \ + F(OP_MOD, "%") \ + F(OP_LSHIFT, "<<") \ + F(OP_RSHIFT, ">>") \ + F(OP_AND, "&&") \ + F(OP_OR, "||") \ + F(OP_XOR, "^") \ + F(OP_EQ, "==") \ + F(OP_NEQ, "!=") \ + F(OP_LT, "<") \ + F(OP_LEQ, "<=") \ + F(OP_GT, ">") \ + F(OP_GEQ, ">=") \ + F(OP_ASSIGN, "=") \ + F(AST_LITERAL, "literal") \ + F(AST_FUNCCALL, "Function call") \ + F(AST_CONDITIONAL, "conditional expression") \ + F(AST_IF, "if statement") \ + F(AST_WHILE, "while statement") \ + F(AST_FOR, "for statement") \ + F(AST_VARDECL, "variable decl") \ + F(AST_VAR, "variable") \ + F(AST_RETURN, "return") \ + F(AST_FUNC, "func") \ + F(AST_PROGRAM, "program") \ + F(AST_BLOCK, "block statements") + +typedef enum { +#define DEFINE_ENUM(name, str) name, + ENUM_AST_TYPES(DEFINE_ENUM) +#undef DEFINE_ENUM +} NodeType; + +char *node_type_to_str(NodeType type); + +bool is_binary_op(NodeType type); +bool is_unary_op(NodeType type); +bool is_expression(NodeType type); + +typedef enum { + TYPE_NONE, + TYPE_INT, +} DataType; + +char *data_type_to_str(DataType type); + +typedef struct { + DataType type; + // 0 = value, 1 = pointer, 2 = double pointer, ... + int indirection; +} Type; + +typedef struct { + char *name; + Type type; + i64 offset; +} Variable; + +typedef struct ast_node Node; +typedef struct ast_node { + NodeType type; + + union { + // Binary expr + struct { + Node *left; + Node *right; + } binary; + + // Unary expr + Node *unary_expr; + + // Function definition + struct { + char *name; + Type return_type; + Node *body; + + // TODO: Should we just dynamically allocate space on the + // stack for each block instead of storing this? + i64 max_locals_size; + + // TODO: Arguments / etc? + Variable *args; + int num_args; + } func; + + // Block of statements + struct { + Node **children; + int num_children; + + Variable **locals; + int num_locals; + i64 locals_size; + } block; + + struct { + Type type; + union { + int as_int; + }; + } literal; + + struct { + Variable var; + Node *value; + } var_decl; + + struct { + Variable *var; + Node *value; + } assign; + + struct { + Node *cond; + Node *do_then; + Node *do_else; + } conditional; + + // Used for all loops + struct { + Node *cond; + Node *init; + Node *step; + Node *body; + } loop; + + Variable *variable; + + struct { + Node *func; + Node **args; + int num_args; + } call; + }; +} Node; + +void print_ast(Node *node); \ No newline at end of file diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..20d057f --- /dev/null +++ b/src/common.h @@ -0,0 +1,6 @@ +#pragma once + +#include +#include + +typedef long long int i64; diff --git a/src/generator.c b/src/generator.c new file mode 100644 index 0000000..6a95d0c --- /dev/null +++ b/src/generator.c @@ -0,0 +1,355 @@ +/** + * This file generates x86-64 assembly code in NASM format from the parsed AST. + */ + +#include "generator.h" +#include +#include +#include + +#include + +static int label_counter = 0; +static Node *current_function = NULL; + +void generate_expr_into_rax(Node *expr, FILE *out); + +void generate_func_call(Node *node, FILE *out) +{ + assert(node->type == AST_FUNCCALL); + // FIXME: This seems like a big hack + i64 total_size = 0; + for (int i = node->call.num_args - 1; i >= 0; i--) { + Node *arg = node->call.args[i]; + generate_expr_into_rax(arg, out); + fprintf(out, " push rax\n"); + // TODO: Compute this for different types + // TODO: Also make sure of padding and stuff? + total_size += 8; + } + fprintf(out, " call %s\n", node->call.func->func.name); + fprintf(out, " add rsp, %lld\n", total_size); +} + +// The evaluated expression is stored into `rax` +void generate_expr_into_rax(Node *expr, FILE *out) +{ + // TODO: Different sized output for different types? + if (expr->type == AST_LITERAL) { + // TODO: More literal types + assert(expr->literal.type.type == TYPE_INT); + fprintf(out, " mov rax, %d\n", expr->literal.as_int); + + } else if (expr->type == AST_FUNCCALL) { + generate_func_call(expr, out); + + } else if (expr->type == AST_VAR) { + i64 offset = expr->variable->offset; + if (offset > 0) + fprintf(out, " mov rax, [rbp-%lld]\n", offset); + else + fprintf(out, " mov rax, [rbp+%lld]\n", -offset); + + } else if (expr->type == OP_ASSIGN) { + i64 offset = expr->assign.var->offset; + generate_expr_into_rax(expr->assign.value, out); + fprintf(out, " mov [rbp-%lld], rax\n", offset); + + } else if (expr->type == OP_NEG) { + generate_expr_into_rax(expr->unary_expr, out); + fprintf(out, " neg rax\n"); + + } else if (expr->type == OP_NOT) { + generate_expr_into_rax(expr->unary_expr, out); + // Booleanize + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " sete al\n"); + fprintf(out, " movzx rax, al\n"); + + } else if (expr->type == OP_BWINV) { + generate_expr_into_rax(expr->unary_expr, out); + fprintf(out, " not rax\n"); + + } else if (expr->type == OP_PLUS) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " add rax, rbx\n"); + + } else if (expr->type == OP_MINUS) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " sub rax, rbx\n"); + + } else if (expr->type == OP_DIV) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " cqo\n"); + fprintf(out, " idiv rbx\n"); + + } else if (expr->type == OP_MUL) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " imul rbx\n"); + + // TODO: Compress these, there's barely any differences + } else if (expr->type == OP_EQ) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " cmp rax, rbx\n"); + fprintf(out, " sete al\n"); + fprintf(out, " movzx rax, al\n"); + + } else if (expr->type == OP_NEQ) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " cmp rax, rbx\n"); + fprintf(out, " setne al\n"); + fprintf(out, " movzx rax, al\n"); + + } else if (expr->type == OP_LT) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " cmp rax, rbx\n"); + fprintf(out, " setl al\n"); + fprintf(out, " movzx rax, al\n"); + + } else if (expr->type == OP_LEQ) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " cmp rax, rbx\n"); + fprintf(out, " setle al\n"); + fprintf(out, " movzx rax, al\n"); + + } else if (expr->type == OP_GT) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " cmp rax, rbx\n"); + fprintf(out, " setg al\n"); + fprintf(out, " movzx rax, al\n"); + + } else if (expr->type == OP_GEQ) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " cmp rax, rbx\n"); + fprintf(out, " setge al\n"); + fprintf(out, " movzx rax, al\n"); + + } else if (expr->type == OP_GEQ) { + generate_expr_into_rax(expr->binary.right, out); + fprintf(out, " push rax\n"); + generate_expr_into_rax(expr->binary.left, out); + fprintf(out, " pop rbx\n"); + fprintf(out, " cmp rax, rbx\n"); + fprintf(out, " setge al\n"); + fprintf(out, " movzx rax, al\n"); + + // Note: These are different because of short-circuit evaluation! + } else if (expr->type == OP_OR) { + generate_expr_into_rax(expr->binary.left, out); + // If left is true, we can short-circuit + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " je .or_right_%d\n", label_counter); + fprintf(out, " mov rax, 1\n"); + fprintf(out, " jmp .or_end_%d\n", label_counter); + fprintf(out, ".or_right_%d:\n", label_counter); + generate_expr_into_rax(expr->binary.right, out); + // Booleanize the result + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " setne al\n"); + fprintf(out, ".or_end_%d:\n", label_counter); + label_counter++; + + } else if (expr->type == OP_AND) { + generate_expr_into_rax(expr->binary.left, out); + // If left is false, we can short-circuit + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " jne .and_right_%d\n", label_counter); + fprintf(out, " mov rax, 0\n"); + fprintf(out, " jmp .and_end_%d\n", label_counter); + fprintf(out, ".and_right_%d:\n", label_counter); + generate_expr_into_rax(expr->binary.right, out); + // Booleanize the result + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " setne al\n"); + fprintf(out, ".and_end_%d:\n", label_counter); + label_counter++; + + } else if (expr->type == AST_CONDITIONAL) { + generate_expr_into_rax(expr->conditional.cond, out); + // If left is false, we can short-circuit + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " je .cond_else_%d\n", label_counter); + generate_expr_into_rax(expr->conditional.do_then, out); + fprintf(out, " jmp .cond_end_%d\n", label_counter); + fprintf(out, ".cond_else_%d:\n", label_counter); + generate_expr_into_rax(expr->binary.right, out); + // Booleanize the result + generate_expr_into_rax(expr->conditional.do_else, out); + fprintf(out, ".cond_end_%d:\n", label_counter); + label_counter++; + + } else { + fprintf(stderr, "Unsupported expression type in generate_expr: `%s`\n", node_type_to_str(expr->type)); + exit(1); + } +} + +void generate_block(Node *block, FILE *out); + +void generate_statement(Node *stmt, FILE *out) +{ + if (stmt->type == AST_RETURN) { + generate_expr_into_rax(stmt->unary_expr, out); + // TODO: Only do this if we have local variables + fprintf(out, " mov rsp, rbp\n"); + fprintf(out, " pop rbp\n"); + fprintf(out, " ret\n"); + } else if (stmt->type == AST_VARDECL) { + if (stmt->var_decl.value) { + generate_expr_into_rax(stmt->var_decl.value, out); + i64 offset = stmt->var_decl.var.offset; + fprintf(out, " mov [rbp-%lld], rax\n", offset); + } else { + // Initialize to 0 + i64 offset = stmt->var_decl.var.offset; + // TODO: Use correct size for the type + fprintf(out, " mov qword [rbp-%lld], 0\n", offset); + } + } else if (stmt->type == AST_IF) { + assert(stmt->conditional.cond); + assert(stmt->conditional.do_then); + int cur_label = label_counter++; + + generate_expr_into_rax(stmt->conditional.cond, out); + // If we don't have an `else` clause, we can simplify + if (!stmt->conditional.do_else) { + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " je .if_end_%d\n", cur_label); + generate_statement(stmt->conditional.do_then, out); + fprintf(out, ".if_end_%d:\n", cur_label); + } else { + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " je .if_else_%d\n", cur_label); + generate_statement(stmt->conditional.do_then, out); + fprintf(out, " jmp .if_end_%d\n", cur_label); + fprintf(out, ".if_else_%d:\n", cur_label); + generate_statement(stmt->conditional.do_else, out); + fprintf(out, ".if_end_%d:\n", cur_label); + } + } else if (stmt->type == AST_WHILE) { + int cur_label = label_counter++; + fprintf(out, ".loop_start_%d:\n", cur_label); + fprintf(out, ".loop_continue_%d:\n", cur_label); + generate_expr_into_rax(stmt->loop.cond, out); + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " je .loop_end_%d\n", cur_label); + generate_statement(stmt->loop.body, out); + fprintf(out, " jmp .loop_start_%d\n", cur_label); + fprintf(out, ".loop_end_%d:\n", cur_label); + + } else if (stmt->type == AST_FOR) { + int cur_label = label_counter++; + if (stmt->loop.init) { + generate_statement(stmt->loop.init, out); + } + fprintf(out, ".loop_start_%d:\n", cur_label); + if (stmt->loop.cond) { + generate_expr_into_rax(stmt->loop.cond, out); + fprintf(out, " cmp rax, 0\n"); + fprintf(out, " je .loop_end_%d\n", cur_label); + } + generate_statement(stmt->loop.body, out); + fprintf(out, ".loop_continue_%d:\n", cur_label); + if (stmt->loop.step) { + generate_expr_into_rax(stmt->loop.step, out); + } + fprintf(out, " jmp .loop_start_%d\n", cur_label); + fprintf(out, ".loop_end_%d:\n", cur_label); + + } else if (stmt->type == AST_BLOCK) { + generate_block(stmt, out); + } else { + // Once again, default to an expression here... + generate_expr_into_rax(stmt, out); + } +} + +void generate_block(Node *block, FILE *out) +{ + assert(block->type == AST_BLOCK); + for (int i = 0; i < block->block.num_children; i++) + generate_statement(block->block.children[i], out); +} + +void generate_function_header(Node *func, FILE *out) +{ + assert(func->type == AST_FUNC); + fprintf(out, "global %s\n", func->func.name); + fprintf(out, "%s:\n", func->func.name); + // TODO: Only do this if we have local variables + fprintf(out, " push rbp\n"); + fprintf(out, " mov rbp, rsp\n"); + // FIXME: Also account for arguments + fprintf(out, " sub rsp, %lld\n", func->func.max_locals_size); +} + +void generate_function(Node *func, FILE *out) +{ + assert(func->type == AST_FUNC); + current_function = func; + generate_function_header(func, out); + generate_block(func->func.body, out); +} + +void generate_asm(Node *root, FILE *out) +{ + assert(root->type == AST_PROGRAM); + for (int i = 0; i < root->block.num_children; i++) { + if (root->block.children[i]->type == AST_FUNC) { + generate_function(root->block.children[i], out); + } else { + fprintf(stderr, "Unsupported node type in generate_asm: %s\n", node_type_to_str(root->block.children[i]->type)); + exit(1); + } + } + + // Call `main` from `_main` and return +#if __APPLE__ + fprintf(out, "global _main\n"); + fprintf(out, "_main:\n"); +#else + fprintf(out, "global _start\n"); + fprintf(out, "_start:\n"); +#endif + fprintf(out, " call main\n"); + +#if __APPLE__ + fprintf(out, " ret\n"); +#else + fprintf(out, " mov rdi, rax\n"); + fprintf(out, " mov rax, %d\n", SYS_exit); + fprintf(out, " syscall\n"); +#endif + + // TODO: Add implementations of some primitives? +} \ No newline at end of file diff --git a/src/generator.h b/src/generator.h new file mode 100644 index 0000000..602cf3a --- /dev/null +++ b/src/generator.h @@ -0,0 +1,6 @@ +#pragma once + +#include "ast.h" +#include + +void generate_asm(Node *root, FILE *out); \ No newline at end of file diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..ea095ba --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,238 @@ +#include "lexer.h" +#include +#include +#include +#include +#include "utils.h" + +Lexer Lexer_new(char *filename, char *src, i64 len) +{ + Lexer self = {0}; + self.src = src; + self.len = len; + self.filename = filename; + return self; +} + +static Location Lexer_loc(Lexer *lexer) +{ + Location loc = {0}; + loc.filename = lexer->filename; + loc.line = lexer->line; + loc.col = lexer->col; + return loc; +} + +static void Lexer_skip_whitespace(Lexer *lexer) +{ + while (lexer->pos < lexer->len && isspace(lexer->src[lexer->pos])) { + if (lexer->src[lexer->pos] == '\n') { + lexer->line++; + lexer->col = 0; + } else { + lexer->col++; + } + lexer->pos++; + } +} + +bool Lexer_has_more(Lexer *lexer) +{ + Lexer_skip_whitespace(lexer); + return lexer->pos < lexer->len; +} + +static bool Lexer_starts_with(Lexer *lexer, char *str) +{ + i64 len = strlen(str); + if (lexer->len - lexer->pos < len) + return false; + for (i64 i = 0; i < len; i++) + if (lexer->src[lexer->pos + i] != str[i]) + return false; + i64 end_pos = lexer->pos + len; + if (end_pos == lexer->len) + return true; + char end_char = lexer->src[end_pos]; + return !(isdigit(end_char) || isalpha(end_char) || end_char == '_'); +} + +static void advance(Lexer *lexer, i64 amount) +{ + lexer->pos += amount; + lexer->col += amount; +} + +static char peek(Lexer *lexer, int amount) +{ + if (lexer->pos + amount >= lexer->len) + return '\0'; + return lexer->src[lexer->pos + amount]; +} + +static Token Lexer_make_token(Lexer *lexer, TokenType type, int inc_amount) +{ + Token token = {0}; + token.type = type; + token.loc = Lexer_loc(lexer); + advance(lexer, inc_amount); + return token; +} + +#define LEX_KEYWORD(str, token_type) \ + if (Lexer_starts_with(lexer, str)) return Lexer_make_token(lexer, token_type, strlen(str)); + +Token Lexer_next(Lexer *lexer) +{ + while (lexer->pos < lexer->len) { + switch (lexer->src[lexer->pos]) + { + case ' ': case '\t': case '\r': advance(lexer, 1); continue; + case '\n': lexer->line++; lexer->col = 0; lexer->pos++; continue; + case '(': return Lexer_make_token(lexer, TOKEN_OPEN_PAREN, 1); + case ')': return Lexer_make_token(lexer, TOKEN_CLOSE_PAREN, 1); + case '{': return Lexer_make_token(lexer, TOKEN_OPEN_BRACE, 1); + case '}': return Lexer_make_token(lexer, TOKEN_CLOSE_BRACE, 1); + case ';': return Lexer_make_token(lexer, TOKEN_SEMICOLON, 1); + case ':': return Lexer_make_token(lexer, TOKEN_COLON, 1); + case '~': return Lexer_make_token(lexer, TOKEN_TILDE, 1); + case '?': return Lexer_make_token(lexer, TOKEN_QUESTION, 1); + case ',': return Lexer_make_token(lexer, TOKEN_COMMA, 1); + + case '&': { + if (peek(lexer, 1) == '&') + return Lexer_make_token(lexer, TOKEN_AND, 2); + return Lexer_make_token(lexer, TOKEN_AMPERSAND, 1); + } + + case '!': { + if (peek(lexer, 1) == '=') + return Lexer_make_token(lexer, TOKEN_NEQ, 2); + return Lexer_make_token(lexer, TOKEN_EXCLAMATION, 1); + } + + case '<': { + if (peek(lexer, 1) == '=') + return Lexer_make_token(lexer, TOKEN_LEQ, 2); + return Lexer_make_token(lexer, TOKEN_LT, 1); + } + + case '>': { + if (peek(lexer, 1) == '=') + return Lexer_make_token(lexer, TOKEN_GEQ, 2); + return Lexer_make_token(lexer, TOKEN_GT, 1); + } + + case '=': { + if (peek(lexer, 1) == '=') + return Lexer_make_token(lexer, TOKEN_EQ, 2); + return Lexer_make_token(lexer, TOKEN_ASSIGN, 1); + } + + case '|': { + if (peek(lexer, 1) == '|') + return Lexer_make_token(lexer, TOKEN_OR, 2); + return Lexer_make_token(lexer, TOKEN_BAR, 1); + } + + + case '+': { + if (peek(lexer, 1) == '+') + return Lexer_make_token(lexer, TOKEN_PLUSPLUS, 2); + if (peek(lexer, 1) == '=') + return Lexer_make_token(lexer, TOKEN_PLUSEQUALS, 2); + return Lexer_make_token(lexer, TOKEN_PLUS, 1); + } + + case '-': { + if (peek(lexer, 1) == '-') + return Lexer_make_token(lexer, TOKEN_MINUSMINUS, 2); + if (peek(lexer, 1) == '=') + return Lexer_make_token(lexer, TOKEN_MINUSEQUALS, 2); + return Lexer_make_token(lexer, TOKEN_MINUS, 1); + } + + case '/': { + if (peek(lexer, 1) == '/') { + lexer->pos += 2; // skip the '//' + while (lexer->pos < lexer->len && lexer->src[lexer->pos] != '\n') + lexer->pos++; + continue; + } + return Lexer_make_token(lexer, TOKEN_SLASH, 1); + } + + case '*': return Lexer_make_token(lexer, TOKEN_STAR, 1); + case '%': return Lexer_make_token(lexer, TOKEN_PERCENT, 1); + + + default: { + // Handle keywords explicitly + LEX_KEYWORD("fn", TOKEN_FN); + LEX_KEYWORD("if", TOKEN_IF); + LEX_KEYWORD("int", TOKEN_INT); + LEX_KEYWORD("let", TOKEN_LET); + LEX_KEYWORD("for", TOKEN_FOR); + LEX_KEYWORD("else", TOKEN_ELSE); + LEX_KEYWORD("while", TOKEN_WHILE); + LEX_KEYWORD("return", TOKEN_RETURN); + + if (isdigit(lexer->src[lexer->pos])) { + // TODO: Parse hex and octal numbers + i64 pos = lexer->pos; + while (pos < lexer->len && isdigit(lexer->src[pos])) + pos++; + Token token = Token_from_int(atoi(lexer->src + lexer->pos), Lexer_loc(lexer)); + advance(lexer, pos - lexer->pos); + return token; + } + + if (isalpha(lexer->src[lexer->pos]) || lexer->src[lexer->pos] == '_') { + i64 pos = lexer->pos; + while (pos < lexer->len && (isalnum(lexer->src[pos]) || lexer->src[pos] == '_')) + pos++; + int str_len = pos - lexer->pos; + char *str = calloc(str_len + 1, 1); + strncpy(str, lexer->src + lexer->pos, str_len); + Token token = Token_from_identifier(str, Lexer_loc(lexer)); + advance(lexer, str_len); + return token; + } + + // TODO: Handle escapes + if (lexer->src[lexer->pos] == '"') { + i64 pos = lexer->pos + 1; + while (pos < lexer->len && lexer->src[pos] != '"') + pos++; + if (pos == lexer->len) { + die_location(Lexer_loc(lexer), ": ERROR: Reached end-of-file while parsing string literal beginning here.\n"); + } + // Careful with indexing here, because we want to skip opening and closing quotes + char *str = calloc(pos - lexer->pos, 1); + strncpy(str, lexer->src + lexer->pos + 1, pos - lexer->pos - 1); + Token token = Token_from_identifier(str, Lexer_loc(lexer)); + advance(lexer, pos - lexer->pos + 1); + return token; + } + + + die_location(Lexer_loc(lexer), ": ERROR: Unexpected character '%c'\n", lexer->src[lexer->pos]); + advance(lexer, 1); + } + } + } + + return Token_from_type(TOKEN_EOF, Lexer_loc(lexer)); +} + +Token Lexer_peek(Lexer *lexer) +{ + i64 pos = lexer->pos; + i64 col = lexer->col; + i64 line = lexer->line; + Token token = Lexer_next(lexer); + lexer->pos = pos; + lexer->col = col; + lexer->line = line; + return token; +} \ No newline at end of file diff --git a/src/lexer.h b/src/lexer.h new file mode 100644 index 0000000..f710d3d --- /dev/null +++ b/src/lexer.h @@ -0,0 +1,19 @@ +#pragma once + +#include "tokens.h" +#include + +typedef struct { + char *src; + i64 len; + i64 pos; + + char *filename; + i64 line; + i64 col; +} Lexer; + +Lexer Lexer_new(char *filename, char *src, i64 len); + +Token Lexer_next(Lexer *lexer); +Token Lexer_peek(Lexer *lexer); \ No newline at end of file diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..15460ff --- /dev/null +++ b/src/main.c @@ -0,0 +1,82 @@ +#include +#include +#include +#include "lexer.h" +#include "parser.h" +#include "generator.h" +#include "unistd.h" + +char *filename = NULL; +char *source = NULL; +i64 source_len = 0; +bool dump_ast = false; + +#define MAX_STDIN_SOURCE_LEN 2048 + +void usage(int exit_status) +{ + printf("Usage: cup [options] \n"); + printf("Options:\n"); + printf(" -c Code to compile\n"); + printf(" -h Show this help\n"); + printf(" -d Dump AST to stdout\n"); + printf("Output file will be named `output.nasm`\n"); + exit(exit_status); +} + +void parse_args(int argc, char **argv) +{ + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-c") == 0) { + source_len = strlen(argv[i+1]); + source = calloc(source_len + 1, 1); + strcpy(source, argv[i+1]); + i++; + filename = "CLI"; + } else if (strcmp(argv[i], "-h") == 0) { + usage(0); + } else if (strcmp(argv[i], "-d") == 0) { + dump_ast = true; + } else if (filename == NULL) { + if (strcmp(argv[i], "-") == 0) { + filename = "stdin"; + source = calloc(MAX_STDIN_SOURCE_LEN, 1); + source_len = fread(source, 1, MAX_STDIN_SOURCE_LEN, stdin); + if (source_len == MAX_STDIN_SOURCE_LEN) { + fprintf(stderr, "Source too long to use through stdin\n"); + exit(1); + } + } else { + filename = argv[i]; + // Read entire file into memory + FILE *fp = fopen(filename, "r"); + fseek(fp, 0, SEEK_END); + source_len = ftell(fp); + fseek(fp, 0, SEEK_SET); + source = malloc(source_len + 1); + fread(source, source_len, 1, fp); + source[source_len] = 0; + fclose(fp); + } + } else { + usage(1); + } + } +} + +int main(int argc, char**argv) { + parse_args(argc, argv); + + Lexer lexer = Lexer_new(filename, source, source_len); + Node *ast = parse_program(&lexer); + + if (dump_ast) + print_ast(ast); + + FILE *f = fopen("output.nasm", "w"); + generate_asm(ast, f); + + + free(source); + return 0; +} diff --git a/src/parser.c b/src/parser.c new file mode 100644 index 0000000..cfb5d97 --- /dev/null +++ b/src/parser.c @@ -0,0 +1,562 @@ +#include "parser.h" +#include "utils.h" +#include +#include +#include + +#define MAX_FUNCTION_COUNT 1024 +static Node *all_functions[MAX_FUNCTION_COUNT]; +static i64 function_count = 0; + +static Node *current_function = NULL; + +#define BLOCK_STACK_SIZE 64 +static Node *block_stack[BLOCK_STACK_SIZE]; +static i64 block_stack_count = 0; +static i64 cur_stack_offset = 0; + + +Token do_assert_token(Token token, TokenType type, char *filename, int line) +{ + if (token.type != type) { + Location_print(stderr, token.loc); + fprintf(stderr, ": Expected token of type `%s` but got `%s`\n", token_type_to_str(type), token_type_to_str(token.type)); + fprintf(stderr, "Relevant location in compiler: %s:%d\n", filename, line); + exit(1); + } + return token; +} + +#define assert_token(token, type) do_assert_token(token, type, __FILE__, __LINE__) + +/****** + * Some helpers + */ + +NodeType binary_token_to_op(TokenType type) +{ + switch (type) + { + case TOKEN_PLUS: return OP_PLUS; + case TOKEN_MINUS: return OP_MINUS; + case TOKEN_STAR: return OP_MUL; + case TOKEN_SLASH: return OP_DIV; + case TOKEN_PERCENT: return OP_MOD; + case TOKEN_LSHIFT: return OP_LSHIFT; + case TOKEN_RSHIFT: return OP_RSHIFT; + case TOKEN_AND: return OP_AND; + case TOKEN_OR: return OP_OR; + case TOKEN_XOR: return OP_XOR; + case TOKEN_EQ: return OP_EQ; + case TOKEN_NEQ: return OP_NEQ; + case TOKEN_LT: return OP_LT; + case TOKEN_LEQ: return OP_LEQ; + case TOKEN_GT: return OP_GT; + case TOKEN_GEQ: return OP_GEQ; + + default: assert(false && "binary_token_to_op called with invalid token type"); + } +} + +void block_stack_push(Node *block) +{ + assert(block_stack_count < BLOCK_STACK_SIZE); + assert(current_function); + block_stack[block_stack_count++] = block; +} + +void block_stack_pop() +{ + assert(block_stack_count > 0); + assert(current_function); + Node *block = block_stack[--block_stack_count]; + cur_stack_offset -= block->block.locals_size; + assert(cur_stack_offset >= 0); +} + +void dump_block_stack() +{ + for (i64 i = 0; i < block_stack_count; i++) { + Node *block = block_stack[i]; + for (int i = 0; i < block->block.num_locals; i++) { + printf("%s: offset: %lld\n", block->block.locals[i]->name, block->block.locals[i]->offset); + } + printf("\n"); + } +} + +Variable *find_local_variable(Token *token) +{ + assert_token(*token, TOKEN_IDENTIFIER); + for (i64 i = block_stack_count - 1; i >= 0; --i) { + Node *block = block_stack[i]; + for (int i = 0; i < block->block.num_locals; i++) { + if (strcmp(block->block.locals[i]->name, token->value.as_string) == 0) { + return block->block.locals[i]; + } + } + } + Node *func = current_function; + for (int i = 0; i < func->func.num_args; i++) { + if (strcmp(func->func.args[i].name, token->value.as_string) == 0) { + return &func->func.args[i]; + } + } + return NULL; +} + +Node *find_function_definition(Token *token) +{ + assert_token(*token, TOKEN_IDENTIFIER); + for (i64 i = 0; i < function_count; i++) { + Node *function = all_functions[i]; + if (strcmp(function->func.name, token->value.as_string) == 0) { + return function; + } + } + return NULL; +} + +// TODO: rename this, it's ugly +void add_variable_to_current_block(Variable *var) +{ + // Set offset for variable + Node *cur_block = block_stack[block_stack_count - 1]; + var->offset = cur_stack_offset; + + int new_len = (cur_block->block.num_locals + 1); + int var_size = 8; // TODO: Compute sizes based on different types + + // Add to the block + // FIXME: Use a map here + cur_block->block.locals = realloc(cur_block->block.locals, sizeof(Variable *) * new_len); + cur_block->block.locals[cur_block->block.num_locals] = var; + cur_block->block.num_locals++; + + assert(current_function); + // Update current stack offset (w.r.t function stack frame) and block size + cur_stack_offset += var_size; + block_stack[block_stack_count-1]->block.locals_size += var_size; + + // Update function's max locals size + i64 max_offset = i64max(current_function->func.max_locals_size, cur_stack_offset); + current_function->func.max_locals_size = max_offset; + + assert(cur_stack_offset >= 0); + assert(block_stack_count > 0); +} + +void Node_add_child(Node *parent, Node *child) +{ + // TODO, use a vector + parent->block.children = realloc(parent->block.children, sizeof(Node *) * (parent->block.num_children + 1)); + parent->block.children[parent->block.num_children] = child; + parent->block.num_children++; +} + +Node *Node_new(NodeType type) +{ + Node *self = calloc(sizeof(Node), 1); + self->type = type; + return self; +} + +Type parse_type(Lexer *lexer) +{ + Type type = {0}; + Token token = Lexer_peek(lexer); + if (token.type == TOKEN_INT) { + type.type = TYPE_INT; + Lexer_next(lexer); + } else { + type.type = TYPE_NONE; + } + + while (Lexer_peek(lexer).type == TOKEN_AMPERSAND) { + Lexer_next(lexer); + type.indirection++; + } + + return type; +} + +Node *parse_literal(Lexer *lexer) +{ + Node *node = Node_new(AST_LITERAL); + Token token = assert_token(Lexer_next(lexer), TOKEN_INTLIT); + node->literal.type = (Type) {.type = TYPE_INT}; + node->literal.as_int = token.value.as_int; + return node; +} + +Node *parse_expression(Lexer *); + +Node *parse_var_declaration(Lexer *lexer) +{ + Token token = assert_token(Lexer_next(lexer), TOKEN_LET); + // TODO: Reuse this for globals? Or maybe just make a new function? + if (!current_function || current_function->type != AST_FUNC) + die_location(token.loc, "Variable declaration outside of function"); + + token = assert_token(Lexer_next(lexer), TOKEN_IDENTIFIER); + // NOTE: We don't allow shadowing of variables in the any blocks, + // this is by design since it's a common mistake. + if (find_local_variable(&token) != NULL) + die_location(token.loc, "Variable `%s` already declared", token.value.as_string); + + Node *node = Node_new(AST_VARDECL); + node->var_decl.var.name = token.value.as_string; + + token = Lexer_next(lexer); + if (token.type != TOKEN_COLON) + die_location(token.loc, "Missing type specifier for variable `%s`", node->var_decl.var.name); + node->var_decl.var.type = parse_type(lexer); + + add_variable_to_current_block(&node->var_decl.var); + + token = Lexer_next(lexer); + if (token.type == TOKEN_ASSIGN) { + node->var_decl.value = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); + } else { + assert_token(token, TOKEN_SEMICOLON); + } + + return node; +} + +Node *parse_function_call_args(Lexer *lexer, Node *func) +{ + Token identifier = assert_token(Lexer_next(lexer), TOKEN_IDENTIFIER); + Node *call = Node_new(AST_FUNCCALL); + call->call.func = func; + assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); + Token token = Lexer_peek(lexer); + + while (token.type != TOKEN_CLOSE_PAREN) { + Node *arg = parse_expression(lexer); + + int new_size = call->call.num_args + 1; + call->call.args = realloc(call->call.args, sizeof(Node *) * new_size); + call->call.args[call->call.num_args++] = arg; + + if (new_size > func->func.num_args) + die_location(identifier.loc, "Too many arguments to function `%s`", func->func.name); + + token = Lexer_peek(lexer); + if (token.type == TOKEN_COMMA) { + Lexer_next(lexer); + token = Lexer_peek(lexer); + } + } + + if (call->call.num_args != func->func.num_args) + die_location(identifier.loc, "Too few arguments to function `%s`", func->func.name); + + assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); + return call; +} + +Node *parse_factor(Lexer *lexer) +{ + // TODO: Parse more complicated things + Token token = Lexer_peek(lexer); + Node *expr; + if (token.type == TOKEN_MINUS) { + Lexer_next(lexer); + expr = Node_new(OP_NEG); + expr->unary_expr = parse_factor(lexer); + } else if (token.type == TOKEN_TILDE) { + Lexer_next(lexer); + expr = Node_new(OP_BWINV); + expr->unary_expr = parse_factor(lexer); + } else if (token.type == TOKEN_EXCLAMATION) { + Lexer_next(lexer); + expr = Node_new(OP_NOT); + expr->unary_expr = parse_factor(lexer); + } else if (token.type == TOKEN_OPEN_PAREN) { + Lexer_next(lexer); + expr = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); + } else if (token.type == TOKEN_INTLIT) { + expr = parse_literal(lexer); + } else if (token.type == TOKEN_IDENTIFIER) { + // TODO: Check for global variables when added + + Variable *var = find_local_variable(&token); + if (var != NULL) { + Lexer_next(lexer); + expr = Node_new(AST_VAR); + expr->variable = var; + return expr; + } + + Node *func = find_function_definition(&token); + if (func != NULL) { + return parse_function_call_args(lexer, func); + } + + die_location(token.loc, "Unknown identifier `%s`", token.value.as_string); + expr = NULL; + } else { + die_location(token.loc, ": Expected token found in parse_factor: `%s`", token_type_to_str(token.type)); + exit(1); + } + return expr; +} + +#define BINOP_PARSER(next_parser, predicate) \ + Node *expr = next_parser(lexer); \ + Token token = Lexer_peek(lexer); \ + while (predicate(token.type)) { \ + Lexer_next(lexer); \ + Node *op = Node_new(binary_token_to_op(token.type)); \ + Node *right = next_parser(lexer); \ + op->binary.left = expr; \ + op->binary.right = right; \ + expr = op; \ + token = Lexer_peek(lexer); \ + } \ + return expr; + +bool is_term_token(TokenType type) { return type == TOKEN_STAR || type == TOKEN_SLASH || type == TOKEN_PERCENT; } +Node *parse_term(Lexer *lexer) { BINOP_PARSER(parse_factor, is_term_token); } + +bool is_additive_token(TokenType type) { return type == TOKEN_PLUS || type == TOKEN_MINUS; } +Node *parse_additive(Lexer *lexer) { BINOP_PARSER(parse_term, is_additive_token); } + +bool is_relational_token(TokenType type) { return type == TOKEN_LT || type == TOKEN_LEQ || type == TOKEN_GT || type == TOKEN_GEQ; } +Node *parse_relational(Lexer *lexer) { BINOP_PARSER(parse_additive, is_relational_token); } + +bool is_equality_token(TokenType type) { return type == TOKEN_EQ || type == TOKEN_NEQ; } +Node *parse_equality(Lexer *lexer) { BINOP_PARSER(parse_relational, is_equality_token); } + +bool is_logical_and_token(TokenType type) { return type == TOKEN_AND; } +Node *parse_logical_and(Lexer *lexer) { BINOP_PARSER(parse_equality, is_logical_and_token); } + +bool is_logical_or_token(TokenType type) { return type == TOKEN_OR; } +Node *parse_logical_or(Lexer *lexer) { BINOP_PARSER(parse_logical_and, is_logical_or_token); } + +Node *parse_conditional_exp(Lexer *lexer) +{ + Node *expr = parse_logical_or(lexer); + Token token = Lexer_peek(lexer); + if (token.type == TOKEN_QUESTION) { + Lexer_next(lexer); + Node *then_expr = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_COLON); + Node *else_expr = parse_expression(lexer); + Node *conditional = Node_new(AST_CONDITIONAL); + conditional->conditional.cond = expr; + conditional->conditional.do_then = then_expr; + conditional->conditional.do_else = else_expr; + expr = conditional; + } + return expr; +} + +Node *parse_expression(Lexer *lexer) +{ + Node *node = parse_conditional_exp(lexer); + // FIXME: This is a hack to handle assignment expressions + // and can probably be done properly. + if (node->type == AST_VAR) { + Token token = Lexer_peek(lexer); + if (token.type == TOKEN_ASSIGN) { + Lexer_next(lexer); + Variable *var = node->variable; + node->type = OP_ASSIGN; + node->assign.var = var; + node->assign.value = parse_expression(lexer); + } + } + return node; +} + +Node *parse_block(Lexer *lexer); + +Node *parse_statement(Lexer *lexer) +{ + Node *node; + Token token = Lexer_peek(lexer); + + if (token.type == TOKEN_RETURN) { + assert_token(Lexer_next(lexer), TOKEN_RETURN); + node = Node_new(AST_RETURN); + node->unary_expr = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); + } else if (token.type == TOKEN_IF) { + Lexer_next(lexer); + node = Node_new(AST_IF); + assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); + node->conditional.cond = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); + // TODO: Allow blocks in here once implemented, currently + // we can onle have a single statement in the if/else + node->conditional.do_then = parse_statement(lexer); + token = Lexer_peek(lexer); + if (token.type == TOKEN_ELSE) { + Lexer_next(lexer); + node->conditional.do_else = parse_statement(lexer); + } + } else if (token.type == TOKEN_WHILE) { + Lexer_next(lexer); + node = Node_new(AST_WHILE); + assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); + node->loop.cond = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); + node->loop.body = parse_statement(lexer); + } else if (token.type == TOKEN_FOR) { + Lexer_next(lexer); + node = Node_new(AST_FOR); + assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); + + // All of the expressions in the for loop are optional + + // TODO: Allow this to be a declaration, need to inject + // the variable into the symbol table for the block + if (Lexer_peek(lexer).type != TOKEN_SEMICOLON) + node->loop.init = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); + + if (Lexer_peek(lexer).type != TOKEN_SEMICOLON) + node->loop.cond = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); + + if (Lexer_peek(lexer).type != TOKEN_CLOSE_PAREN) + node->loop.step = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); + + node->loop.body = parse_statement(lexer); + } else if (token.type == TOKEN_OPEN_BRACE) { + node = parse_block(lexer); + } else { + // Default to trying to handle it as an expression + node = parse_expression(lexer); + assert_token(Lexer_next(lexer), TOKEN_SEMICOLON); + } + + return node; +} + +Node *parse_block(Lexer *lexer) +{ + assert_token(Lexer_next(lexer), TOKEN_OPEN_BRACE); + + Node *block = Node_new(AST_BLOCK); + Token token = Lexer_peek(lexer); + + block_stack_push(block); + + while (token.type != TOKEN_CLOSE_BRACE) { + Node *block_item; + if (token.type == TOKEN_LET) { + block_item = parse_var_declaration(lexer); + } else { + // Default to a statement + block_item = parse_statement(lexer); + } + Node_add_child(block, block_item); + token = Lexer_peek(lexer); + } + + block_stack_pop(); + + assert_token(Lexer_next(lexer), TOKEN_CLOSE_BRACE); + return block; +} + +void push_new_function(Node *func) +{ + assert(func->type == AST_FUNC); + assert(function_count < MAX_FUNCTION_COUNT); + all_functions[function_count++] = func; + current_function = func; +} + +void parse_func_args(Lexer *lexer, Node *func) +{ + assert_token(Lexer_next(lexer), TOKEN_OPEN_PAREN); + Token token = Lexer_peek(lexer); + while (token.type != TOKEN_CLOSE_PAREN) { + token = assert_token(Lexer_next(lexer), TOKEN_IDENTIFIER); + // TODO: Check for shadowing with globals + assert_token(Lexer_next(lexer), TOKEN_COLON); + Type type = parse_type(lexer); + + i64 new_count = func->func.num_args + 1; + func->func.args = realloc(func->func.args, sizeof(Variable) * new_count); + Variable *var = &func->func.args[func->func.num_args++]; + var->name = token.value.as_string; + var->type = type; + + token = Lexer_peek(lexer); + if (token.type == TOKEN_COMMA) { + Lexer_next(lexer); + token = Lexer_peek(lexer); + } + } + assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); + + // Set the offsets for the arguments + + // IMPORTANT: We want to skip the saved ret_addr+old_rbp that we + // pushed on the stack. Each of these is 8 bytes. + int offset = -16; + for (int i = 0; i < func->func.num_args; i++) { + Variable *var = &func->func.args[i]; + var->offset = offset; + // TODO: Compute this for different types + int var_size = 8; + offset -= var_size; + } +} + +Node *parse_func(Lexer *lexer) +{ + Token token; + token = assert_token(Lexer_next(lexer), TOKEN_FN); + + Node *func = Node_new(AST_FUNC); + push_new_function(func); + + token = assert_token(Lexer_next(lexer), TOKEN_IDENTIFIER); + func->func.name = token.value.as_string; + parse_func_args(lexer, func); + + token = Lexer_peek(lexer); + if (token.type == TOKEN_COLON) { + // TODO: Parse all return types + assert_token(Lexer_next(lexer), TOKEN_COLON); + func->func.return_type = parse_type(lexer); + } else { + // No return type, void fn. + func->func.return_type = (Type){.type = TYPE_NONE}; + } + + // Make sure there's no funny business with the stack offset + assert(cur_stack_offset == 0); + assert(block_stack_count == 0); + func->func.body = parse_block(lexer); + assert(block_stack_count == 0); + assert(cur_stack_offset == 0); + + return func; +} + +Node *parse_program(Lexer *lexer) +{ + Node *program = Node_new(AST_PROGRAM); + Token token; + while ((token = Lexer_peek(lexer)).type != TOKEN_EOF) { + if (token.type == TOKEN_FN) { + Node *func = parse_func(lexer); + Node_add_child(program, func); + } else { + die_location(token.loc, "Unexpected token in parse_program: `%s`\n", token_type_to_str(token.type)); + exit(1); + break; + } + } + return program; +} \ No newline at end of file diff --git a/src/parser.h b/src/parser.h new file mode 100644 index 0000000..7f7dacb --- /dev/null +++ b/src/parser.h @@ -0,0 +1,7 @@ +#pragma once + +#include "ast.h" +#include "lexer.h" + +Node *parse_program(Lexer *lexer); +void print_ast(Node *node); \ No newline at end of file diff --git a/src/tokens.c b/src/tokens.c new file mode 100644 index 0000000..e562e16 --- /dev/null +++ b/src/tokens.c @@ -0,0 +1,70 @@ +#include "tokens.h" +#include +#include + +Token Token_from_type(TokenType type, Location loc) +{ + Token token = {0}; + token.type = type; + token.loc = loc; + return token; +} + +Token Token_from_int(i64 value, Location loc) +{ + Token token = {0}; + token.type = TOKEN_INTLIT; + token.value.as_int = value; + token.loc = loc; + return token; +} + +Token Token_from_string(char *value, Location loc) +{ + Token token = {0}; + token.type = TOKEN_STRINGLIT; + token.value.as_string = value; + token.loc = loc; + return token; +} + +Token Token_from_identifier(char *value, Location loc) +{ + Token token = {0}; + token.type = TOKEN_IDENTIFIER; + token.value.as_string = value; + token.loc = loc; + return token; +} + +void Location_print(FILE *f, Location loc) +{ + fprintf(f, "%s:%d:%d", loc.filename, loc.line+1, loc.col+1); +} + +char *token_type_to_str(TokenType type) +{ + // Otherwise, just print the token type + switch (type) + { + #define ENUM_TOKEN(name, str) case name: return str; + ENUM_TOKENS(ENUM_TOKEN) + #undef ENUM_TOKEN + default: assert(false && "Unreachable"); + } +} + + +void Token_print(FILE *f, Token *token) +{ + // Handle the different literals manually + switch (token->type) + { + case TOKEN_INTLIT: fprintf(f, "%lld", token->value.as_int); return; + case TOKEN_STRINGLIT: fprintf(f, "\"%s\"", token->value.as_string); return; + case TOKEN_IDENTIFIER: fprintf(f, "%s", token->value.as_string); return; + default: break; + } + + fprintf(f, "%s", token_type_to_str(token->type)); +} \ No newline at end of file diff --git a/src/tokens.h b/src/tokens.h new file mode 100644 index 0000000..44f5e6e --- /dev/null +++ b/src/tokens.h @@ -0,0 +1,83 @@ +#pragma once + +#include "common.h" +#include + +#define ENUM_TOKENS(F) \ + F(TOKEN_AMPERSAND, "&") \ + F(TOKEN_AND, "&&") \ + F(TOKEN_ASSIGN, "=") \ + F(TOKEN_BAR, "|") \ + F(TOKEN_CLOSE_BRACE, "}") \ + F(TOKEN_CLOSE_PAREN, ")") \ + F(TOKEN_COLON, ":") \ + F(TOKEN_COMMA, ",") \ + F(TOKEN_ELSE, "else") \ + F(TOKEN_EOF, "EOF") \ + F(TOKEN_EQ, "==") \ + F(TOKEN_EXCLAMATION, "!") \ + F(TOKEN_FN, "fn") \ + F(TOKEN_FOR, "for") \ + F(TOKEN_GEQ, ">=") \ + F(TOKEN_GT, ">") \ + F(TOKEN_IDENTIFIER, "identifier") \ + F(TOKEN_IF, "if") \ + F(TOKEN_INT, "int") \ + F(TOKEN_INTLIT, "integer literal") \ + F(TOKEN_LEQ, "<=") \ + F(TOKEN_LSHIFT, "<<") \ + F(TOKEN_LET, "let") \ + F(TOKEN_LT, "<") \ + F(TOKEN_MINUS, "-") \ + F(TOKEN_MINUSEQUALS, "-=") \ + F(TOKEN_MINUSMINUS, "--") \ + F(TOKEN_NEQ, "!=") \ + F(TOKEN_OPEN_BRACE, "{") \ + F(TOKEN_OPEN_PAREN, "(") \ + F(TOKEN_OR, "||") \ + F(TOKEN_PERCENT, "%") \ + F(TOKEN_PLUS, "+") \ + F(TOKEN_PLUSEQUALS, "+=") \ + F(TOKEN_PLUSPLUS, "++") \ + F(TOKEN_QUESTION, "?") \ + F(TOKEN_RETURN, "return") \ + F(TOKEN_RSHIFT, ">>") \ + F(TOKEN_SEMICOLON, ";") \ + F(TOKEN_SLASH, "/") \ + F(TOKEN_STAR, "*") \ + F(TOKEN_STRINGLIT, "string literal") \ + F(TOKEN_TILDE, "~") \ + F(TOKEN_WHILE, "while") \ + F(TOKEN_XOR, "^") + +typedef enum { +#define ENUM_TOKEN(name, str) name, + ENUM_TOKENS(ENUM_TOKEN) +#undef ENUM_TOKEN +} TokenType; + +typedef struct { + char *filename; + int line, col; +} Location; + +void Location_print(FILE *f, Location loc); + +typedef struct { + TokenType type; + Location loc; + union tokens { + i64 as_int; + char *as_string; + char as_char; + } value; +} Token; + +char *token_type_to_str(TokenType type); + +Token Token_from_type(TokenType type, Location loc); +Token Token_from_int(i64 value, Location loc); +Token Token_from_string(char *value, Location loc); +Token Token_from_identifier(char *value, Location loc); + +void Token_print(FILE *f, Token *token); \ No newline at end of file diff --git a/src/utils.c b/src/utils.c new file mode 100644 index 0000000..2335981 --- /dev/null +++ b/src/utils.c @@ -0,0 +1,30 @@ +#include "utils.h" + +#include +#include +#include + +void die(const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + exit(1); +} + +void _die_location(char *file, int line, Location loc, const char *fmt, ...) +{ + Location_print(stderr, loc); + fprintf(stderr, ": "); + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + fprintf(stderr, "\n"); + fprintf(stderr, "NOTE: Error occurred in %s:%d\n", file, line); + exit(1); +} + +i64 i64max(i64 a, i64 b) { return a > b ? a : b; } +i64 i64min(i64 a, i64 b) { return a < b ? a : b; } \ No newline at end of file diff --git a/src/utils.h b/src/utils.h new file mode 100644 index 0000000..dfc018c --- /dev/null +++ b/src/utils.h @@ -0,0 +1,12 @@ +#pragma once + +#include "common.h" +#include "tokens.h" + +void die(const char *fmt, ...); +void _die_location(char *file, int line, Location loc, const char *fmt, ...); + +i64 i64max(i64 a, i64 b); +i64 i64min(i64 a, i64 b); + +#define die_location(loc, ...) _die_location(__FILE__, __LINE__, loc, __VA_ARGS__) \ No newline at end of file -- cgit v1.2.3