aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMustafa Quraish <[email protected]>2022-02-05 08:23:14 -0500
committerMustafa Quraish <[email protected]>2022-02-05 08:56:15 -0500
commitaeaf92127d1c090f9281616e49ad10dda414bd45 (patch)
treef85127c08b0caa13b95b3fb80e2996d3b5186434
parentRemove old test which disallowed initializing globals (diff)
downloadcup-aeaf92127d1c090f9281616e49ad10dda414bd45.tar.xz
cup-aeaf92127d1c090f9281616e49ad10dda414bd45.zip
Add implementation of self-hosted compiler so far
There's also a `run.sh` script which does the following: - Compiles the C compiler `build/cupcc` - Compiles the self-hosted compiler `build/cup.out` (with `cupcc`) - Compiles the specified file on CLI with `build/cup.out` - Runs this executable and shows the output
-rw-r--r--compiler/README.md3
-rw-r--r--compiler/ast.cup317
-rw-r--r--compiler/codegen.cup151
-rw-r--r--compiler/lexer.cup288
-rw-r--r--compiler/main.cup34
-rw-r--r--compiler/parser.cup433
-rw-r--r--compiler/tokens.cup238
-rw-r--r--compiler/types.cup82
-rwxr-xr-xrun.sh228
9 files changed, 1574 insertions, 0 deletions
diff --git a/compiler/README.md b/compiler/README.md
new file mode 100644
index 0000000..0e58d43
--- /dev/null
+++ b/compiler/README.md
@@ -0,0 +1,3 @@
+# CUP Compiler in CUP
+
+This is the beginnings of a CUP compiler written in itself. \ No newline at end of file
diff --git a/compiler/ast.cup b/compiler/ast.cup
new file mode 100644
index 0000000..689f7fb
--- /dev/null
+++ b/compiler/ast.cup
@@ -0,0 +1,317 @@
+import "std/vector.cup"
+import "compiler/types.cup"
+
+// Tags for every AST node kind; stored in Node.typ as a plain int.
+enum NodeType {
+ // Unary
+ AST_NEG,
+ AST_NOT,
+ AST_BWINV,
+ AST_ADDROF,
+ AST_DEREF,
+ // Binary
+ AST_PLUS,
+ AST_MINUS,
+ AST_MUL,
+ AST_DIV,
+ AST_MOD,
+ AST_LSHIFT,
+ AST_RSHIFT,
+ AST_AND,
+ AST_BWAND,
+ AST_OR,
+ AST_BWOR,
+ AST_XOR,
+ // Comparison
+ AST_EQ,
+ AST_NEQ,
+ AST_LT,
+ AST_LEQ,
+ AST_GT,
+ AST_GEQ,
+ // Misc.
+ AST_ASSIGN,
+ AST_MEMBER,
+ // AST types
+ AST_LITERAL,
+ AST_CONSTANT,
+ AST_FUNCCALL,
+ AST_CONDITIONAL,
+ AST_IF,
+ AST_WHILE,
+ AST_DEFER,
+ AST_FOR,
+ AST_VARDECL,
+ AST_LOCAL_VAR,
+ AST_GLOBAL_VAR,
+ AST_RETURN,
+ AST_FUNC,
+ AST_BUILTIN,
+ AST_PROGRAM,
+ AST_BLOCK,
+};
+
+// A named variable: its declared type and (for locals) its stack offset.
+struct Variable {
+ name: char *;
+ typ: Type *;
+ offset: int;
+};
+
+// One AST node. `typ` (a NodeType) selects which member of the union `d`
+// is active; `etyp` is the node's expression type where applicable.
+struct Node {
+ typ: int; // NodeType
+ etyp: Type*; // Expression type
+
+ // TODO: Anonymous union members so we can do `Node.binary`, etc.
+ d: union {
+ binary: struct {
+ lhs: Node *;
+ rhs: Node *;
+ };
+
+ // Operand of unary operators and of AST_RETURN.
+ unary: Node *;
+
+ func: struct {
+ name: char *;
+ body: Node *;
+ max_locals_size: int;
+ args: Vector *; // Vector<Variable>
+ };
+
+ // Used by AST_BLOCK and AST_PROGRAM.
+ block: struct {
+ children: Vector *; // Vector<Node *>
+ locals: Vector *; // Vector<Variable>
+ locals_size: int;
+ };
+
+ // Which member is valid depends on etyp (int / char / char*).
+ literal: union {
+ as_int: int;
+ as_char: char;
+ as_string: char *;
+ };
+
+ var_decl: struct {
+ var: Variable;
+ init: Node *;
+ };
+
+ assign: struct {
+ lhs: Node *;
+ rhs: Node *;
+ };
+
+ // Used by AST_IF and AST_CONDITIONAL (ternary).
+ conditional: struct {
+ cond: Node *;
+ then: Node *;
+ els: Node *;
+ };
+
+ // `loop` is keyword in rust, syntax highlighting breaks
+ looop: struct {
+ cond: Node *;
+ body: Node *;
+ // for loop:
+ init: Node *;
+ step: Node *;
+ };
+
+ variable: Variable *;
+
+ call: struct {
+ func: Node *;
+ args: Vector *; // Vector<Node *>
+ };
+
+ member: struct {
+ obj: Node *;
+ offset: int;
+ is_ptr: int;
+ };
+
+ constant: struct {
+ name: char *;
+ value: Node *; // Must be int literal
+ };
+ };
+};
+
+// Count of Nodes allocated so far (stats/debugging only).
+let node_counter = 0;
+
+// Allocate a Node tagged with `typ`; all other fields are left unset.
+fn node_new(typ: int): Node* {
+ let node: Node* = malloc(sizeof(Node));
+ ++node_counter;
+ node.typ = typ;
+ return node;
+}
+
+// Convenience constructor: an integer-literal node with type TYPE_INT.
+fn node_from_int_literal(val: int): Node* {
+ let node: Node* = node_new(AST_LITERAL);
+ node.etyp = type_new(TYPE_INT);
+ node.d.literal.as_int = val;
+ return node;
+}
+
+// Append `child` to a block/program node, creating the vector lazily.
+fn block_add_child(block: Node*, child: Node*) {
+ if (block.d.block.children == null)
+ block.d.block.children = vector_new();
+ vector_push(block.d.block.children, child);
+}
+
+// TODO: Careful here, the input type here is the same as `type_to_string`
+// Map a NodeType value to its name for diagnostics; exits the process
+// if the value is not a known NodeType.
+fn node_type_to_string(typ: int): char* {
+ if (typ == AST_NEG) return "AST_NEG";
+ if (typ == AST_NOT) return "AST_NOT";
+ if (typ == AST_BWINV) return "AST_BWINV";
+ if (typ == AST_ADDROF) return "AST_ADDROF";
+ if (typ == AST_DEREF) return "AST_DEREF";
+ if (typ == AST_PLUS) return "AST_PLUS";
+ if (typ == AST_MINUS) return "AST_MINUS";
+ if (typ == AST_MUL) return "AST_MUL";
+ if (typ == AST_DIV) return "AST_DIV";
+ if (typ == AST_MOD) return "AST_MOD";
+ if (typ == AST_LSHIFT) return "AST_LSHIFT";
+ if (typ == AST_RSHIFT) return "AST_RSHIFT";
+ if (typ == AST_AND) return "AST_AND";
+ if (typ == AST_BWAND) return "AST_BWAND";
+ if (typ == AST_OR) return "AST_OR";
+ if (typ == AST_BWOR) return "AST_BWOR";
+ if (typ == AST_XOR) return "AST_XOR";
+ if (typ == AST_EQ) return "AST_EQ";
+ if (typ == AST_NEQ) return "AST_NEQ";
+ if (typ == AST_LT) return "AST_LT";
+ if (typ == AST_LEQ) return "AST_LEQ";
+ if (typ == AST_GT) return "AST_GT";
+ if (typ == AST_GEQ) return "AST_GEQ";
+ if (typ == AST_ASSIGN) return "AST_ASSIGN";
+ if (typ == AST_MEMBER) return "AST_MEMBER";
+ if (typ == AST_LITERAL) return "AST_LITERAL";
+ if (typ == AST_CONSTANT) return "AST_CONSTANT";
+ if (typ == AST_FUNCCALL) return "AST_FUNCCALL";
+ if (typ == AST_CONDITIONAL) return "AST_CONDITIONAL";
+ if (typ == AST_IF) return "AST_IF";
+ if (typ == AST_WHILE) return "AST_WHILE";
+ if (typ == AST_DEFER) return "AST_DEFER";
+ if (typ == AST_FOR) return "AST_FOR";
+ if (typ == AST_VARDECL) return "AST_VARDECL";
+ if (typ == AST_LOCAL_VAR) return "AST_LOCAL_VAR";
+ if (typ == AST_GLOBAL_VAR) return "AST_GLOBAL_VAR";
+ if (typ == AST_RETURN) return "AST_RETURN";
+ if (typ == AST_FUNC) return "AST_FUNC";
+ if (typ == AST_BUILTIN) return "AST_BUILTIN";
+ if (typ == AST_PROGRAM) return "AST_PROGRAM";
+ if (typ == AST_BLOCK) return "AST_BLOCK";
+
+ puts("Unknown node type in node_type_to_string: ");
+ putu(typ); putc('\n');
+ exit(1);
+}
+
+// True iff `typ` is one of the binary arithmetic/bitwise/logical/comparison
+// operator node kinds.
+fn is_binary_op(typ: int): int {
+ return typ == AST_PLUS || typ == AST_MINUS || typ == AST_MUL ||
+ typ == AST_DIV || typ == AST_MOD || typ == AST_LSHIFT ||
+ typ == AST_RSHIFT || typ == AST_AND || typ == AST_BWAND ||
+ typ == AST_OR || typ == AST_BWOR || typ == AST_XOR ||
+ typ == AST_EQ || typ == AST_NEQ || typ == AST_LT ||
+ typ == AST_LEQ || typ == AST_GT || typ == AST_GEQ;
+}
+
+// True iff `typ` is a unary operator node kind.
+fn is_unary_op(typ: int): int {
+ return typ == AST_NEG || typ == AST_NOT || typ == AST_BWINV ||
+ typ == AST_ADDROF || typ == AST_DEREF;
+}
+
+// True iff a node of kind `typ` denotes an assignable location.
+fn is_lvalue(typ: int): int {
+ return typ == AST_LOCAL_VAR || typ == AST_GLOBAL_VAR ||
+ typ == AST_MEMBER || typ == AST_DEREF;
+}
+
+// Translate a binary-operator token type into the matching AST node kind;
+// exits the process on any other token.
+// Note: both TOKEN_XOR and TOKEN_CARET map to AST_XOR.
+fn binary_token_to_op(token_typ: int): int
+{
+ if (token_typ == TOKEN_PLUS) return AST_PLUS;
+ if (token_typ == TOKEN_MINUS) return AST_MINUS;
+ if (token_typ == TOKEN_STAR) return AST_MUL;
+ if (token_typ == TOKEN_SLASH) return AST_DIV;
+ if (token_typ == TOKEN_PERCENT) return AST_MOD;
+ if (token_typ == TOKEN_LSHIFT) return AST_LSHIFT;
+ if (token_typ == TOKEN_RSHIFT) return AST_RSHIFT;
+ if (token_typ == TOKEN_AND) return AST_AND;
+ if (token_typ == TOKEN_OR) return AST_OR;
+ if (token_typ == TOKEN_XOR) return AST_XOR;
+ if (token_typ == TOKEN_EQ) return AST_EQ;
+ if (token_typ == TOKEN_NEQ) return AST_NEQ;
+ if (token_typ == TOKEN_LT) return AST_LT;
+ if (token_typ == TOKEN_LEQ) return AST_LEQ;
+ if (token_typ == TOKEN_GT) return AST_GT;
+ if (token_typ == TOKEN_GEQ) return AST_GEQ;
+ if (token_typ == TOKEN_AMPERSAND) return AST_BWAND;
+ if (token_typ == TOKEN_BAR) return AST_BWOR;
+ if (token_typ == TOKEN_CARET) return AST_XOR;
+
+ puts("Unknown token in binary_token_to_op: ");
+ putsln(token_type_to_string(token_typ));
+ exit(1);
+}
+
+// Pretty-print `node` and its children, indented by 2*depth spaces.
+fn dump_ast(node: Node*, depth: int) {
+ for (let i = 0; i < 2*depth; ++i)
+ putc(' ');
+ if (node.typ == AST_PROGRAM || node.typ == AST_BLOCK) {
+ putsln(node_type_to_string(node.typ));
+ for (let i = 0; i < node.d.block.children.size; ++i) {
+ dump_ast(node.d.block.children.data[i], depth + 1);
+ }
+ } else if (is_binary_op(node.typ)) {
+ putsln(node_type_to_string(node.typ));
+ dump_ast(node.d.binary.lhs, depth + 1);
+ dump_ast(node.d.binary.rhs, depth + 1);
+ } else if (is_unary_op(node.typ) || node.typ == AST_RETURN) {
+ putsln(node_type_to_string(node.typ));
+ // BUGFIX: `return;` is parsed with a null operand; don't recurse on it.
+ if (node.d.unary)
+ dump_ast(node.d.unary, depth + 1);
+
+ } else if (node.typ == AST_LITERAL) {
+ if (node.etyp.typ == TYPE_INT) {
+ putu(node.d.literal.as_int); putc('\n');
+ } else if (node.etyp.typ == TYPE_PTR) {
+ putc('"'); puts(node.d.literal.as_string); putc('"'); putc('\n');
+ } else if (node.etyp.typ == TYPE_CHAR) {
+ putc('\''); putc(node.d.literal.as_char); putc('\''); putc('\n');
+ } else {
+ die("Unknown literal type in dump_ast");
+ }
+ } else if (node.typ == AST_FUNC) {
+ puts("func "); puts(node.d.func.name); puts("()\n");
+ dump_ast(node.d.func.body, depth + 1);
+ } else if (node.typ == AST_VARDECL) {
+ puts("let "); puts(node.d.var_decl.var.name);
+ // BUGFIX: compare the type's tag (`.typ.typ`) with TYPE_PTR; the old
+ // code compared the Type* pointer itself against the enum constant.
+ if (node.d.var_decl.var.typ.typ == TYPE_PTR) {
+ puts(": ");
+ puts(create_type_string(node.d.var_decl.var.typ));
+ }
+ if (node.d.var_decl.init) {
+ puts(" =\n");
+ dump_ast(node.d.var_decl.init, depth + 1);
+ } else {
+ putc('\n');
+ }
+ } else {
+ putsln(node_type_to_string(node.typ));
+ }
+} \ No newline at end of file
diff --git a/compiler/codegen.cup b/compiler/codegen.cup
new file mode 100644
index 0000000..41eea33
--- /dev/null
+++ b/compiler/codegen.cup
@@ -0,0 +1,151 @@
+import "compiler/ast.cup"
+import "std/file.cup"
+
+// Output file all emit_* helpers write to; set by generate_program.
+let gen_out_file: File*;
+
+// Write four raw strings to the output assembly file, in order.
+fn emit_asm4(msg1: char*, msg2: char*, msg3: char*, msg4: char*) {
+ fwrite(gen_out_file, msg1, strlen(msg1));
+ fwrite(gen_out_file, msg2, strlen(msg2));
+ fwrite(gen_out_file, msg3, strlen(msg3));
+ fwrite(gen_out_file, msg4, strlen(msg4));
+}
+
+// Write three raw strings to the output assembly file, in order.
+fn emit_asm3(msg1: char*, msg2: char*, msg3: char*) {
+ fwrite(gen_out_file, msg1, strlen(msg1));
+ fwrite(gen_out_file, msg2, strlen(msg2));
+ fwrite(gen_out_file, msg3, strlen(msg3));
+}
+
+// Write two raw strings to the output assembly file, in order.
+fn emit_asm2(msg1: char*, msg2: char*) {
+ fwrite(gen_out_file, msg1, strlen(msg1));
+ fwrite(gen_out_file, msg2, strlen(msg2));
+}
+
+// Write one raw string to the output assembly file.
+fn emit_asm(msg: char*) {
+ fwrite(gen_out_file, msg, strlen(msg));
+}
+
+// Write a decimal number to the output assembly file.
+fn emit_num(num: int) {
+ fputu(gen_out_file, num);
+}
+
+// Emit a syscall with the given syscall number in rax.
+fn generate_syscall(num: int) {
+ emit_asm(" mov rax, "); emit_num(num); emit_asm("\n");
+ emit_asm(" syscall\n");
+}
+
+// Emit code leaving the value of expression `node` in rax.
+// Binary operators evaluate rhs first (pushed), then lhs into rax, then
+// pop rhs into rbx — so rax = lhs OP rhs.
+// NOTE(review): node kinds with no branch below emit nothing at all, so
+// rax keeps whatever it held before; callers get silent garbage for
+// unsupported expressions.
+fn generate_expr_into_rax(node: Node*) {
+ if (node.typ == AST_LITERAL) {
+ if (node.etyp.typ == TYPE_INT) {
+ emit_asm(" mov rax, "); emit_num(node.d.literal.as_int); emit_asm("\n");
+ } else {
+ die("Unsupported literal type in generate_expr_into_rax");
+ }
+ } else if (node.typ == AST_PLUS) {
+ generate_expr_into_rax(node.d.binary.rhs);
+ emit_asm(" push rax\n");
+ generate_expr_into_rax(node.d.binary.lhs);
+ emit_asm(" pop rbx\n");
+ emit_asm(" add rax, rbx\n");
+ } else if (node.typ == AST_MINUS) {
+ generate_expr_into_rax(node.d.binary.rhs);
+ emit_asm(" push rax\n");
+ generate_expr_into_rax(node.d.binary.lhs);
+ emit_asm(" pop rbx\n");
+ emit_asm(" sub rax, rbx\n");
+ } else if (node.typ == AST_DIV) {
+ generate_expr_into_rax(node.d.binary.rhs);
+ emit_asm(" push rax\n");
+ generate_expr_into_rax(node.d.binary.lhs);
+ emit_asm(" pop rbx\n");
+ // cqo sign-extends rax into rdx:rax before the signed divide.
+ emit_asm(" cqo\n");
+ emit_asm(" idiv rbx\n");
+
+ } else if (node.typ == AST_MOD) {
+ generate_expr_into_rax(node.d.binary.rhs);
+ emit_asm(" push rax\n");
+ generate_expr_into_rax(node.d.binary.lhs);
+ emit_asm(" pop rbx\n");
+ emit_asm(" cqo\n");
+ emit_asm(" idiv rbx\n");
+ // idiv leaves the remainder in rdx.
+ emit_asm(" mov rax, rdx\n");
+
+ } else if (node.typ == AST_MUL) {
+ generate_expr_into_rax(node.d.binary.rhs);
+ emit_asm(" push rax\n");
+ generate_expr_into_rax(node.d.binary.lhs);
+ emit_asm(" pop rbx\n");
+ emit_asm(" imul rbx\n");
+ }
+}
+
+// Emit code for one statement. Only `return` is handled so far; any other
+// statement kind is silently skipped.
+fn generate_statement(node: Node*) {
+ if (node.typ == AST_RETURN) {
+ generate_expr_into_rax(node.d.unary);
+ // Function epilogue: restore the caller's frame and return.
+ emit_asm(" mov rsp, rbp\n");
+ emit_asm(" pop rbp\n");
+ emit_asm(" ret\n");
+ }
+}
+
+// Emit each child statement of a block node, in order.
+fn generate_block(node: Node*) {
+ let n = node.d.block.children.size;
+ for (let i = 0; i < n; ++i) {
+ generate_statement(node.d.block.children.data[i]);
+ }
+}
+
+// Emit a function: a global `func_<name>` label, a prologue reserving
+// max_locals_size bytes, the body, and a fallthrough epilogue for
+// functions that never hit an explicit return.
+fn generate_function(node: Node*) {
+ emit_asm3("global func_", node.d.func.name, "\n");
+ emit_asm3("func_", node.d.func.name, ":\n");
+ emit_asm(" push rbp\n");
+ emit_asm(" mov rbp, rsp\n");
+ emit_asm(" sub rsp, "); emit_num(node.d.func.max_locals_size); emit_asm("\n");
+
+ generate_block(node.d.func.body);
+
+ emit_asm(" mov rsp, rbp\n");
+ emit_asm(" pop rbp\n");
+ emit_asm(" ret\n");
+}
+
+// Emit the whole program: every top-level function, then an OS-specific
+// entry stub that pushes argc/argv, calls func_main, and exits with its
+// return value via the exit syscall.
+fn generate_program(ast: Node*, file: File*) {
+ gen_out_file = file;
+
+ let n = ast.d.block.children.size;
+ for (let i = 0; i < n; ++i) {
+ let node: Node* = ast.d.block.children.data[i];
+ if (node.typ == AST_FUNC) {
+ generate_function(node);
+ } else {
+ die("Unknown node type in generate_program");
+ }
+ }
+
+ if (OS_IS_MACOS) {
+ // macOS: libc calls _main with argc in rdi, argv in rsi.
+ emit_asm("global _main\n");
+ emit_asm("_main:\n");
+ // Push argv
+ emit_asm(" mov rax, rsi\n");
+ emit_asm(" push rax\n");
+ // Push argc
+ emit_asm(" mov rax, rdi\n");
+ emit_asm(" push rax\n");
+ } else {
+ // Linux: raw _start; argc is at [rsp], argv begins 8 bytes above.
+ emit_asm("global _start\n");
+ emit_asm("_start:\n");
+
+ emit_asm(" mov rbp, rsp\n");
+ // // Push argv
+ emit_asm(" mov rax, rbp\n");
+ emit_asm(" add rax, 8\n");
+ emit_asm(" push rax\n");
+ // Push argc
+ emit_asm(" mov rax, [rbp]\n");
+ emit_asm(" push rax\n");
+ }
+
+ emit_asm(" call func_main\n");
+ // Exit status = main's return value.
+ emit_asm(" mov rdi, rax\n");
+ generate_syscall(SYS_exit);
+} \ No newline at end of file
diff --git a/compiler/lexer.cup b/compiler/lexer.cup
new file mode 100644
index 0000000..ff22d8f
--- /dev/null
+++ b/compiler/lexer.cup
@@ -0,0 +1,288 @@
+import "compiler/tokens.cup"
+
+// Lexer state: the source buffer plus the current cursor (`pos`) and the
+// human-readable location (filename/line/col) used for diagnostics.
+struct Lexer {
+ src: char*;
+ len: int;
+ pos: int;
+
+ filename: char*;
+ line: int;
+ col: int;
+};
+
+// Allocate a lexer over `src` (length `len`), tagged with `filename` for
+// diagnostics.
+// BUGFIX: explicitly reset pos/line/col — malloc gives no zeroing
+// guarantee, and lexer_next/lexer_loc read these fields immediately.
+fn lexer_new(filename: char*, src: char*, len: int): Lexer* {
+ let lexer: Lexer* = malloc(sizeof(Lexer));
+ lexer.filename = filename;
+ lexer.src = src;
+ lexer.len = len;
+ lexer.pos = 0;
+ lexer.line = 1; // diagnostics use 1-based lines
+ lexer.col = 0;
+ return lexer;
+}
+
+// Copy the lexer's current location into `loc`.
+fn lexer_loc(lexer: Lexer*, loc: Location*) {
+ loc.filename = lexer.filename;
+ loc.line = lexer.line;
+ loc.col = lexer.col;
+}
+
+// ASCII character classifiers used by the scanner.
+fn is_space(c: char): int {
+ return c == ' ' || c == '\t' || c == '\n' || c == '\r';
+}
+
+fn is_digit(c: char): int {
+ return c >= '0' && c <= '9';
+}
+
+// Letters and underscore: characters that may start an identifier.
+fn is_alpha(c: char): int {
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
+}
+
+fn is_alnum(c: char): int {
+ return is_digit(c) || is_alpha(c);
+}
+
+// Advance the cursor past any whitespace, keeping line/col in sync.
+fn lexer_skip_whitespace(lexer: Lexer*) {
+ while (lexer.pos < lexer.len && is_space(lexer.src[lexer.pos])) {
+ if (lexer.src[lexer.pos] == '\n') {
+ // BUGFIX: was `lexer.line = lexer.pos + 1`, which clobbered the
+ // line counter with a byte offset; a newline advances line by one.
+ lexer.line = lexer.line + 1;
+ lexer.col = 0;
+ } else {
+ lexer.col = lexer.col + 1;
+ }
+ lexer.pos = lexer.pos + 1;
+ }
+}
+
+// Whole-word match of `str` at the current position: nonzero iff the
+// source starts with `str` here and the next character (if any) cannot
+// continue an identifier/number. Used for keyword matching.
+fn lexer_starts_with(lexer: Lexer*, str: char*): int {
+ let len = strlen(str);
+ if (lexer.len - lexer.pos < len)
+ return 0;
+ for (let i = 0; i < len; ++i)
+ if (lexer.src[lexer.pos + i] != str[i])
+ return 0;
+ let end_pos = lexer.pos + len;
+ if (end_pos == lexer.len)
+ return len;
+ let end_char = lexer.src[end_pos];
+ return !(is_digit(end_char) || is_alpha(end_char));
+}
+
+// Advance the cursor by `n` characters on the current line.
+fn lexer_advance(lexer: Lexer*, n: int) {
+ lexer.pos = lexer.pos + n;
+ lexer.col = lexer.col + n;
+}
+
+// Look `n` characters ahead without consuming; returns 0 at end of input.
+fn lexer_peek_char(lexer: Lexer*, n: int): char {
+ if (lexer.pos + n >= lexer.len)
+ return 0;
+ return lexer.src[lexer.pos + n];
+}
+
+// Fill `token` with type `typ` at the current location and consume `inc`
+// characters.
+fn lexer_make_token(lexer: Lexer*, token: Token*, typ: int, inc: int) {
+ lexer_loc(lexer, &token.loc);
+ lexer_advance(lexer, inc);
+ token.typ = typ;
+}
+
+// Scan the next token from the source into `token`. Produces TOKEN_EOF at
+// end of input and dies on any character it cannot tokenize.
+// BUGFIX: removed leftover debug prints (putsln("101.1"), pos/len/char
+// dumps) that polluted stdout on every token, and deduplicated the dead
+// repeated 'n'/'t' escape branches in the char-literal handler.
+fn lexer_next(lexer: Lexer*, token: Token*) {
+ while (lexer.pos < lexer.len) {
+ let c = lexer.src[lexer.pos];
+
+ if (c == '\n') { ++lexer.line; lexer.col = 0; ++lexer.pos; }
+ else if (is_space(c)) { lexer_advance(lexer, 1); }
+ // Single-character tokens.
+ else if (c == '(') { return lexer_make_token(lexer, token, TOKEN_OPEN_PAREN, 1); }
+ else if (c == ')') { return lexer_make_token(lexer, token, TOKEN_CLOSE_PAREN, 1); }
+ else if (c == '{') { return lexer_make_token(lexer, token, TOKEN_OPEN_BRACE, 1); }
+ else if (c == '}') { return lexer_make_token(lexer, token, TOKEN_CLOSE_BRACE, 1); }
+ else if (c == '[') { return lexer_make_token(lexer, token, TOKEN_OPEN_BRACKET, 1); }
+ else if (c == ']') { return lexer_make_token(lexer, token, TOKEN_CLOSE_BRACKET, 1); }
+ else if (c == ';') { return lexer_make_token(lexer, token, TOKEN_SEMICOLON, 1); }
+ else if (c == ':') { return lexer_make_token(lexer, token, TOKEN_COLON, 1); }
+ else if (c == '~') { return lexer_make_token(lexer, token, TOKEN_TILDE, 1); }
+ else if (c == '?') { return lexer_make_token(lexer, token, TOKEN_QUESTION, 1); }
+ else if (c == '^') { return lexer_make_token(lexer, token, TOKEN_CARET, 1); }
+ else if (c == '.') { return lexer_make_token(lexer, token, TOKEN_DOT, 1); }
+ else if (c == ',') { return lexer_make_token(lexer, token, TOKEN_COMMA, 1); }
+ else if (c == '*') { return lexer_make_token(lexer, token, TOKEN_STAR, 1); }
+ else if (c == '%') { return lexer_make_token(lexer, token, TOKEN_PERCENT, 1); }
+
+ else if (c == '/' && lexer_peek_char(lexer, 1) == '/') {
+ lexer.pos = lexer.pos + 2; // skip the '//'
+ while (lexer.pos < lexer.len && lexer.src[lexer.pos] != '\n')
+ ++lexer.pos;
+ // Implicit `continue`
+ }
+
+ // This needs to go after the comment check.
+ else if (c == '/') {
+ return lexer_make_token(lexer, token, TOKEN_SLASH, 1);
+ }
+
+ // One- or two-character tokens.
+ else if (c == '&') {
+ if (lexer_peek_char(lexer, 1) == '&')
+ return lexer_make_token(lexer, token, TOKEN_AND, 2);
+ return lexer_make_token(lexer, token, TOKEN_AMPERSAND, 1);
+ }
+
+ else if (c == '!') {
+ if (lexer_peek_char(lexer, 1) == '=')
+ return lexer_make_token(lexer, token, TOKEN_NEQ, 2);
+ return lexer_make_token(lexer, token, TOKEN_EXCLAMATION, 1);
+ }
+
+ else if (c == '<') {
+ if (lexer_peek_char(lexer, 1) == '=')
+ return lexer_make_token(lexer, token, TOKEN_LEQ, 2);
+ return lexer_make_token(lexer, token, TOKEN_LT, 1);
+ }
+
+ else if (c == '>') {
+ if (lexer_peek_char(lexer, 1) == '=')
+ return lexer_make_token(lexer, token, TOKEN_GEQ, 2);
+ return lexer_make_token(lexer, token, TOKEN_GT, 1);
+ }
+
+ else if (c == '=') {
+ if (lexer_peek_char(lexer, 1) == '=')
+ return lexer_make_token(lexer, token, TOKEN_EQ, 2);
+ return lexer_make_token(lexer, token, TOKEN_ASSIGN, 1);
+ }
+
+ else if (c == '|') {
+ if (lexer_peek_char(lexer, 1) == '|')
+ return lexer_make_token(lexer, token, TOKEN_OR, 2);
+ return lexer_make_token(lexer, token, TOKEN_BAR, 1);
+ }
+
+
+ else if (c == '+') {
+ if (lexer_peek_char(lexer, 1) == '+')
+ return lexer_make_token(lexer, token, TOKEN_PLUSPLUS, 2);
+ if (lexer_peek_char(lexer, 1) == '=')
+ return lexer_make_token(lexer, token, TOKEN_PLUSEQUALS, 2);
+ return lexer_make_token(lexer, token, TOKEN_PLUS, 1);
+ }
+
+ else if (c == '-') {
+ if (lexer_peek_char(lexer, 1) == '-')
+ return lexer_make_token(lexer, token, TOKEN_MINUSMINUS, 2);
+ if (lexer_peek_char(lexer, 1) == '=')
+ return lexer_make_token(lexer, token, TOKEN_MINUSEQUALS, 2);
+ return lexer_make_token(lexer, token, TOKEN_MINUS, 1);
+ }
+
+ else {
+ // Parse the keywords...
+ for (let i = TOKEN__KEYWORD_BEGIN+1; i < TOKEN__KEYWORD_END; ++i) {
+ let str = keyword_to_string(i);
+ if (lexer_starts_with(lexer, str)) {
+ return lexer_make_token(lexer, token, i, strlen(str));
+ }
+ }
+
+ // Parse numbers:
+ if (is_digit(c)) {
+ // TODO: Parse hex and octal numbers
+ let pos = lexer.pos;
+ while (pos < lexer.len && is_digit(lexer.src[pos]))
+ ++pos;
+ let loc: Location;
+ lexer_loc(lexer, &loc);
+ token_from_int(token, atoi(lexer.src + lexer.pos), &loc);
+ lexer_advance(lexer, pos - lexer.pos);
+ return;
+ }
+
+ // Parse identifiers:
+ if (is_alpha(lexer.src[lexer.pos])) {
+ let pos = lexer.pos;
+ while (pos < lexer.len && is_alnum(lexer.src[pos]))
+ ++pos;
+ let str_len = pos - lexer.pos;
+ let str: char* = malloc(str_len + 1);
+ memcpy(str, lexer.src + lexer.pos, str_len);
+ str[str_len] = '\0';
+ let loc: Location;
+ lexer_loc(lexer, &loc);
+ token_from_identifier(token, str, &loc);
+ lexer_advance(lexer, str_len);
+ return;
+ }
+
+ if (c == '"') {
+ let pos = lexer.pos + 1;
+ while (pos < lexer.len && lexer.src[pos] != '"')
+ ++pos;
+
+ let loc: Location;
+ lexer_loc(lexer, &loc);
+
+ if (pos == lexer.len)
+ die_loc(&loc, "EOF while parsing string literal");
+
+ // Careful with indexing here, because we want to skip opening and closing quotes
+ let str_len = pos - lexer.pos - 1;
+ let str: char* = malloc(str_len + 1);
+ memcpy(str, lexer.src + lexer.pos + 1, str_len);
+ str[str_len] = '\0';
+ token_from_string(token, str, &loc);
+ lexer_advance(lexer, pos - lexer.pos + 1);
+ return;
+ }
+
+ if (c == '\'') {
+ let pos = lexer.pos + 1;
+ // TODO: Handle malformed / incomplete literals
+ // TODO: Handle escapes
+ c = lexer.src[pos];
+ if (c == '\\') {
+ ++pos;
+ c = lexer.src[pos];
+ // Translate the escape; unknown escapes pass through unchanged.
+ if (c == 'n') { c = '\n'; }
+ else if (c == 't') { c = '\t'; }
+ else if (c == 'r') { c = '\r'; }
+ else if (c == '0') { c = '\0'; }
+ else { }
+ // TODO: Handle octal and hex escapes
+ }
+
+ let loc: Location;
+ lexer_loc(lexer, &loc);
+ token_from_char(token, c, &loc);
+ lexer_advance(lexer, pos - lexer.pos + 2);
+ return;
+ }
+
+ puts("Unknown character in lexer_next: '"); putc(c); putsln("'");
+ die("Exiting");
+ }
+ }
+ return lexer_make_token(lexer, token, TOKEN_EOF, 0);
+}
+
+// Like lexer_next, but exits with a diagnostic if the scanned token's
+// type is not `expected`.
+fn lexer_next_assert(lexer: Lexer*, token: Token*, expected: int) {
+ lexer_next(lexer, token);
+ if (token.typ != expected) {
+ location_print(&token.loc);
+ puts(": Expected "); puts(token_type_to_string(expected));
+ puts(" but got "); puts(token_type_to_string(token.typ));
+ putc('\n');
+ exit(1);
+ }
+}
+
+// One-token lookahead: run lexer_next, then restore the cursor/location.
+fn lexer_peek(lexer: Lexer*, token: Token*) {
+ let pos = lexer.pos;
+ let col = lexer.col;
+ let line = lexer.line;
+ lexer_next(lexer, token);
+ lexer.pos = pos;
+ lexer.col = col;
+ lexer.line = line;
+} \ No newline at end of file
diff --git a/compiler/main.cup b/compiler/main.cup
new file mode 100644
index 0000000..a0a3476
--- /dev/null
+++ b/compiler/main.cup
@@ -0,0 +1,34 @@
+import "std/file.cup"
+import "compiler/lexer.cup"
+import "compiler/parser.cup"
+import "compiler/codegen.cup"
+
+// Entry point: read the source file named on the command line, parse it,
+// dump the AST to stdout, and generate NASM assembly into build/host.nasm.
+fn main(argc: int, argv: char **): int {
+ if (argc != 2)
+ die("Usage: cupcc <input_file>");
+
+ let input_file = fopen(argv[1], 'r');
+ defer fclose(input_file);
+
+ // using `fmap` here doesn't work on linux, for some reason.
+ let file_size = fsize(input_file);
+ let src: char* = malloc(file_size+1);
+ fread(input_file, src, file_size);
+ src[file_size] = '\0'; // NUL-terminate so the lexer/atoi can over-read safely
+
+ let lexer = lexer_new(argv[1], src, file_size);
+ let ast = parse_program(lexer);
+
+ dump_ast(ast, 0);
+
+ let out_file = fopen("build/host.nasm", 'w');
+ defer fclose(out_file);
+
+ generate_program(ast, out_file);
+
+ puts("---------------------------\n");
+
+ puts("Total amount of memory used by malloc: ");
+ putu(__malloc_buf_pos);
+ putsln("\nDone.");
+} \ No newline at end of file
diff --git a/compiler/parser.cup b/compiler/parser.cup
new file mode 100644
index 0000000..48e4514
--- /dev/null
+++ b/compiler/parser.cup
@@ -0,0 +1,433 @@
+import "compiler/ast.cup"
+import "compiler/lexer.cup"
+
+// p_ prefix for parser global variables.
+
+// NOTE(review): these globals are not yet referenced by any code in this
+// file — presumably reserved for upcoming symbol-table/stack-layout work.
+let p_all_functions = vector_new();
+
+let p_block_stack = vector_new();
+let p_cur_stack_offset = 0;
+
+// Consume one literal token (int/string/char) and build an AST_LITERAL
+// node with the matching expression type; dies on any other token.
+fn parse_literal(lexer: Lexer*): Node* {
+ let token: Token;
+ lexer_next(lexer, &token);
+ let node = node_new(AST_LITERAL);
+
+ if (token.typ == TOKEN_INTLIT) {
+ node.d.literal.as_int = token.value.as_int;
+ node.etyp = type_new(TYPE_INT);
+ } else if (token.typ == TOKEN_STRINGLIT) {
+ node.d.literal.as_string = token.value.as_string;
+ node.etyp = type_new_ptr(TYPE_CHAR);
+ } else if (token.typ == TOKEN_CHARLIT) {
+ node.d.literal.as_char = token.value.as_char;
+ node.etyp = type_new(TYPE_CHAR);
+ } else {
+ die_loc2(&token.loc, "Unexpected token in parse_literal: ", token_type_to_string(token.typ));
+ }
+ return node;
+}
+
+// Parse a type: a primitive base (int/char/void) followed by any number
+// of `*` pointer suffixes. Array suffixes are not implemented yet.
+// NOTE(review): if the next token is none of int/char/void, `typ` is
+// returned uninitialized — presumably user-defined/struct types are a
+// TODO; confirm callers only reach this on primitive tokens.
+fn parse_type(lexer: Lexer*): Type* {
+ let token: Token;
+ let typ: Type *;
+ lexer_peek(lexer, &token);
+ if (token.typ == TOKEN_INT) {
+ lexer_next(lexer, &token);
+ typ = type_new(TYPE_INT);
+ } else if (token.typ == TOKEN_CHAR) {
+ lexer_next(lexer, &token);
+ typ = type_new(TYPE_CHAR);
+ } else if (token.typ == TOKEN_VOID) {
+ lexer_next(lexer, &token);
+ typ = type_new(TYPE_VOID);
+ }
+
+ // Wrap the base type once per `*` suffix.
+ let running = true;
+ while (running) {
+ lexer_peek(lexer, &token);
+ if (token.typ == TOKEN_STAR) {
+ lexer_next(lexer, &token);
+ let ptr = type_new(TYPE_PTR);
+ ptr.ptr = typ;
+ typ = ptr;
+ } else if (token.typ == TOKEN_OPEN_BRACKET) {
+ die("Array types not yet implemented");
+ } else {
+ running = false;
+ }
+ }
+ return typ;
+}
+
+// pragma region expressions
+fn parse_expression(lexer: Lexer*): Node*;
+
+// Parse a factor: a unary operator applied to a factor, a parenthesized
+// expression, or a literal. Dies on anything else.
+fn parse_factor(lexer: Lexer*): Node* {
+ let token: Token;
+ let expr: Node*;
+ lexer_peek(lexer, &token);
+
+ if (token.typ == TOKEN_MINUS) {
+ lexer_next(lexer, &token);
+ expr = node_new(AST_NEG);
+ expr.d.unary = parse_factor(lexer);
+
+ } else if (token.typ == TOKEN_TILDE) {
+ lexer_next(lexer, &token);
+ expr = node_new(AST_BWINV);
+ expr.d.unary = parse_factor(lexer);
+
+ } else if (token.typ == TOKEN_EXCLAMATION) {
+ lexer_next(lexer, &token);
+ expr = node_new(AST_NOT);
+ expr.d.unary = parse_factor(lexer);
+
+ } else if (token.typ == TOKEN_OPEN_PAREN) {
+ lexer_next(lexer, &token);
+ expr = parse_expression(lexer);
+ lexer_next_assert(lexer, &token, TOKEN_CLOSE_PAREN);
+
+ } else if (is_literal_token(token.typ)) {
+ expr = parse_literal(lexer);
+
+ } else {
+ die_loc2(&token.loc, ": Unexpected token found in parse_factor: ", token_type_to_string(token.typ));
+ }
+ return expr;
+}
+
+// This is absolutely terrible, but I'm not sure how to do it better without macros...
+// Precedence ladder: each function parses one binary-precedence level as a
+// left-associative chain over the next-tighter level.
+
+// Level: * / %
+fn parse_term(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_factor(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_STAR || token.typ == TOKEN_SLASH || token.typ == TOKEN_PERCENT) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_factor(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Level: + -
+fn parse_additive(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_term(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_PLUS || token.typ == TOKEN_MINUS) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_term(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Level: < <= > >=
+fn parse_relational(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_additive(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_LT || token.typ == TOKEN_LEQ ||
+ token.typ == TOKEN_GT || token.typ == TOKEN_GEQ) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_additive(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Level: == !=
+fn parse_equality(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_relational(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_EQ || token.typ == TOKEN_NEQ) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_relational(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Level: & (bitwise and)
+fn parse_and(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_equality(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_AMPERSAND) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_equality(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Level: ^ (bitwise xor)
+fn parse_exclusive_or(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_and(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_CARET) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_and(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Level: | (bitwise or)
+fn parse_inclusive_or(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_exclusive_or(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_BAR) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_exclusive_or(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Level: && (logical and)
+fn parse_logical_and(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_inclusive_or(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_AND) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_inclusive_or(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Level: || (logical or)
+fn parse_logical_or(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_logical_and(lexer);
+ lexer_peek(lexer, &token);
+ while (token.typ == TOKEN_OR) {
+ lexer_next(lexer, &token);
+ let op = node_new(binary_token_to_op(token.typ));
+ let rhs = parse_logical_and(lexer);
+ op.d.binary.lhs = lhs;
+ op.d.binary.rhs = rhs;
+ lhs = op;
+ lexer_peek(lexer, &token);
+ }
+ return lhs;
+}
+
+// Parse a ternary `cond ? then : else` on top of the logical-or level.
+fn parse_conditional_exp(lexer: Lexer*): Node* {
+ let token: Token;
+
+ let lhs = parse_logical_or(lexer);
+ lexer_peek(lexer, &token);
+ if (token.typ == TOKEN_QUESTION) {
+ lexer_next(lexer, &token);
+ let then = parse_expression(lexer);
+ lexer_next_assert(lexer, &token, TOKEN_COLON);
+ let els = parse_expression(lexer);
+
+ let cond = node_new(AST_CONDITIONAL);
+ cond.d.conditional.cond = lhs;
+ cond.d.conditional.then = then;
+ cond.d.conditional.els = els;
+
+ lhs = cond;
+ }
+ return lhs;
+}
+
+// Top of the expression grammar (assignment not yet handled here).
+fn parse_expression(lexer: Lexer*): Node* {
+ return parse_conditional_exp(lexer);
+}
+
+// Parse `let name[: type][= expr]`; at least one of the type or the
+// initializer must be present. Caller consumes the trailing semicolon.
+// NOTE(review): when only a type is given, d.var_decl.init is never
+// assigned here, yet dump_ast tests `if (node.d.var_decl.init)` —
+// presumably this relies on the allocator zeroing memory; confirm.
+fn parse_var_declaration(lexer: Lexer*): Node* {
+ let token: Token;
+ lexer_next_assert(lexer, &token, TOKEN_LET);
+ lexer_next_assert(lexer, &token, TOKEN_IDENTIFIER);
+ // TODO: check if identifier is already defined
+ let node = node_new(AST_VARDECL);
+ node.d.var_decl.var.name = token.value.as_string;
+
+ lexer_peek(lexer, &token);
+ let has_type = false;
+ if (token.typ == TOKEN_COLON) {
+ lexer_next(lexer, &token);
+ has_type = true;
+ node.d.var_decl.var.typ = parse_type(lexer);
+ lexer_peek(lexer, &token);
+ }
+
+ if (token.typ == TOKEN_ASSIGN) {
+ lexer_next(lexer, &token);
+ node.d.var_decl.init = parse_expression(lexer);
+ } else if (!has_type) {
+ die_loc(&token.loc, "Expected ':' or '=' after variable declaration");
+ }
+
+ return node;
+}
+
+// Placeholder: skips every token up to (not including) the closing paren
+// without recording any parameters.
+fn parse_function_params(lexer: Lexer*, func: Node*) {
+ let token: Token;
+ lexer_peek(lexer, &token);
+ // TODO: Actually parse params
+ while (token.typ != TOKEN_CLOSE_PAREN) {
+ lexer_next(lexer, &token);
+ }
+}
+
+fn parse_block(lexer: Lexer*): Node*;
+
+
+// Parse one statement: a block, return, if/else, a `let` declaration, or
+// an expression statement. while/for/defer are not implemented yet.
+// NOTE(review): in the `if` branch without an `else`, d.conditional.els
+// is never assigned — presumably expected to be null; confirm allocator
+// behavior (see node_new/malloc).
+fn parse_statement(lexer: Lexer*): Node* {
+ let node: Node*;
+ let token: Token;
+
+ lexer_peek(lexer, &token);
+ if (token.typ == TOKEN_OPEN_BRACE) {
+ node = parse_block(lexer);
+
+ } else if (token.typ == TOKEN_RETURN) {
+ lexer_next(lexer, &token);
+ node = node_new(AST_RETURN);
+
+ lexer_peek(lexer, &token);
+ if (token.typ != TOKEN_SEMICOLON) {
+ node.d.unary = parse_expression(lexer);
+ } else {
+ node.d.unary = null; // empty return statment
+ }
+ lexer_next_assert(lexer, &token, TOKEN_SEMICOLON);
+
+ } else if (token.typ == TOKEN_IF) {
+ lexer_next(lexer, &token);
+
+ node = node_new(AST_IF);
+
+ lexer_next_assert(lexer, &token, TOKEN_OPEN_PAREN);
+ node.d.conditional.cond = parse_expression(lexer);
+ lexer_next_assert(lexer, &token, TOKEN_CLOSE_PAREN);
+ node.d.conditional.then = parse_statement(lexer);
+
+ lexer_peek(lexer, &token);
+ if (token.typ == TOKEN_ELSE) {
+ lexer_next(lexer, &token);
+ node.d.conditional.els = parse_statement(lexer);
+ }
+ } else if (token.typ == TOKEN_WHILE) {
+ die("while is not implemented yet");
+ } else if (token.typ == TOKEN_FOR) {
+ die("for is not implemented yet");
+ } else if (token.typ == TOKEN_DEFER) {
+ die("defer is not implemented yet");
+ } else if (token.typ == TOKEN_LET) {
+ node = parse_var_declaration(lexer);
+ lexer_next_assert(lexer, &token, TOKEN_SEMICOLON);
+ } else {
+ // Default to expression statement
+ node = parse_expression(lexer);
+ lexer_next_assert(lexer, &token, TOKEN_SEMICOLON);
+ }
+ return node;
+}
+
+// Parse `{ statement* }` into an AST_BLOCK node.
+fn parse_block(lexer: Lexer*): Node* {
+ let token: Token;
+ lexer_next_assert(lexer, &token, TOKEN_OPEN_BRACE);
+
+ let block = node_new(AST_BLOCK);
+ block.d.block.children = vector_new();
+
+ lexer_peek(lexer, &token);
+ while (token.typ != TOKEN_CLOSE_BRACE) {
+ block_add_child(block, parse_statement(lexer));
+ lexer_peek(lexer, &token);
+ }
+ lexer_next_assert(lexer, &token, TOKEN_CLOSE_BRACE);
+ return block;
+}
+
+// Parse `fn name(params)[: type] { ... }`. A missing return type defaults
+// to void; params are currently skipped (see parse_function_params).
+fn parse_function(lexer: Lexer*): Node* {
+ let token: Token;
+
+ lexer_next_assert(lexer, &token, TOKEN_FN);
+ lexer_next_assert(lexer, &token, TOKEN_IDENTIFIER);
+ // TODO: Check if identifier exists
+ let node = node_new(AST_FUNC);
+ node.d.func.name = token.value.as_string;
+
+ lexer_next_assert(lexer, &token, TOKEN_OPEN_PAREN);
+ parse_function_params(lexer, node);
+ lexer_next_assert(lexer, &token, TOKEN_CLOSE_PAREN);
+
+ lexer_peek(lexer, &token);
+ if (token.typ == TOKEN_COLON) {
+ lexer_next(lexer, &token);
+ node.etyp = parse_type(lexer);
+ } else {
+ node.etyp = type_new(TYPE_VOID);
+ }
+
+ node.d.func.body = parse_block(lexer);
+ return node;
+}
+
+// Parse a whole translation unit: top-level functions and `let`
+// declarations (stray semicolons are skipped) until EOF.
+fn parse_program(lexer: Lexer*): Node* {
+ let node = node_new(AST_PROGRAM);
+ node.d.block.children = vector_new();
+
+ let token: Token;
+ lexer_peek(lexer, &token);
+
+ while (token.typ != TOKEN_EOF) {
+ if (token.typ == TOKEN_FN) {
+ block_add_child(node, parse_function(lexer));
+ } else if (token.typ == TOKEN_LET) {
+ block_add_child(node, parse_var_declaration(lexer));
+ } else if (token.typ == TOKEN_SEMICOLON) {
+ lexer_next(lexer, &token);
+ } else {
+ die_loc2(&token.loc, "unexpected token in parse_program", token_type_to_string(token.typ));
+ }
+
+ lexer_peek(lexer, &token);
+ }
+ return node;
+} \ No newline at end of file
diff --git a/compiler/tokens.cup b/compiler/tokens.cup
new file mode 100644
index 0000000..e991610
--- /dev/null
+++ b/compiler/tokens.cup
@@ -0,0 +1,238 @@
+import "std/common.cup"
+
+enum TokenType {
+    // Operators, punctuation, literal and identifier token kinds
+    // (kept in alphabetical order).
+    TOKEN_AMPERSAND,
+    TOKEN_AND,
+    TOKEN_ASSIGN,
+    TOKEN_BAR,
+    TOKEN_CARET,
+    TOKEN_CHARLIT,
+    TOKEN_CLOSE_BRACE,
+    TOKEN_CLOSE_BRACKET,
+    TOKEN_CLOSE_PAREN,
+    TOKEN_COLON,
+    TOKEN_COMMA,
+    TOKEN_DOT,
+    TOKEN_EOF,
+    TOKEN_EQ,
+    TOKEN_EXCLAMATION,
+    TOKEN_GEQ,
+    TOKEN_GT,
+    TOKEN_IDENTIFIER,
+    TOKEN_INTLIT,
+    TOKEN_LEQ,
+    TOKEN_LSHIFT,
+    TOKEN_LT,
+    TOKEN_MINUS,
+    TOKEN_MINUSEQUALS,
+    TOKEN_MINUSMINUS,
+    TOKEN_NEQ,
+    TOKEN_OPEN_BRACE,
+    TOKEN_OPEN_BRACKET,
+    TOKEN_OPEN_PAREN,
+    TOKEN_OR,
+    TOKEN_PERCENT,
+    TOKEN_PLUS,
+    TOKEN_PLUSEQUALS,
+    TOKEN_PLUSPLUS,
+    TOKEN_QUESTION,
+    TOKEN_RSHIFT,
+    TOKEN_SEMICOLON,
+    TOKEN_SLASH,
+    TOKEN_STAR,
+    TOKEN_STRINGLIT,
+    TOKEN_TILDE,
+    TOKEN_XOR,
+
+    // Keywords go below:
+    // Keyword kinds sit between the two sentinel values, presumably so
+    // keyword-ness can be tested with a range check
+    // (TOKEN__KEYWORD_BEGIN < typ < TOKEN__KEYWORD_END) — confirm in lexer.
+    TOKEN__KEYWORD_BEGIN,
+    TOKEN_CHAR,
+    TOKEN_CONST,
+    TOKEN_ENUM,
+    TOKEN_ELSE,
+    TOKEN_DEFER,
+    TOKEN_FN,
+    TOKEN_FOR,
+    TOKEN_IF,
+    TOKEN_INT,
+    TOKEN_LET,
+    TOKEN_RETURN,
+    TOKEN_STRUCT,
+    TOKEN_UNION,
+    TOKEN_VOID,
+    TOKEN_WHILE,
+    TOKEN_IMPORT,
+    TOKEN__KEYWORD_END,
+};
+
+// A position in a source file. Line and column are stored 0-based
+// (location_print adds 1 to each for human-readable display).
+struct Location {
+    filename: char*;
+    line: int;
+    col: int;
+};
+
+// A single lexed token. Which `value` member is valid depends on `typ`:
+// as_int for TOKEN_INTLIT, as_string for TOKEN_STRINGLIT/TOKEN_IDENTIFIER,
+// as_char for TOKEN_CHARLIT (see the token_from_* constructors below).
+struct Token {
+    typ: int;       // one of TokenType
+    loc: Location;  // where the token appears in the source
+    value: union {
+        as_int: int;
+        as_string: char*;
+        as_char: char;
+    };
+};
+
+// Map a token-type id to its enum spelling, for diagnostics.
+// Dies with an error message on an unrecognized id.
+fn token_type_to_string(typ: int): char* {
+    if (typ == TOKEN_AMPERSAND) return "TOKEN_AMPERSAND";
+    if (typ == TOKEN_AND) return "TOKEN_AND";
+    if (typ == TOKEN_ASSIGN) return "TOKEN_ASSIGN";
+    if (typ == TOKEN_BAR) return "TOKEN_BAR";
+    if (typ == TOKEN_CARET) return "TOKEN_CARET";
+    if (typ == TOKEN_CHARLIT) return "TOKEN_CHARLIT";
+    if (typ == TOKEN_CLOSE_BRACE) return "TOKEN_CLOSE_BRACE";
+    if (typ == TOKEN_CLOSE_BRACKET) return "TOKEN_CLOSE_BRACKET";
+    if (typ == TOKEN_CLOSE_PAREN) return "TOKEN_CLOSE_PAREN";
+    if (typ == TOKEN_COLON) return "TOKEN_COLON";
+    if (typ == TOKEN_COMMA) return "TOKEN_COMMA";
+    if (typ == TOKEN_DOT) return "TOKEN_DOT";
+    if (typ == TOKEN_EOF) return "TOKEN_EOF";
+    if (typ == TOKEN_EQ) return "TOKEN_EQ";
+    if (typ == TOKEN_EXCLAMATION) return "TOKEN_EXCLAMATION";
+    if (typ == TOKEN_GEQ) return "TOKEN_GEQ";
+    if (typ == TOKEN_GT) return "TOKEN_GT";
+    if (typ == TOKEN_IDENTIFIER) return "TOKEN_IDENTIFIER";
+    if (typ == TOKEN_INTLIT) return "TOKEN_INTLIT";
+    if (typ == TOKEN_LEQ) return "TOKEN_LEQ";
+    if (typ == TOKEN_LSHIFT) return "TOKEN_LSHIFT";
+    if (typ == TOKEN_LT) return "TOKEN_LT";
+    if (typ == TOKEN_MINUS) return "TOKEN_MINUS";
+    if (typ == TOKEN_MINUSEQUALS) return "TOKEN_MINUSEQUALS";
+    if (typ == TOKEN_MINUSMINUS) return "TOKEN_MINUSMINUS";
+    if (typ == TOKEN_NEQ) return "TOKEN_NEQ";
+    if (typ == TOKEN_OPEN_BRACE) return "TOKEN_OPEN_BRACE";
+    if (typ == TOKEN_OPEN_BRACKET) return "TOKEN_OPEN_BRACKET";
+    if (typ == TOKEN_OPEN_PAREN) return "TOKEN_OPEN_PAREN";
+    if (typ == TOKEN_OR) return "TOKEN_OR";
+    if (typ == TOKEN_PERCENT) return "TOKEN_PERCENT";
+    if (typ == TOKEN_PLUS) return "TOKEN_PLUS";
+    if (typ == TOKEN_PLUSEQUALS) return "TOKEN_PLUSEQUALS";
+    if (typ == TOKEN_PLUSPLUS) return "TOKEN_PLUSPLUS";
+    if (typ == TOKEN_QUESTION) return "TOKEN_QUESTION";
+    if (typ == TOKEN_RSHIFT) return "TOKEN_RSHIFT";
+    if (typ == TOKEN_SEMICOLON) return "TOKEN_SEMICOLON";
+    if (typ == TOKEN_SLASH) return "TOKEN_SLASH";
+    if (typ == TOKEN_STAR) return "TOKEN_STAR";
+    if (typ == TOKEN_STRINGLIT) return "TOKEN_STRINGLIT";
+    if (typ == TOKEN_TILDE) return "TOKEN_TILDE";
+    if (typ == TOKEN_XOR) return "TOKEN_XOR";
+    // Keyword token kinds:
+    if (typ == TOKEN_CHAR) return "TOKEN_CHAR";
+    if (typ == TOKEN_CONST) return "TOKEN_CONST";
+    if (typ == TOKEN_ENUM) return "TOKEN_ENUM";
+    if (typ == TOKEN_ELSE) return "TOKEN_ELSE";
+    if (typ == TOKEN_DEFER) return "TOKEN_DEFER";
+    if (typ == TOKEN_FN) return "TOKEN_FN";
+    if (typ == TOKEN_FOR) return "TOKEN_FOR";
+    if (typ == TOKEN_IF) return "TOKEN_IF";
+    if (typ == TOKEN_INT) return "TOKEN_INT";
+    if (typ == TOKEN_LET) return "TOKEN_LET";
+    if (typ == TOKEN_RETURN) return "TOKEN_RETURN";
+    if (typ == TOKEN_STRUCT) return "TOKEN_STRUCT";
+    if (typ == TOKEN_UNION) return "TOKEN_UNION";
+    if (typ == TOKEN_VOID) return "TOKEN_VOID";
+    if (typ == TOKEN_WHILE) return "TOKEN_WHILE";
+    if (typ == TOKEN_IMPORT) return "TOKEN_IMPORT";
+
+    // NOTE: TOKEN__KEYWORD_BEGIN/END sentinels intentionally fall through here.
+    putsln("\nUnknown token type in token_type_to_string: "); print(typ);
+    exit(1);
+}
+
+// Map a keyword token id back to its source spelling (e.g. TOKEN_FN -> "fn").
+// Dies if `typ` is not a keyword token kind.
+fn keyword_to_string(typ: int): char* {
+    if (typ == TOKEN_CHAR) return "char";
+    if (typ == TOKEN_CONST) return "const";
+    if (typ == TOKEN_ENUM) return "enum";
+    if (typ == TOKEN_ELSE) return "else";
+    if (typ == TOKEN_DEFER) return "defer";
+    if (typ == TOKEN_FN) return "fn";
+    if (typ == TOKEN_FOR) return "for";
+    if (typ == TOKEN_IF) return "if";
+    if (typ == TOKEN_INT) return "int";
+    if (typ == TOKEN_LET) return "let";
+    if (typ == TOKEN_RETURN) return "return";
+    if (typ == TOKEN_STRUCT) return "struct";
+    if (typ == TOKEN_UNION) return "union";
+    if (typ == TOKEN_VOID) return "void";
+    if (typ == TOKEN_WHILE) return "while";
+    if (typ == TOKEN_IMPORT) return "import";
+
+    puts("Unknown keyword in keyword_to_string: ");
+    putsln(token_type_to_string(typ));
+    exit(1);
+}
+
+fn location_init(loc: Location*, filename: char*, line: int, col: int) {
+    // Populate every field of a source location in one shot.
+    loc.col = col;
+    loc.line = line;
+    loc.filename = filename;
+}
+
+// Print a location as `file:line:col`, converting the 0-based stored
+// line/col to the 1-based form editors and humans expect.
+fn location_print(loc: Location *) {
+    puts(loc.filename);
+    putc(':');
+    putu(loc.line + 1);
+    putc(':');
+    putu(loc.col + 1);
+}
+
+// Report a fatal error at `loc` and exit. `msg2` is an optional detail
+// string (e.g. a token name). Previously msg1 and msg2 were printed
+// back-to-back with no separator, producing run-together output like
+// "unexpected token in parse_programTOKEN_FN"; now a ": " separator is
+// emitted, but only when msg2 is non-empty so die_loc's empty msg2
+// does not leave a dangling colon.
+fn die_loc2(loc: Location*, msg1: char *, msg2: char *) {
+    location_print(loc);
+    puts(": ");
+    puts(msg1);
+    if (msg2[0] != 0) {
+        puts(": ");
+    }
+    putsln(msg2);
+    exit(1);
+}
+
+// Fatal error at `loc` with a single message part.
+fn die_loc(loc: Location*, msg: char *) {
+    die_loc2(loc, msg, "");
+}
+
+// Initialize a bare (valueless) token of the given kind.
+// Fix: copy the location as well — every other token_from_* constructor
+// does, and diagnostics (die_loc/die_loc2) read token.loc, so leaving it
+// unset made error messages for these tokens print garbage positions.
+fn token_from_type(token: Token*, typ: int, loc: Location *) {
+    token.typ = typ;
+    token.loc.filename = loc.filename;
+    token.loc.line = loc.line;
+    token.loc.col = loc.col;
+}
+
+// Build an integer-literal token carrying `val`.
+fn token_from_int(token: Token*, val: int, loc: Location *) {
+    token.typ = TOKEN_INTLIT;
+    token.value.as_int = val;
+    token.loc.filename = loc.filename;
+    token.loc.line = loc.line;
+    token.loc.col = loc.col;
+}
+
+// Build a string-literal token; `str` is stored by reference, not copied.
+fn token_from_string(token: Token*, str: char *, loc: Location *) {
+    token.typ = TOKEN_STRINGLIT;
+    token.value.as_string = str;
+    token.loc.filename = loc.filename;
+    token.loc.line = loc.line;
+    token.loc.col = loc.col;
+}
+
+// Build a character-literal token carrying `c`.
+fn token_from_char(token: Token*, c: char, loc: Location *) {
+    token.typ = TOKEN_CHARLIT;
+    token.value.as_char = c;
+    token.loc.filename = loc.filename;
+    token.loc.line = loc.line;
+    token.loc.col = loc.col;
+}
+
+// Build an identifier token; the name `str` is stored by reference.
+fn token_from_identifier(token: Token*, str: char *, loc: Location *) {
+    token.typ = TOKEN_IDENTIFIER;
+    token.value.as_string = str;
+    token.loc.filename = loc.filename;
+    token.loc.line = loc.line;
+    token.loc.col = loc.col;
+}
+
+fn is_literal_token(typ: int): int {
+    // A token is a literal when it carries an int, char, or string payload.
+    return typ == TOKEN_INTLIT || typ == TOKEN_CHARLIT || typ == TOKEN_STRINGLIT;
+} \ No newline at end of file
diff --git a/compiler/types.cup b/compiler/types.cup
new file mode 100644
index 0000000..f3c7b38
--- /dev/null
+++ b/compiler/types.cup
@@ -0,0 +1,82 @@
+import "std/common.cup"
+
+// Base kinds a Type can have. PTR/ARRAY wrap another type via Type.ptr;
+// STRUCT/UNION carry field info; INT/CHAR are the scalar primitives.
+enum BaseType {
+    TYPE_VOID,
+    TYPE_ANY,
+
+    // Compound types
+    TYPE_PTR,
+    TYPE_ARRAY,
+    TYPE_STRUCT,
+    TYPE_UNION,
+
+    // Scalar primitives
+    TYPE_INT,
+    TYPE_CHAR,
+};
+
+struct Type {
+    typ: int;            // one of BaseType
+    ptr: Type*;          // pointee/element type for TYPE_PTR and TYPE_ARRAY
+    struct_name: char*;  // presumably the tag for TYPE_STRUCT/TYPE_UNION — confirm at use sites
+    size: int;           // size in bytes; 0 for compound types until sized explicitly
+    array_size: int;     // presumably element count for TYPE_ARRAY — confirm at use sites
+    fields: struct {     // member metadata for struct/union types
+        names: char**;
+        types: Type**;
+        num_fields: int;
+    };
+};
+
+fn size_for_base_type(type: int): int {
+    // Machine-word scalars and pointers occupy 8 bytes, chars 1 byte.
+    // Compound types report 0 and must be sized explicitly by the caller.
+    if (type == TYPE_CHAR) return 1;
+    if (type == TYPE_INT || type == TYPE_PTR) return 8;
+    return 0;
+}
+
+// Cached singleton Type instances for the primitive types,
+// lazily allocated on first use by type_new().
+let _type_int: Type* = null;
+let _type_char: Type* = null;
+let _type_void: Type* = null;
+let _type_any: Type* = null;
+
+// Return a Type* for the given base kind. The primitive kinds
+// (int/char/void/any) are shared, lazily-allocated singletons — callers
+// must not mutate them. Any other kind gets a fresh zero-field allocation
+// sized via size_for_base_type.
+// Fix: removed the leftover debug trace ("Allocating Type*") that was
+// printed to stdout on every compound-type allocation.
+fn type_new(typ: int): Type* {
+    if (_type_int == null) { _type_int = malloc(sizeof(Type)); _type_int.typ = TYPE_INT; _type_int.size = 8; }
+    if (_type_char == null) { _type_char = malloc(sizeof(Type)); _type_char.typ = TYPE_CHAR; _type_char.size = 1; }
+    if (_type_void == null) { _type_void = malloc(sizeof(Type)); _type_void.typ = TYPE_VOID; _type_void.size = 0; }
+    if (_type_any == null) { _type_any = malloc(sizeof(Type)); _type_any.typ = TYPE_ANY; _type_any.size = 8; }
+
+    if (typ == TYPE_INT) return _type_int;
+    if (typ == TYPE_CHAR) return _type_char;
+    if (typ == TYPE_VOID) return _type_void;
+    if (typ == TYPE_ANY) return _type_any;
+
+    let t: Type* = malloc(sizeof(Type));
+    t.typ = typ;
+    t.size = size_for_base_type(typ);
+    return t;
+}
+
+fn type_new_ptr(typ: int): Type* {
+    // Convenience: build a pointer type wrapping the given base kind.
+    let wrapper = type_new(TYPE_PTR);
+    wrapper.ptr = type_new(typ);
+    return wrapper;
+}
+
+// This is named differently because it performs an allocation.
+// Renders a type as text, e.g. "*int" or "[]char"; the caller owns the
+// returned buffer.
+fn create_type_string(typ: Type *): char* {
+    let buf: char* = malloc(32);
+    // Fix: malloc returns uninitialized memory, and strcat appends to the
+    // existing string — the buffer must start NUL-terminated or the first
+    // strcat writes at a garbage offset.
+    buf[0] = 0;
+    // NOTE(review): the fixed 32-byte buffer caps output length; deeply
+    // nested pointer/array types could overflow it — consider sizing by depth.
+    while (typ.typ == TYPE_PTR || typ.typ == TYPE_ARRAY) {
+        strcat(buf, typ.typ == TYPE_PTR ? "*" : "[]");
+        typ = typ.ptr;
+    }
+
+    if (typ.typ == TYPE_INT) strcat(buf, "int");
+    else if (typ.typ == TYPE_CHAR) strcat(buf, "char");
+    else if (typ.typ == TYPE_VOID) strcat(buf, "void");
+    else if (typ.typ == TYPE_ANY) strcat(buf, "any");
+    else die("type_to_string: unknown type");
+
+    return buf;
+} \ No newline at end of file
diff --git a/run.sh2 b/run.sh2
new file mode 100755
index 0000000..0e6ee64
--- /dev/null
+++ b/run.sh2
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# This script does the following:
+# 1. Builds the project
+# 2. Compiles selected file
+# 3. Assembles executable from compiled asm
+# 4. Runs the executable
+# 5. Echoes the output of the executable
+
+if [ -z "$1" ]
+then
+ echo "Usage: $0 <arguments to cupcc>"
+ exit 1
+fi
+
+set -xe
+
+make
+build/cupcc compiler/main.cup -o build/cup.nasm
+make build/cup.out
+build/cup.out "$@"
+make build/host.out
+
+set +e
+
+build/host.out
+
+echo "Exit status: $?" \ No newline at end of file