diff options
| author | Mustafa Quraish <[email protected]> | 2022-02-05 08:23:14 -0500 |
|---|---|---|
| committer | Mustafa Quraish <[email protected]> | 2022-02-05 08:56:15 -0500 |
| commit | aeaf92127d1c090f9281616e49ad10dda414bd45 (patch) | |
| tree | f85127c08b0caa13b95b3fb80e2996d3b5186434 | |
| parent | Remove old test which disallowed initializing globals (diff) | |
| download | cup-aeaf92127d1c090f9281616e49ad10dda414bd45.tar.xz cup-aeaf92127d1c090f9281616e49ad10dda414bd45.zip | |
Add implementation of self-hosted compiler so far
There's also a `run.sh2` script which does the following:
- Compiles the C compiler `build/cupcc`
- Compiles the self-hosted compiler `build/cup.out` (with `cupcc`)
- Compiles the specified file on CLI with `build/cup.out`
- Runs this executable and shows the output
| -rw-r--r-- | compiler/README.md | 3 | ||||
| -rw-r--r-- | compiler/ast.cup | 317 | ||||
| -rw-r--r-- | compiler/codegen.cup | 151 | ||||
| -rw-r--r-- | compiler/lexer.cup | 288 | ||||
| -rw-r--r-- | compiler/main.cup | 34 | ||||
| -rw-r--r-- | compiler/parser.cup | 433 | ||||
| -rw-r--r-- | compiler/tokens.cup | 238 | ||||
| -rw-r--r-- | compiler/types.cup | 82 | ||||
| -rwxr-xr-x | run.sh2 | 28 |
9 files changed, 1574 insertions, 0 deletions
diff --git a/compiler/README.md b/compiler/README.md new file mode 100644 index 0000000..0e58d43 --- /dev/null +++ b/compiler/README.md @@ -0,0 +1,3 @@ +# CUP Compiler in CUP + +This is the beginnings of a CUP compiler written in itself.
\ No newline at end of file diff --git a/compiler/ast.cup b/compiler/ast.cup new file mode 100644 index 0000000..689f7fb --- /dev/null +++ b/compiler/ast.cup @@ -0,0 +1,317 @@ +import "std/vector.cup" +import "compiler/types.cup" + +enum NodeType { + // Unary + AST_NEG, + AST_NOT, + AST_BWINV, + AST_ADDROF, + AST_DEREF, + // Binary + AST_PLUS, + AST_MINUS, + AST_MUL, + AST_DIV, + AST_MOD, + AST_LSHIFT, + AST_RSHIFT, + AST_AND, + AST_BWAND, + AST_OR, + AST_BWOR, + AST_XOR, + // Comparison + AST_EQ, + AST_NEQ, + AST_LT, + AST_LEQ, + AST_GT, + AST_GEQ, + // Misc. + AST_ASSIGN, + AST_MEMBER, + // AST types + AST_LITERAL, + AST_CONSTANT, + AST_FUNCCALL, + AST_CONDITIONAL, + AST_IF, + AST_WHILE, + AST_DEFER, + AST_FOR, + AST_VARDECL, + AST_LOCAL_VAR, + AST_GLOBAL_VAR, + AST_RETURN, + AST_FUNC, + AST_BUILTIN, + AST_PROGRAM, + AST_BLOCK, +}; + +struct Variable { + name: char *; + typ: Type *; + offset: int; +}; + +struct Node { + typ: int; // NodeType + etyp: Type*; // Expression type + + // TODO: Anonymous union members so we can do `Node.binary`, etc. 
+ d: union { + binary: struct { + lhs: Node *; + rhs: Node *; + }; + + unary: Node *; + + func: struct { + name: char *; + body: Node *; + max_locals_size: int; + args: Vector *; // Vector<Variable> + }; + + block: struct { + children: Vector *; // Vector<Node *> + locals: Vector *; // Vector<Variable> + locals_size: int; + }; + + literal: union { + as_int: int; + as_char: char; + as_string: char *; + }; + + var_decl: struct { + var: Variable; + init: Node *; + }; + + assign: struct { + lhs: Node *; + rhs: Node *; + }; + + conditional: struct { + cond: Node *; + then: Node *; + els: Node *; + }; + + // `loop` is keyword in rust, syntax highlighting breaks + looop: struct { + cond: Node *; + body: Node *; + // for loop: + init: Node *; + step: Node *; + }; + + variable: Variable *; + + call: struct { + func: Node *; + args: Vector *; // Vector<Node *> + }; + + member: struct { + obj: Node *; + offset: int; + is_ptr: int; + }; + + constant: struct { + name: char *; + value: Node *; // Must be int literal + }; + }; +}; + +let node_counter = 0; + +fn node_new(typ: int): Node* { + let node: Node* = malloc(sizeof(Node)); + ++node_counter; + node.typ = typ; + return node; +} + +fn node_from_int_literal(val: int): Node* { + let node: Node* = node_new(AST_LITERAL); + node.etyp = type_new(TYPE_INT); + node.d.literal.as_int = val; + return node; +} + +fn block_add_child(block: Node*, child: Node*) { + if (block.d.block.children == null) + block.d.block.children = vector_new(); + vector_push(block.d.block.children, child); +} + +// TODO: Careful here, the input type here is the same as `type_to_string` +fn node_type_to_string(typ: int): char* { + if (typ == AST_NEG) return "AST_NEG"; + if (typ == AST_NOT) return "AST_NOT"; + if (typ == AST_BWINV) return "AST_BWINV"; + if (typ == AST_ADDROF) return "AST_ADDROF"; + if (typ == AST_DEREF) return "AST_DEREF"; + if (typ == AST_PLUS) return "AST_PLUS"; + if (typ == AST_MINUS) return "AST_MINUS"; + if (typ == AST_MUL) return 
"AST_MUL"; + if (typ == AST_DIV) return "AST_DIV"; + if (typ == AST_MOD) return "AST_MOD"; + if (typ == AST_LSHIFT) return "AST_LSHIFT"; + if (typ == AST_RSHIFT) return "AST_RSHIFT"; + if (typ == AST_AND) return "AST_AND"; + if (typ == AST_BWAND) return "AST_BWAND"; + if (typ == AST_OR) return "AST_OR"; + if (typ == AST_BWOR) return "AST_BWOR"; + if (typ == AST_XOR) return "AST_XOR"; + if (typ == AST_EQ) return "AST_EQ"; + if (typ == AST_NEQ) return "AST_NEQ"; + if (typ == AST_LT) return "AST_LT"; + if (typ == AST_LEQ) return "AST_LEQ"; + if (typ == AST_GT) return "AST_GT"; + if (typ == AST_GEQ) return "AST_GEQ"; + if (typ == AST_ASSIGN) return "AST_ASSIGN"; + if (typ == AST_MEMBER) return "AST_MEMBER"; + if (typ == AST_LITERAL) return "AST_LITERAL"; + if (typ == AST_CONSTANT) return "AST_CONSTANT"; + if (typ == AST_FUNCCALL) return "AST_FUNCCALL"; + if (typ == AST_CONDITIONAL) return "AST_CONDITIONAL"; + if (typ == AST_IF) return "AST_IF"; + if (typ == AST_WHILE) return "AST_WHILE"; + if (typ == AST_DEFER) return "AST_DEFER"; + if (typ == AST_FOR) return "AST_FOR"; + if (typ == AST_VARDECL) return "AST_VARDECL"; + if (typ == AST_LOCAL_VAR) return "AST_LOCAL_VAR"; + if (typ == AST_GLOBAL_VAR) return "AST_GLOBAL_VAR"; + if (typ == AST_RETURN) return "AST_RETURN"; + if (typ == AST_FUNC) return "AST_FUNC"; + if (typ == AST_BUILTIN) return "AST_BUILTIN"; + if (typ == AST_PROGRAM) return "AST_PROGRAM"; + if (typ == AST_BLOCK) return "AST_BLOCK"; + + puts("Unknown node type in node_type_to_string: "); + putu(typ); putc('\n'); + exit(1); +} + +fn is_binary_op(typ: int): int { + if (typ == AST_PLUS) return true; + if (typ == AST_MINUS) return true; + if (typ == AST_MUL) return true; + if (typ == AST_DIV) return true; + if (typ == AST_MOD) return true; + if (typ == AST_LSHIFT) return true; + if (typ == AST_RSHIFT) return true; + if (typ == AST_AND) return true; + if (typ == AST_BWAND) return true; + if (typ == AST_OR) return true; + if (typ == AST_BWOR) return true; + if 
(typ == AST_XOR) return true; + if (typ == AST_EQ) return true; + if (typ == AST_NEQ) return true; + if (typ == AST_LT) return true; + if (typ == AST_LEQ) return true; + if (typ == AST_GT) return true; + if (typ == AST_GEQ) return true; + return false; +} + +fn is_unary_op(typ: int): int { + if (typ == AST_NEG) return true; + if (typ == AST_NOT) return true; + if (typ == AST_BWINV) return true; + if (typ == AST_ADDROF) return true; + if (typ == AST_DEREF) return true; + return false; +} + +fn is_lvalue(typ: int): int { + if (typ == AST_LOCAL_VAR) return true; + if (typ == AST_GLOBAL_VAR) return true; + if (typ == AST_MEMBER) return true; + if (typ == AST_DEREF) return true; + return false; +} + +fn binary_token_to_op(token_typ: int): int +{ + if (token_typ == TOKEN_PLUS) return AST_PLUS; + if (token_typ == TOKEN_MINUS) return AST_MINUS; + if (token_typ == TOKEN_STAR) return AST_MUL; + if (token_typ == TOKEN_SLASH) return AST_DIV; + if (token_typ == TOKEN_PERCENT) return AST_MOD; + if (token_typ == TOKEN_LSHIFT) return AST_LSHIFT; + if (token_typ == TOKEN_RSHIFT) return AST_RSHIFT; + if (token_typ == TOKEN_AND) return AST_AND; + if (token_typ == TOKEN_OR) return AST_OR; + if (token_typ == TOKEN_XOR) return AST_XOR; + if (token_typ == TOKEN_EQ) return AST_EQ; + if (token_typ == TOKEN_NEQ) return AST_NEQ; + if (token_typ == TOKEN_LT) return AST_LT; + if (token_typ == TOKEN_LEQ) return AST_LEQ; + if (token_typ == TOKEN_GT) return AST_GT; + if (token_typ == TOKEN_GEQ) return AST_GEQ; + if (token_typ == TOKEN_AMPERSAND) return AST_BWAND; + if (token_typ == TOKEN_BAR) return AST_BWOR; + if (token_typ == TOKEN_CARET) return AST_XOR; + + puts("Unknown token in binary_token_to_op: "); + putsln(token_type_to_string(token_typ)); + exit(1); +} + +fn dump_ast(node: Node*, depth: int) { + for (let i = 0; i < 2*depth; ++i) + putc(' '); + if (node.typ == AST_PROGRAM || node.typ == AST_BLOCK) { + putsln(node_type_to_string(node.typ)); + for (let i = 0; i < 
node.d.block.children.size; ++i) { + dump_ast(node.d.block.children.data[i], depth + 1); + } + } else if (is_binary_op(node.typ)) { + putsln(node_type_to_string(node.typ)); + dump_ast(node.d.binary.lhs, depth + 1); + dump_ast(node.d.binary.rhs, depth + 1); + } else if (is_unary_op(node.typ) || node.typ == AST_RETURN) { + putsln(node_type_to_string(node.typ)); + dump_ast(node.d.unary, depth + 1); + + } else if (node.typ == AST_LITERAL) { + if (node.etyp.typ == TYPE_INT) { + putu(node.d.literal.as_int); putc('\n'); + } else if (node.etyp.typ == TYPE_PTR) { + putc('"'); puts(node.d.literal.as_string); putc('"'); putc('\n'); + } else if (node.etyp.typ == TYPE_CHAR) { + putc('\''); putc(node.d.literal.as_char); putc('\''); putc('\n'); + } else { + die("Unknown literal type in dump_ast"); + } + } else if (node.typ == AST_FUNC) { + puts("func "); puts(node.d.func.name); puts("()\n"); + dump_ast(node.d.func.body, depth + 1); + } else if (node.typ == AST_VARDECL) { + puts("let "); puts(node.d.var_decl.var.name); + if (node.d.var_decl.var.typ == TYPE_PTR) { + puts(": "); + puts(create_type_string(node.d.var_decl.var.typ)); + } + if (node.d.var_decl.init) { + puts(" =\n"); + dump_ast(node.d.var_decl.init, depth + 1); + } else { + putc('\n'); + } + } else { + putsln(node_type_to_string(node.typ)); + } +}
\ No newline at end of file diff --git a/compiler/codegen.cup b/compiler/codegen.cup new file mode 100644 index 0000000..41eea33 --- /dev/null +++ b/compiler/codegen.cup @@ -0,0 +1,151 @@ +import "compiler/ast.cup" +import "std/file.cup" + +let gen_out_file: File*; + +fn emit_asm4(msg1: char*, msg2: char*, msg3: char*, msg4: char*) { + fwrite(gen_out_file, msg1, strlen(msg1)); + fwrite(gen_out_file, msg2, strlen(msg2)); + fwrite(gen_out_file, msg3, strlen(msg3)); + fwrite(gen_out_file, msg4, strlen(msg4)); +} + +fn emit_asm3(msg1: char*, msg2: char*, msg3: char*) { + fwrite(gen_out_file, msg1, strlen(msg1)); + fwrite(gen_out_file, msg2, strlen(msg2)); + fwrite(gen_out_file, msg3, strlen(msg3)); +} + +fn emit_asm2(msg1: char*, msg2: char*) { + fwrite(gen_out_file, msg1, strlen(msg1)); + fwrite(gen_out_file, msg2, strlen(msg2)); +} + +fn emit_asm(msg: char*) { + fwrite(gen_out_file, msg, strlen(msg)); +} + +fn emit_num(num: int) { + fputu(gen_out_file, num); +} + +fn generate_syscall(num: int) { + emit_asm(" mov rax, "); emit_num(num); emit_asm("\n"); + emit_asm(" syscall\n"); +} + +fn generate_expr_into_rax(node: Node*) { + if (node.typ == AST_LITERAL) { + if (node.etyp.typ == TYPE_INT) { + emit_asm(" mov rax, "); emit_num(node.d.literal.as_int); emit_asm("\n"); + } else { + die("Unsupported literal type in generate_expr_into_rax"); + } + } else if (node.typ == AST_PLUS) { + generate_expr_into_rax(node.d.binary.rhs); + emit_asm(" push rax\n"); + generate_expr_into_rax(node.d.binary.lhs); + emit_asm(" pop rbx\n"); + emit_asm(" add rax, rbx\n"); + } else if (node.typ == AST_MINUS) { + generate_expr_into_rax(node.d.binary.rhs); + emit_asm(" push rax\n"); + generate_expr_into_rax(node.d.binary.lhs); + emit_asm(" pop rbx\n"); + emit_asm(" sub rax, rbx\n"); + } else if (node.typ == AST_DIV) { + generate_expr_into_rax(node.d.binary.rhs); + emit_asm(" push rax\n"); + generate_expr_into_rax(node.d.binary.lhs); + emit_asm(" pop rbx\n"); + emit_asm(" cqo\n"); + emit_asm(" idiv 
rbx\n"); + + } else if (node.typ == AST_MOD) { + generate_expr_into_rax(node.d.binary.rhs); + emit_asm(" push rax\n"); + generate_expr_into_rax(node.d.binary.lhs); + emit_asm(" pop rbx\n"); + emit_asm(" cqo\n"); + emit_asm(" idiv rbx\n"); + emit_asm(" mov rax, rdx\n"); + + } else if (node.typ == AST_MUL) { + generate_expr_into_rax(node.d.binary.rhs); + emit_asm(" push rax\n"); + generate_expr_into_rax(node.d.binary.lhs); + emit_asm(" pop rbx\n"); + emit_asm(" imul rbx\n"); + } +} + +fn generate_statement(node: Node*) { + if (node.typ == AST_RETURN) { + generate_expr_into_rax(node.d.unary); + emit_asm(" mov rsp, rbp\n"); + emit_asm(" pop rbp\n"); + emit_asm(" ret\n"); + } +} + +fn generate_block(node: Node*) { + let n = node.d.block.children.size; + for (let i = 0; i < n; ++i) { + generate_statement(node.d.block.children.data[i]); + } +} + +fn generate_function(node: Node*) { + emit_asm3("global func_", node.d.func.name, "\n"); + emit_asm3("func_", node.d.func.name, ":\n"); + emit_asm(" push rbp\n"); + emit_asm(" mov rbp, rsp\n"); + emit_asm(" sub rsp, "); emit_num(node.d.func.max_locals_size); emit_asm("\n"); + + generate_block(node.d.func.body); + + emit_asm(" mov rsp, rbp\n"); + emit_asm(" pop rbp\n"); + emit_asm(" ret\n"); +} + +fn generate_program(ast: Node*, file: File*) { + gen_out_file = file; + + let n = ast.d.block.children.size; + for (let i = 0; i < n; ++i) { + let node: Node* = ast.d.block.children.data[i]; + if (node.typ == AST_FUNC) { + generate_function(node); + } else { + die("Unknown node type in generate_program"); + } + } + + if (OS_IS_MACOS) { + emit_asm("global _main\n"); + emit_asm("_main:\n"); + // Push argv + emit_asm(" mov rax, rsi\n"); + emit_asm(" push rax\n"); + // Push argc + emit_asm(" mov rax, rdi\n"); + emit_asm(" push rax\n"); + } else { + emit_asm("global _start\n"); + emit_asm("_start:\n"); + + emit_asm(" mov rbp, rsp\n"); + // // Push argv + emit_asm(" mov rax, rbp\n"); + emit_asm(" add rax, 8\n"); + emit_asm(" push rax\n"); + // 
Push argc + emit_asm(" mov rax, [rbp]\n"); + emit_asm(" push rax\n"); + } + + emit_asm(" call func_main\n"); + emit_asm(" mov rdi, rax\n"); + generate_syscall(SYS_exit); +}
\ No newline at end of file diff --git a/compiler/lexer.cup b/compiler/lexer.cup new file mode 100644 index 0000000..ff22d8f --- /dev/null +++ b/compiler/lexer.cup @@ -0,0 +1,288 @@ +import "compiler/tokens.cup" + +struct Lexer { + src: char*; + len: int; + pos: int; + + filename: char*; + line: int; + col: int; +}; + +fn lexer_new(filename: char*, src: char*, len: int): Lexer* { + let lexer: Lexer* = malloc(sizeof(Lexer)); + lexer.filename = filename; + lexer.src = src; + lexer.len = len; + return lexer; +} + +fn lexer_loc(lexer: Lexer*, loc: Location*) { + loc.filename = lexer.filename; + loc.line = lexer.line; + loc.col = lexer.col; +} + +fn is_space(c: char): int { + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; +} + +fn is_digit(c: char): int { + return c >= '0' && c <= '9'; +} + +fn is_alpha(c: char): int { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; +} + +fn is_alnum(c: char): int { + return is_digit(c) || is_alpha(c); +} + +fn lexer_skip_whitespace(lexer: Lexer*) { + while (lexer.pos < lexer.len && is_space(lexer.src[lexer.pos])) { + if (lexer.src[lexer.pos] == '\n') { + lexer.line = lexer.pos + 1; + lexer.col = 0; + } else { + lexer.col = lexer.col + 1; + } + lexer.pos = lexer.pos + 1; + } +} + +fn lexer_starts_with(lexer: Lexer*, str: char*): int { + let len = strlen(str); + if (lexer.len - lexer.pos < len) + return 0; + for (let i = 0; i < len; ++i) + if (lexer.src[lexer.pos + i] != str[i]) + return 0; + let end_pos = lexer.pos + len; + if (end_pos == lexer.len) + return len; + let end_char = lexer.src[end_pos]; + return !(is_digit(end_char) || is_alpha(end_char)); +} + +fn lexer_advance(lexer: Lexer*, n: int) { + lexer.pos = lexer.pos + n; + lexer.col = lexer.col + n; +} + +fn lexer_peek_char(lexer: Lexer*, n: int): char { + if (lexer.pos + n >= lexer.len) + return 0; + return lexer.src[lexer.pos + n]; +} + +fn lexer_make_token(lexer: Lexer*, token: Token*, typ: int, inc: int) { + lexer_loc(lexer, &token.loc); + 
lexer_advance(lexer, inc); + token.typ = typ; +} + +fn lexer_next(lexer: Lexer*, token: Token*) { + while (lexer.pos < lexer.len) { + putsln("101.1"); + print(lexer.pos); + print(lexer.len); + let c = lexer.src[lexer.pos]; + putc(c); + putc('\n'); + + if (c == '\n') { ++lexer.line; lexer.col = 0; ++lexer.pos; } + else if (is_space(c)) { lexer_advance(lexer, 1); } + else if (c == '(') { return lexer_make_token(lexer, token, TOKEN_OPEN_PAREN, 1); } + else if (c == ')') { return lexer_make_token(lexer, token, TOKEN_CLOSE_PAREN, 1); } + else if (c == '{') { return lexer_make_token(lexer, token, TOKEN_OPEN_BRACE, 1); } + else if (c == '}') { return lexer_make_token(lexer, token, TOKEN_CLOSE_BRACE, 1); } + else if (c == '[') { return lexer_make_token(lexer, token, TOKEN_OPEN_BRACKET, 1); } + else if (c == ']') { return lexer_make_token(lexer, token, TOKEN_CLOSE_BRACKET, 1); } + else if (c == ';') { return lexer_make_token(lexer, token, TOKEN_SEMICOLON, 1); } + else if (c == ':') { return lexer_make_token(lexer, token, TOKEN_COLON, 1); } + else if (c == '~') { return lexer_make_token(lexer, token, TOKEN_TILDE, 1); } + else if (c == '?') { return lexer_make_token(lexer, token, TOKEN_QUESTION, 1); } + else if (c == '^') { return lexer_make_token(lexer, token, TOKEN_CARET, 1); } + else if (c == '.') { return lexer_make_token(lexer, token, TOKEN_DOT, 1); } + else if (c == ',') { return lexer_make_token(lexer, token, TOKEN_COMMA, 1); } + else if (c == '*') { return lexer_make_token(lexer, token, TOKEN_STAR, 1); } + else if (c == '%') { return lexer_make_token(lexer, token, TOKEN_PERCENT, 1); } + + else if (c == '/' && lexer_peek_char(lexer, 1) == '/') { + lexer.pos = lexer.pos + 2; // skip the '//' + while (lexer.pos < lexer.len && lexer.src[lexer.pos] != '\n') + ++lexer.pos; + // Implicit `continue` + } + + // This needs to go after the comment check. 
+ else if (c == '/') { + return lexer_make_token(lexer, token, TOKEN_SLASH, 1); + } + + else if (c == '&') { + if (lexer_peek_char(lexer, 1) == '&') + return lexer_make_token(lexer, token, TOKEN_AND, 2); + return lexer_make_token(lexer, token, TOKEN_AMPERSAND, 1); + } + + else if (c == '!') { + if (lexer_peek_char(lexer, 1) == '=') + return lexer_make_token(lexer, token, TOKEN_NEQ, 2); + return lexer_make_token(lexer, token, TOKEN_EXCLAMATION, 1); + } + + else if (c == '<') { + if (lexer_peek_char(lexer, 1) == '=') + return lexer_make_token(lexer, token, TOKEN_LEQ, 2); + return lexer_make_token(lexer, token, TOKEN_LT, 1); + } + + else if (c == '>') { + if (lexer_peek_char(lexer, 1) == '=') + return lexer_make_token(lexer, token, TOKEN_GEQ, 2); + return lexer_make_token(lexer, token, TOKEN_GT, 1); + } + + else if (c == '=') { + if (lexer_peek_char(lexer, 1) == '=') + return lexer_make_token(lexer, token, TOKEN_EQ, 2); + return lexer_make_token(lexer, token, TOKEN_ASSIGN, 1); + } + + else if (c == '|') { + if (lexer_peek_char(lexer, 1) == '|') + return lexer_make_token(lexer, token, TOKEN_OR, 2); + return lexer_make_token(lexer, token, TOKEN_BAR, 1); + } + + + else if (c == '+') { + if (lexer_peek_char(lexer, 1) == '+') + return lexer_make_token(lexer, token, TOKEN_PLUSPLUS, 2); + if (lexer_peek_char(lexer, 1) == '=') + return lexer_make_token(lexer, token, TOKEN_PLUSEQUALS, 2); + return lexer_make_token(lexer, token, TOKEN_PLUS, 1); + } + + else if (c == '-') { + if (lexer_peek_char(lexer, 1) == '-') + return lexer_make_token(lexer, token, TOKEN_MINUSMINUS, 2); + if (lexer_peek_char(lexer, 1) == '=') + return lexer_make_token(lexer, token, TOKEN_MINUSEQUALS, 2); + return lexer_make_token(lexer, token, TOKEN_MINUS, 1); + } + + else { + // Parse the keywords... 
+ for (let i = TOKEN__KEYWORD_BEGIN+1; i < TOKEN__KEYWORD_END; ++i) { + let str = keyword_to_string(i); + if (lexer_starts_with(lexer, str)) { + return lexer_make_token(lexer, token, i, strlen(str)); + } + } + + // Parse numbers: + if (is_digit(c)) { + // TODO: Parse hex and octal numbers + let pos = lexer.pos; + while (pos < lexer.len && is_digit(lexer.src[pos])) + ++pos; + let loc: Location; + lexer_loc(lexer, &loc); + token_from_int(token, atoi(lexer.src + lexer.pos), &loc); + lexer_advance(lexer, pos - lexer.pos); + return; + } + + // Parse identifiers: + if (is_alpha(lexer.src[lexer.pos])) { + let pos = lexer.pos; + while (pos < lexer.len && is_alnum(lexer.src[pos])) + ++pos; + let str_len = pos - lexer.pos; + let str: char* = malloc(str_len + 1); + memcpy(str, lexer.src + lexer.pos, str_len); + str[str_len] = '\0'; + let loc: Location; + lexer_loc(lexer, &loc); + token_from_identifier(token, str, &loc); + lexer_advance(lexer, str_len); + return; + } + + if (c == '"') { + let pos = lexer.pos + 1; + while (pos < lexer.len && lexer.src[pos] != '"') + ++pos; + + let loc: Location; + lexer_loc(lexer, &loc); + + if (pos == lexer.len) + die_loc(&loc, "EOF while parsing string literal"); + + // Careful with indexing here, because we want to skip opening and closing quotes + let str_len = pos - lexer.pos - 1; + let str: char* = malloc(str_len + 1); + memcpy(str, lexer.src + lexer.pos + 1, str_len); + str[str_len] = '\0'; + token_from_string(token, str, &loc); + lexer_advance(lexer, pos - lexer.pos + 1); + return; + } + + if (c == '\'') { + let pos = lexer.pos + 1; + // TODO: Handle malformed / incomplete literals + // TODO: Handle escapes + c = lexer.src[pos]; + if (c == '\\') { + ++pos; + c = lexer.src[pos]; + if (c == 'n') { c = '\n'; } + else if (c == 't') { c = '\t'; } + else if (c == 'n') { c = '\n'; } + else if (c == 'r') { c = '\r'; } + else if (c == 't') { c = '\t'; } + else if (c == '0') { c = '\0'; } + else { } + // TODO: Handle octal and hex escapes + } + + 
let loc: Location; + lexer_loc(lexer, &loc); + token_from_char(token, c, &loc); + lexer_advance(lexer, pos - lexer.pos + 2); + return; + } + + puts("Unknown character in lexer_next: '"); putc(c); putsln("'"); + die("Exiting"); + } + } + return lexer_make_token(lexer, token, TOKEN_EOF, 0); +} + +fn lexer_next_assert(lexer: Lexer*, token: Token*, expected: int) { + lexer_next(lexer, token); + if (token.typ != expected) { + location_print(&token.loc); + puts(": Expected "); puts(token_type_to_string(expected)); + puts(" but got "); puts(token_type_to_string(token.typ)); + putc('\n'); + exit(1); + } +} + +fn lexer_peek(lexer: Lexer*, token: Token*) { + let pos = lexer.pos; + let col = lexer.col; + let line = lexer.line; + lexer_next(lexer, token); + lexer.pos = pos; + lexer.col = col; + lexer.line = line; +}
\ No newline at end of file diff --git a/compiler/main.cup b/compiler/main.cup new file mode 100644 index 0000000..a0a3476 --- /dev/null +++ b/compiler/main.cup @@ -0,0 +1,34 @@ +import "std/file.cup" +import "compiler/lexer.cup" +import "compiler/parser.cup" +import "compiler/codegen.cup" + +fn main(argc: int, argv: char **): int { + if (argc != 2) + die("Usage: cupcc <input_file>"); + + let input_file = fopen(argv[1], 'r'); + defer fclose(input_file); + + // using `fmap` here doesn't work on linux, for some reason. + let file_size = fsize(input_file); + let src: char* = malloc(file_size+1); + fread(input_file, src, file_size); + src[file_size] = '\0'; + + let lexer = lexer_new(argv[1], src, file_size); + let ast = parse_program(lexer); + + dump_ast(ast, 0); + + let out_file = fopen("build/host.nasm", 'w'); + defer fclose(out_file); + + generate_program(ast, out_file); + + puts("---------------------------\n"); + + puts("Total amount of memory used by malloc: "); + putu(__malloc_buf_pos); + putsln("\nDone."); +}
\ No newline at end of file diff --git a/compiler/parser.cup b/compiler/parser.cup new file mode 100644 index 0000000..48e4514 --- /dev/null +++ b/compiler/parser.cup @@ -0,0 +1,433 @@ +import "compiler/ast.cup" +import "compiler/lexer.cup" + +// p_ prefix for parser global variables. + +let p_all_functions = vector_new(); + +let p_block_stack = vector_new(); +let p_cur_stack_offset = 0; + +fn parse_literal(lexer: Lexer*): Node* { + let token: Token; + lexer_next(lexer, &token); + let node = node_new(AST_LITERAL); + + if (token.typ == TOKEN_INTLIT) { + node.d.literal.as_int = token.value.as_int; + node.etyp = type_new(TYPE_INT); + } else if (token.typ == TOKEN_STRINGLIT) { + node.d.literal.as_string = token.value.as_string; + node.etyp = type_new_ptr(TYPE_CHAR); + } else if (token.typ == TOKEN_CHARLIT) { + node.d.literal.as_char = token.value.as_char; + node.etyp = type_new(TYPE_CHAR); + } else { + die_loc2(&token.loc, "Unexpected token in parse_literal: ", token_type_to_string(token.typ)); + } + return node; +} + +fn parse_type(lexer: Lexer*): Type* { + let token: Token; + let typ: Type *; + lexer_peek(lexer, &token); + if (token.typ == TOKEN_INT) { + lexer_next(lexer, &token); + typ = type_new(TYPE_INT); + } else if (token.typ == TOKEN_CHAR) { + lexer_next(lexer, &token); + typ = type_new(TYPE_CHAR); + } else if (token.typ == TOKEN_VOID) { + lexer_next(lexer, &token); + typ = type_new(TYPE_VOID); + } + + let running = true; + while (running) { + lexer_peek(lexer, &token); + if (token.typ == TOKEN_STAR) { + lexer_next(lexer, &token); + let ptr = type_new(TYPE_PTR); + ptr.ptr = typ; + typ = ptr; + } else if (token.typ == TOKEN_OPEN_BRACKET) { + die("Array types not yet implemented"); + } else { + running = false; + } + } + return typ; +} + +// pragma region expressions +fn parse_expression(lexer: Lexer*): Node*; + +fn parse_factor(lexer: Lexer*): Node* { + let token: Token; + let expr: Node*; + lexer_peek(lexer, &token); + + if (token.typ == TOKEN_MINUS) { + 
lexer_next(lexer, &token); + expr = node_new(AST_NEG); + expr.d.unary = parse_factor(lexer); + + } else if (token.typ == TOKEN_TILDE) { + lexer_next(lexer, &token); + expr = node_new(AST_BWINV); + expr.d.unary = parse_factor(lexer); + + } else if (token.typ == TOKEN_EXCLAMATION) { + lexer_next(lexer, &token); + expr = node_new(AST_NOT); + expr.d.unary = parse_factor(lexer); + + } else if (token.typ == TOKEN_OPEN_PAREN) { + lexer_next(lexer, &token); + expr = parse_expression(lexer); + lexer_next_assert(lexer, &token, TOKEN_CLOSE_PAREN); + + } else if (is_literal_token(token.typ)) { + expr = parse_literal(lexer); + + } else { + die_loc2(&token.loc, ": Unexpected token found in parse_factor: ", token_type_to_string(token.typ)); + } + return expr; +} + +// This is absolutely terrible, but I'm not sure how to do it better without macros... +fn parse_term(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_factor(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_STAR || token.typ == TOKEN_SLASH || token.typ == TOKEN_PERCENT) { + lexer_next(lexer, &token); + let op = node_new(binary_token_to_op(token.typ)); + let rhs = parse_factor(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_additive(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_term(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_PLUS || token.typ == TOKEN_MINUS) { + lexer_next(lexer, &token); + let op = node_new(binary_token_to_op(token.typ)); + let rhs = parse_term(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_relational(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_additive(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_LT || token.typ == TOKEN_LEQ || + token.typ == TOKEN_GT || token.typ == TOKEN_GEQ) { + lexer_next(lexer, &token); + let op = 
node_new(binary_token_to_op(token.typ)); + let rhs = parse_additive(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_equality(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_relational(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_EQ || token.typ == TOKEN_NEQ) { + lexer_next(lexer, &token); + let op = node_new(binary_token_to_op(token.typ)); + let rhs = parse_relational(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_and(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_equality(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_AMPERSAND) { + lexer_next(lexer, &token); + let op = node_new(binary_token_to_op(token.typ)); + let rhs = parse_equality(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_exclusive_or(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_and(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_CARET) { + lexer_next(lexer, &token); + let op = node_new(binary_token_to_op(token.typ)); + let rhs = parse_and(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_inclusive_or(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_exclusive_or(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_BAR) { + lexer_next(lexer, &token); + let op = node_new(binary_token_to_op(token.typ)); + let rhs = parse_exclusive_or(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_logical_and(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_inclusive_or(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_AND) { + lexer_next(lexer, 
&token); + let op = node_new(binary_token_to_op(token.typ)); + let rhs = parse_inclusive_or(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_logical_or(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_logical_and(lexer); + lexer_peek(lexer, &token); + while (token.typ == TOKEN_OR) { + lexer_next(lexer, &token); + let op = node_new(binary_token_to_op(token.typ)); + let rhs = parse_logical_and(lexer); + op.d.binary.lhs = lhs; + op.d.binary.rhs = rhs; + lhs = op; + lexer_peek(lexer, &token); + } + return lhs; +} + +fn parse_conditional_exp(lexer: Lexer*): Node* { + let token: Token; + + let lhs = parse_logical_or(lexer); + lexer_peek(lexer, &token); + if (token.typ == TOKEN_QUESTION) { + lexer_next(lexer, &token); + let then = parse_expression(lexer); + lexer_next_assert(lexer, &token, TOKEN_COLON); + let els = parse_expression(lexer); + + let cond = node_new(AST_CONDITIONAL); + cond.d.conditional.cond = lhs; + cond.d.conditional.then = then; + cond.d.conditional.els = els; + + lhs = cond; + } + return lhs; +} + +fn parse_expression(lexer: Lexer*): Node* { + return parse_conditional_exp(lexer); +} + +fn parse_var_declaration(lexer: Lexer*): Node* { + let token: Token; + lexer_next_assert(lexer, &token, TOKEN_LET); + lexer_next_assert(lexer, &token, TOKEN_IDENTIFIER); + // TODO: check if identifier is already defined + let node = node_new(AST_VARDECL); + node.d.var_decl.var.name = token.value.as_string; + + lexer_peek(lexer, &token); + let has_type = false; + if (token.typ == TOKEN_COLON) { + lexer_next(lexer, &token); + has_type = true; + node.d.var_decl.var.typ = parse_type(lexer); + lexer_peek(lexer, &token); + } + + if (token.typ == TOKEN_ASSIGN) { + lexer_next(lexer, &token); + node.d.var_decl.init = parse_expression(lexer); + } else if (!has_type) { + die_loc(&token.loc, "Expected ':' or '=' after variable declaration"); + } + + return node; +} + +fn 
parse_function_params(lexer: Lexer*, func: Node*) { + let token: Token; + lexer_peek(lexer, &token); + // TODO: Actually parse params + while (token.typ != TOKEN_CLOSE_PAREN) { + lexer_next(lexer, &token); + } +} + +fn parse_block(lexer: Lexer*): Node*; + + +fn parse_statement(lexer: Lexer*): Node* { + let node: Node*; + let token: Token; + + lexer_peek(lexer, &token); + if (token.typ == TOKEN_OPEN_BRACE) { + node = parse_block(lexer); + + } else if (token.typ == TOKEN_RETURN) { + lexer_next(lexer, &token); + node = node_new(AST_RETURN); + + lexer_peek(lexer, &token); + if (token.typ != TOKEN_SEMICOLON) { + node.d.unary = parse_expression(lexer); + } else { + node.d.unary = null; // empty return statment + } + lexer_next_assert(lexer, &token, TOKEN_SEMICOLON); + + } else if (token.typ == TOKEN_IF) { + lexer_next(lexer, &token); + + node = node_new(AST_IF); + + lexer_next_assert(lexer, &token, TOKEN_OPEN_PAREN); + node.d.conditional.cond = parse_expression(lexer); + lexer_next_assert(lexer, &token, TOKEN_CLOSE_PAREN); + node.d.conditional.then = parse_statement(lexer); + + lexer_peek(lexer, &token); + if (token.typ == TOKEN_ELSE) { + lexer_next(lexer, &token); + node.d.conditional.els = parse_statement(lexer); + } + } else if (token.typ == TOKEN_WHILE) { + die("while is not implemented yet"); + } else if (token.typ == TOKEN_FOR) { + die("for is not implemented yet"); + } else if (token.typ == TOKEN_DEFER) { + die("defer is not implemented yet"); + } else if (token.typ == TOKEN_LET) { + node = parse_var_declaration(lexer); + lexer_next_assert(lexer, &token, TOKEN_SEMICOLON); + } else { + // Default to expression statement + node = parse_expression(lexer); + lexer_next_assert(lexer, &token, TOKEN_SEMICOLON); + } + return node; +} + +fn parse_block(lexer: Lexer*): Node* { + let token: Token; + lexer_next_assert(lexer, &token, TOKEN_OPEN_BRACE); + + let block = node_new(AST_BLOCK); + block.d.block.children = vector_new(); + + lexer_peek(lexer, &token); + while 
(token.typ != TOKEN_CLOSE_BRACE) { + block_add_child(block, parse_statement(lexer)); + lexer_peek(lexer, &token); + } + lexer_next_assert(lexer, &token, TOKEN_CLOSE_BRACE); + return block; +} + +fn parse_function(lexer: Lexer*): Node* { + let token: Token; + + lexer_next_assert(lexer, &token, TOKEN_FN); + lexer_next_assert(lexer, &token, TOKEN_IDENTIFIER); + // TODO: Check if identifier exists + let node = node_new(AST_FUNC); + node.d.func.name = token.value.as_string; + + lexer_next_assert(lexer, &token, TOKEN_OPEN_PAREN); + parse_function_params(lexer, node); + lexer_next_assert(lexer, &token, TOKEN_CLOSE_PAREN); + + lexer_peek(lexer, &token); + if (token.typ == TOKEN_COLON) { + lexer_next(lexer, &token); + node.etyp = parse_type(lexer); + } else { + node.etyp = type_new(TYPE_VOID); + } + + node.d.func.body = parse_block(lexer); + return node; +} + +fn parse_program(lexer: Lexer*): Node* { + let node = node_new(AST_PROGRAM); + node.d.block.children = vector_new(); + + let token: Token; + lexer_peek(lexer, &token); + + while (token.typ != TOKEN_EOF) { + if (token.typ == TOKEN_FN) { + block_add_child(node, parse_function(lexer)); + } else if (token.typ == TOKEN_LET) { + block_add_child(node, parse_var_declaration(lexer)); + } else if (token.typ == TOKEN_SEMICOLON) { + lexer_next(lexer, &token); + } else { + die_loc2(&token.loc, "unexpected token in parse_program", token_type_to_string(token.typ)); + } + + lexer_peek(lexer, &token); + } + return node; +}
\ No newline at end of file diff --git a/compiler/tokens.cup b/compiler/tokens.cup new file mode 100644 index 0000000..e991610 --- /dev/null +++ b/compiler/tokens.cup @@ -0,0 +1,238 @@ +import "std/common.cup" + +enum TokenType { + TOKEN_AMPERSAND, + TOKEN_AND, + TOKEN_ASSIGN, + TOKEN_BAR, + TOKEN_CARET, + TOKEN_CHARLIT, + TOKEN_CLOSE_BRACE, + TOKEN_CLOSE_BRACKET, + TOKEN_CLOSE_PAREN, + TOKEN_COLON, + TOKEN_COMMA, + TOKEN_DOT, + TOKEN_EOF, + TOKEN_EQ, + TOKEN_EXCLAMATION, + TOKEN_GEQ, + TOKEN_GT, + TOKEN_IDENTIFIER, + TOKEN_INTLIT, + TOKEN_LEQ, + TOKEN_LSHIFT, + TOKEN_LT, + TOKEN_MINUS, + TOKEN_MINUSEQUALS, + TOKEN_MINUSMINUS, + TOKEN_NEQ, + TOKEN_OPEN_BRACE, + TOKEN_OPEN_BRACKET, + TOKEN_OPEN_PAREN, + TOKEN_OR, + TOKEN_PERCENT, + TOKEN_PLUS, + TOKEN_PLUSEQUALS, + TOKEN_PLUSPLUS, + TOKEN_QUESTION, + TOKEN_RSHIFT, + TOKEN_SEMICOLON, + TOKEN_SLASH, + TOKEN_STAR, + TOKEN_STRINGLIT, + TOKEN_TILDE, + TOKEN_XOR, + + // Keywords go below: + TOKEN__KEYWORD_BEGIN, + TOKEN_CHAR, + TOKEN_CONST, + TOKEN_ENUM, + TOKEN_ELSE, + TOKEN_DEFER, + TOKEN_FN, + TOKEN_FOR, + TOKEN_IF, + TOKEN_INT, + TOKEN_LET, + TOKEN_RETURN, + TOKEN_STRUCT, + TOKEN_UNION, + TOKEN_VOID, + TOKEN_WHILE, + TOKEN_IMPORT, + TOKEN__KEYWORD_END, +}; + +struct Location { + filename: char*; + line: int; + col: int; +}; + +struct Token { + typ: int; + loc: Location; + value: union { + as_int: int; + as_string: char*; + as_char: char; + }; +}; + +fn token_type_to_string(typ: int): char* { + if (typ == TOKEN_AMPERSAND) return "TOKEN_AMPERSAND"; + if (typ == TOKEN_AND) return "TOKEN_AND"; + if (typ == TOKEN_ASSIGN) return "TOKEN_ASSIGN"; + if (typ == TOKEN_BAR) return "TOKEN_BAR"; + if (typ == TOKEN_CARET) return "TOKEN_CARET"; + if (typ == TOKEN_CHARLIT) return "TOKEN_CHARLIT"; + if (typ == TOKEN_CLOSE_BRACE) return "TOKEN_CLOSE_BRACE"; + if (typ == TOKEN_CLOSE_BRACKET) return "TOKEN_CLOSE_BRACKET"; + if (typ == TOKEN_CLOSE_PAREN) return "TOKEN_CLOSE_PAREN"; + if (typ == TOKEN_COLON) return "TOKEN_COLON"; + if (typ 
== TOKEN_COMMA) return "TOKEN_COMMA"; + if (typ == TOKEN_DOT) return "TOKEN_DOT"; + if (typ == TOKEN_EOF) return "TOKEN_EOF"; + if (typ == TOKEN_EQ) return "TOKEN_EQ"; + if (typ == TOKEN_EXCLAMATION) return "TOKEN_EXCLAMATION"; + if (typ == TOKEN_GEQ) return "TOKEN_GEQ"; + if (typ == TOKEN_GT) return "TOKEN_GT"; + if (typ == TOKEN_IDENTIFIER) return "TOKEN_IDENTIFIER"; + if (typ == TOKEN_INTLIT) return "TOKEN_INTLIT"; + if (typ == TOKEN_LEQ) return "TOKEN_LEQ"; + if (typ == TOKEN_LSHIFT) return "TOKEN_LSHIFT"; + if (typ == TOKEN_LT) return "TOKEN_LT"; + if (typ == TOKEN_MINUS) return "TOKEN_MINUS"; + if (typ == TOKEN_MINUSEQUALS) return "TOKEN_MINUSEQUALS"; + if (typ == TOKEN_MINUSMINUS) return "TOKEN_MINUSMINUS"; + if (typ == TOKEN_NEQ) return "TOKEN_NEQ"; + if (typ == TOKEN_OPEN_BRACE) return "TOKEN_OPEN_BRACE"; + if (typ == TOKEN_OPEN_BRACKET) return "TOKEN_OPEN_BRACKET"; + if (typ == TOKEN_OPEN_PAREN) return "TOKEN_OPEN_PAREN"; + if (typ == TOKEN_OR) return "TOKEN_OR"; + if (typ == TOKEN_PERCENT) return "TOKEN_PERCENT"; + if (typ == TOKEN_PLUS) return "TOKEN_PLUS"; + if (typ == TOKEN_PLUSEQUALS) return "TOKEN_PLUSEQUALS"; + if (typ == TOKEN_PLUSPLUS) return "TOKEN_PLUSPLUS"; + if (typ == TOKEN_QUESTION) return "TOKEN_QUESTION"; + if (typ == TOKEN_RSHIFT) return "TOKEN_RSHIFT"; + if (typ == TOKEN_SEMICOLON) return "TOKEN_SEMICOLON"; + if (typ == TOKEN_SLASH) return "TOKEN_SLASH"; + if (typ == TOKEN_STAR) return "TOKEN_STAR"; + if (typ == TOKEN_STRINGLIT) return "TOKEN_STRINGLIT"; + if (typ == TOKEN_TILDE) return "TOKEN_TILDE"; + if (typ == TOKEN_XOR) return "TOKEN_XOR"; + if (typ == TOKEN_CHAR) return "TOKEN_CHAR"; + if (typ == TOKEN_CONST) return "TOKEN_CONST"; + if (typ == TOKEN_ENUM) return "TOKEN_ENUM"; + if (typ == TOKEN_ELSE) return "TOKEN_ELSE"; + if (typ == TOKEN_DEFER) return "TOKEN_DEFER"; + if (typ == TOKEN_FN) return "TOKEN_FN"; + if (typ == TOKEN_FOR) return "TOKEN_FOR"; + if (typ == TOKEN_IF) return "TOKEN_IF"; + if (typ == TOKEN_INT) return 
"TOKEN_INT"; + if (typ == TOKEN_LET) return "TOKEN_LET"; + if (typ == TOKEN_RETURN) return "TOKEN_RETURN"; + if (typ == TOKEN_STRUCT) return "TOKEN_STRUCT"; + if (typ == TOKEN_UNION) return "TOKEN_UNION"; + if (typ == TOKEN_VOID) return "TOKEN_VOID"; + if (typ == TOKEN_WHILE) return "TOKEN_WHILE"; + if (typ == TOKEN_IMPORT) return "TOKEN_IMPORT"; + + putsln("\nUnknown token type in token_type_to_string: "); print(typ); + exit(1); +} + +fn keyword_to_string(typ: int): char* { + if (typ == TOKEN_CHAR) return "char"; + if (typ == TOKEN_CONST) return "const"; + if (typ == TOKEN_ENUM) return "enum"; + if (typ == TOKEN_ELSE) return "else"; + if (typ == TOKEN_DEFER) return "defer"; + if (typ == TOKEN_FN) return "fn"; + if (typ == TOKEN_FOR) return "for"; + if (typ == TOKEN_IF) return "if"; + if (typ == TOKEN_INT) return "int"; + if (typ == TOKEN_LET) return "let"; + if (typ == TOKEN_RETURN) return "return"; + if (typ == TOKEN_STRUCT) return "struct"; + if (typ == TOKEN_UNION) return "union"; + if (typ == TOKEN_VOID) return "void"; + if (typ == TOKEN_WHILE) return "while"; + if (typ == TOKEN_IMPORT) return "import"; + + puts("Unknown keyword in keyword_to_string: "); + putsln(token_type_to_string(typ)); + exit(1); +} + +fn location_init(loc: Location*, filename: char*, line: int, col: int) { + loc.filename = filename; + loc.line = line; + loc.col = col; +} + +fn location_print(loc: Location *) { + puts(loc.filename); + putc(':'); + putu(loc.line + 1); + putc(':'); + putu(loc.col + 1); +} + +fn die_loc2(loc: Location*, msg1: char *, msg2: char *) { + location_print(loc); + puts(": "); + puts(msg1); + putsln(msg2); + exit(1); +} + +fn die_loc(loc: Location*, msg: char *) { + die_loc2(loc, msg, ""); +} + +fn token_from_type(token: Token*, typ: int, loc: Location *) { + token.typ = typ; +} + +fn token_from_int(token: Token*, val: int, loc: Location *) { + token.typ = TOKEN_INTLIT; + token.value.as_int = val; + token.loc.filename = loc.filename; + token.loc.line = loc.line; + 
token.loc.col = loc.col; +} + +fn token_from_string(token: Token*, str: char *, loc: Location *) { + token.typ = TOKEN_STRINGLIT; + token.value.as_string = str; + token.loc.filename = loc.filename; + token.loc.line = loc.line; + token.loc.col = loc.col; +} + +fn token_from_char(token: Token*, c: char, loc: Location *) { + token.typ = TOKEN_CHARLIT; + token.value.as_char = c; + token.loc.filename = loc.filename; + token.loc.line = loc.line; + token.loc.col = loc.col; +} + +fn token_from_identifier(token: Token*, str: char *, loc: Location *) { + token.typ = TOKEN_IDENTIFIER; + token.value.as_string = str; + token.loc.filename = loc.filename; + token.loc.line = loc.line; + token.loc.col = loc.col; +} + +fn is_literal_token(typ: int): int { + if (typ == TOKEN_INTLIT) return true; + if (typ == TOKEN_CHARLIT) return true; + if (typ == TOKEN_STRINGLIT) return true; + return false; +}
\ No newline at end of file diff --git a/compiler/types.cup b/compiler/types.cup new file mode 100644 index 0000000..f3c7b38 --- /dev/null +++ b/compiler/types.cup @@ -0,0 +1,82 @@ +import "std/common.cup" + +enum BaseType { + TYPE_VOID, + TYPE_ANY, + + TYPE_PTR, + TYPE_ARRAY, + TYPE_STRUCT, + TYPE_UNION, + + TYPE_INT, + TYPE_CHAR, +}; + +struct Type { + typ: int; + ptr: Type*; + struct_name: char*; + size: int; + array_size: int; + fields: struct { + names: char**; + types: Type**; + num_fields: int; + }; +}; + +fn size_for_base_type(type: int): int { + if (type == TYPE_INT) return 8; + if (type == TYPE_PTR) return 8; + if (type == TYPE_CHAR) return 1; + // Need to be initialized explicitly for compound types + return 0; +} + +let _type_int: Type* = null; +let _type_char: Type* = null; +let _type_void: Type* = null; +let _type_any: Type* = null; + +fn type_new(typ: int): Type* { + if (_type_int == null) { _type_int = malloc(sizeof(Type)); _type_int.typ = TYPE_INT; _type_int.size = 8; } + if (_type_char == null) { _type_char = malloc(sizeof(Type)); _type_char.typ = TYPE_CHAR; _type_char.size = 1; } + if (_type_void == null) { _type_void = malloc(sizeof(Type)); _type_void.typ = TYPE_VOID; _type_void.size = 0; } + if (_type_any == null) { _type_any = malloc(sizeof(Type)); _type_any.typ = TYPE_ANY; _type_any.size = 8; } + + if (typ == TYPE_INT) return _type_int; + if (typ == TYPE_CHAR) return _type_char; + if (typ == TYPE_VOID) return _type_void; + if (typ == TYPE_ANY) return _type_any; + + putsln("Allocating Type*"); + + let t: Type* = malloc(sizeof(Type)); + t.typ = typ; + t.size = size_for_base_type(typ); + return t; +} + +fn type_new_ptr(typ: int): Type* { + let ptr = type_new(TYPE_PTR); + ptr.ptr = type_new(typ); + return ptr; +} + +// This is named differently because it performs an allocation +fn create_type_string(typ: Type *): char* { + let buf: char* = malloc(32); + while (typ.typ == TYPE_PTR || typ.typ == TYPE_ARRAY) { + strcat(buf, typ.typ == TYPE_PTR ? 
"*" : "[]"); + typ = typ.ptr; + } + + if (typ.typ == TYPE_INT) strcat(buf, "int"); + else if (typ.typ == TYPE_CHAR) strcat(buf, "char"); + else if (typ.typ == TYPE_VOID) strcat(buf, "void"); + else if (typ.typ == TYPE_ANY) strcat(buf, "any"); + else die("type_to_string: unknown type"); + + return buf; +}
\ No newline at end of file @@ -0,0 +1,28 @@ +#!/bin/bash + +# This script does the following: +# 1. Builds the project +# 2. Compiles selected file +# 3. Assembles executable from compiled asm +# 4. Runs the executable +# 5. Echoes the output of the executable + +if [ -z "$1" ] +then + echo "Usage: $0 <arguments to cupcc>" + exit 1 +fi + +set -xe + +make +build/cupcc compiler/main.cup -o build/cup.nasm +make build/cup.out +build/cup.out "$@" +make build/host.out + +set +e + +build/host.out + +echo "Exit status: $?"
\ No newline at end of file |