diff options
| author | Mustafa Quraish <[email protected]> | 2022-02-02 23:49:46 -0500 |
|---|---|---|
| committer | Mustafa Quraish <[email protected]> | 2022-02-02 23:49:46 -0500 |
| commit | 3f083b4286d8e2ed990d72f61febb7f5f4f96626 (patch) | |
| tree | 553680d0ed918853d06e7843f6c40bcb54e911fa | |
| parent | Remove default initialization to 0 for variable declarations (diff) | |
| download | cup-3f083b4286d8e2ed990d72f61febb7f5f4f96626.tar.xz cup-3f083b4286d8e2ed990d72f61febb7f5f4f96626.zip | |
Add support for `char` type + string/char literals
This commit does a few things in one go:
- Add support for a `char` type + some changes to support the new size
- Add support for character literals. The lexer translates escape
sequences here so that `\n`, `\0`, etc. can be used.
- Add support for string literals. These are all stored in the `.data`
section. Fortunately NASM already handles the escape characters inside
backquoted strings.
- Fix some bugs with code generation, specifically by using `movsx` to
sign-extend the smaller types into 64-bit registers.
| -rw-r--r-- | src/ast.h | 2 | ||||
| -rw-r--r-- | src/generator.c | 54 | ||||
| -rw-r--r-- | src/lexer.c | 27 | ||||
| -rw-r--r-- | src/parser.c | 30 | ||||
| -rw-r--r-- | src/tokens.c | 22 | ||||
| -rw-r--r-- | src/tokens.h | 5 | ||||
| -rw-r--r-- | src/types.c | 28 | ||||
| -rw-r--r-- | src/types.h | 6 |
8 files changed, 153 insertions, 21 deletions
@@ -107,6 +107,8 @@ typedef struct ast_node { Type *type; union { int as_int; + char as_char; + char *as_string; }; } literal; diff --git a/src/generator.c b/src/generator.c index 4814f4f..f422657 100644 --- a/src/generator.c +++ b/src/generator.c @@ -24,7 +24,17 @@ void make_syscall(i64 syscall_no, FILE *out) { fprintf(out, " syscall\n"); } -char *specifier_for_type(Type *type) { +static char *subregister_for_type(Type *type) { + switch (size_for_type(type)) { + case 1: return "al"; + case 2: return "ax"; + case 4: return "eax"; + case 8: return "rax"; + default: assert(false && "Unreachable"); + } +} + +static char *specifier_for_type(Type *type) { switch (size_for_type(type)) { case 1: return "byte"; case 2: return "word"; @@ -34,6 +44,7 @@ char *specifier_for_type(Type *type) { } } + void generate_expr_into_rax(Node *expr, FILE *out); void generate_lvalue_into_rax(Node *node, FILE *out) @@ -70,21 +81,46 @@ void generate_func_call(Node *node, FILE *out) fprintf(out, " add rsp, %lld\n", total_size); } +char **all_string_literals = NULL; +i64 num_string_literals = 0; + +void generate_literal_into_rax(Node *node, FILE *out) +{ + assert(node->type == AST_LITERAL); + if (node->expr_type->type == TYPE_INT) { + fprintf(out, " mov rax, %d\n", node->literal.as_int); + } else if (node->expr_type->type == TYPE_CHAR) { + fprintf(out, " mov rax, %d\n", (int)node->literal.as_char); + } else if (node->expr_type->type == TYPE_PTR) { + // Add string to global string table + char *str = node->literal.as_string; + // TODO: Use a hash table here + all_string_literals = realloc(all_string_literals, sizeof(char *) * (num_string_literals + 1)); + all_string_literals[num_string_literals] = str; + fprintf(out, " mov rax, global_string_%lld\n", num_string_literals); + num_string_literals++; + } else { + assert(false && "Unknown literal type in generate_literal_into_rax"); + } +} + // The evaluated expression is stored into `rax` void generate_expr_into_rax(Node *expr, FILE *out) { // 
TODO: Different sized output for different types? if (expr->type == AST_LITERAL) { - // TODO: More literal types - assert(expr->literal.type->type == TYPE_INT); - fprintf(out, " mov rax, %d\n", expr->literal.as_int); + generate_literal_into_rax(expr, out); } else if (expr->type == AST_FUNCCALL) { generate_func_call(expr, out); } else if (is_lvalue(expr->type)) { generate_lvalue_into_rax(expr, out); - fprintf(out, " mov rax, [rax]\n"); + if (size_for_type(expr->expr_type) == 8) { + fprintf(out, " mov rax, [rax]\n"); + } else { + fprintf(out, " movsx rax, %s [rax]\n", specifier_for_type(expr->expr_type)); + } } else if (expr->type == OP_ADDROF) { generate_lvalue_into_rax(expr->unary_expr, out); @@ -95,7 +131,7 @@ void generate_expr_into_rax(Node *expr, FILE *out) fprintf(out, " push rax\n"); generate_expr_into_rax(expr->assign.value, out); fprintf(out, " pop rbx\n"); - fprintf(out, " mov %s [rbx], rax\n", specifier_for_type(var->expr_type)); + fprintf(out, " mov [rbx], %s\n", subregister_for_type(var->expr_type)); } else if (expr->type == OP_NEG) { generate_expr_into_rax(expr->unary_expr, out); @@ -420,6 +456,12 @@ void generate_asm(Node *root, FILE *out) fprintf(out, "section .bss\n"); fprintf(out, " global_vars: resb %lld\n", root->block.locals_size); + + // Global strings + fprintf(out, "section .data\n"); + for (i64 i = 0; i < num_string_literals; i++) { + fprintf(out, " global_string_%lld: db `%s`, 0\n", i, all_string_literals[i]); + } } void generate_builtins(FILE *out) diff --git a/src/lexer.c b/src/lexer.c index 3e71343..d1201b2 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -4,6 +4,7 @@ #include <stdlib.h> #include <stdio.h> #include "utils.h" +#include <assert.h> Lexer *Lexer_new(char *filename, char *src, i64 len) { @@ -98,6 +99,18 @@ static Token Lexer_make_token(Lexer *lexer, TokenType type, int inc_amount) return token; } +static char get_escaped(char c) { + switch (c) + { + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 
'\\': return '\\'; + case '0': return '\0'; + } + assert(false && "Unknown escape sequence"); +} + Token Lexer_next(Lexer *lexer) { while (lexer->pos < lexer->len) { @@ -227,6 +240,20 @@ Token Lexer_next(Lexer *lexer) return token; } + if (lexer->src[lexer->pos] == '\'') { + i64 pos = lexer->pos + 1; + // TODO: Handle malformed / incomplete literals + // TODO: Handle escapes + char c = lexer->src[pos]; + if (c == '\\') { + pos++; + c = get_escaped(lexer->src[pos]); + } + Token token = Token_from_char(c, Lexer_loc(lexer)); + advance(lexer, pos - lexer->pos + 2); + return token; + } + die_location(Lexer_loc(lexer), ": ERROR: Unexpected character '%c'\n", lexer->src[lexer->pos]); advance(lexer, 1); diff --git a/src/parser.c b/src/parser.c index 435191d..3ee8abf 100644 --- a/src/parser.c +++ b/src/parser.c @@ -74,19 +74,20 @@ Node *builtin_putc; void initialize_builtins() { + // FIXME: The `TYPE_ANY` is a hack builtin_print = Node_new(AST_BUILTIN); builtin_print->func.name = "print"; builtin_print->func.return_type = type_new(TYPE_INT); builtin_print->func.num_args = 1; builtin_print->func.args = (Variable *)calloc(sizeof(Variable), 1); - builtin_print->func.args[0] = (Variable){"val", type_new(TYPE_INT), 0}; + builtin_print->func.args[0] = (Variable){"val", type_new(TYPE_ANY), 0}; builtin_putc = Node_new(AST_BUILTIN); builtin_putc->func.name = "putc"; builtin_putc->func.return_type = type_new(TYPE_INT); builtin_putc->func.num_args = 1; builtin_putc->func.args = (Variable *)calloc(sizeof(Variable), 2); - builtin_putc->func.args[0] = (Variable){"arg", type_new(TYPE_INT), 0}; + builtin_putc->func.args[0] = (Variable){"arg", type_new(TYPE_ANY), 0}; } Node *find_builtin_function(Token *token) @@ -184,10 +185,13 @@ Type *parse_type(Lexer *lexer) Type *type; Token token = Lexer_peek(lexer); if (token.type == TOKEN_INT) { + Lexer_next(lexer); type = type_new(TYPE_INT); + } else if (token.type == TOKEN_CHAR) { Lexer_next(lexer); + type = type_new(TYPE_CHAR); } else { - type = 
type_new(TYPE_NONE); + die_location(token.loc, "Unexpected type found: %s", token_type_to_str(token.type)); } for (;;) { @@ -218,10 +222,22 @@ Type *parse_type(Lexer *lexer) Node *parse_literal(Lexer *lexer) { Node *node = Node_new(AST_LITERAL); - Token token = assert_token(Lexer_next(lexer), TOKEN_INTLIT); - node->literal.type = type_new(TYPE_INT); + + Token token = Lexer_next(lexer); + if (token.type == TOKEN_INTLIT) { + node->literal.type = type_new(TYPE_INT); + node->literal.as_int = token.value.as_int; + } else if (token.type == TOKEN_STRINGLIT) { + node->literal.type = type_new(TYPE_PTR); + node->literal.type->ptr = type_new(TYPE_CHAR); + node->literal.as_string = token.value.as_string; + } else if (token.type == TOKEN_CHARLIT) { + node->literal.type = type_new(TYPE_CHAR); + node->literal.as_char = token.value.as_char; + } else { + assert(false && "Invalid literal type in parse_literal\n"); + } node->expr_type = node->literal.type; - node->literal.as_int = token.value.as_int; return node; } @@ -380,7 +396,7 @@ Node *parse_factor(Lexer *lexer) Lexer_next(lexer); expr = parse_expression(lexer); assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN); - } else if (token.type == TOKEN_INTLIT) { + } else if (is_literal_token(token.type)) { expr = parse_literal(lexer); } else if (token.type == TOKEN_IDENTIFIER) { expr = parse_identifier(lexer); diff --git a/src/tokens.c b/src/tokens.c index 2d99f71..c90cfd9 100644 --- a/src/tokens.c +++ b/src/tokens.c @@ -28,6 +28,15 @@ Token Token_from_string(char *value, Location loc) return token; } +Token Token_from_char(char value, Location loc) +{ + Token token = {0}; + token.type = TOKEN_CHARLIT; + token.value.as_char = value; + token.loc = loc; + return token; +} + Token Token_from_identifier(char *value, Location loc) { Token token = {0}; @@ -68,4 +77,17 @@ void Token_print(FILE *f, Token *token) } fprintf(f, "%s", token_type_to_str(token->type)); +} + +bool is_literal_token(TokenType type) +{ + switch (type) + { + case 
TOKEN_INTLIT: + case TOKEN_STRINGLIT: + case TOKEN_CHARLIT: + return true; + default: + return false; + } }
\ No newline at end of file diff --git a/src/tokens.h b/src/tokens.h index f076b89..f02cfb0 100644 --- a/src/tokens.h +++ b/src/tokens.h @@ -8,6 +8,7 @@ F(TOKEN_AND, "&&") \ F(TOKEN_ASSIGN, "=") \ F(TOKEN_BAR, "|") \ + F(TOKEN_CHARLIT, "char literal") \ F(TOKEN_CLOSE_BRACE, "}") \ F(TOKEN_CLOSE_BRACKET, "]") \ F(TOKEN_CLOSE_PAREN, ")") \ @@ -45,6 +46,7 @@ F(TOKEN_XOR, "^") #define ENUM_KEYWORDS(F) \ + F(TOKEN_CHAR, "char") \ F(TOKEN_ELSE, "else") \ F(TOKEN_DEFER, "defer") \ F(TOKEN_FN, "fn") \ @@ -83,9 +85,12 @@ typedef struct { char *token_type_to_str(TokenType type); +bool is_literal_token(TokenType type); + Token Token_from_type(TokenType type, Location loc); Token Token_from_int(i64 value, Location loc); Token Token_from_string(char *value, Location loc); +Token Token_from_char(char value, Location loc); Token Token_from_identifier(char *value, Location loc); void Token_print(FILE *f, Token *token);
\ No newline at end of file diff --git a/src/types.c b/src/types.c index 8876b22..9699f20 100644 --- a/src/types.c +++ b/src/types.c @@ -12,6 +12,8 @@ bool type_equals(Type *a, Type *b) return true; if (a == NULL || b == NULL) return false; + if (a->type == TYPE_ANY || b->type == TYPE_ANY) + return true; return a->type == b->type && type_equals(a->ptr, b->ptr); } @@ -21,8 +23,12 @@ i64 size_for_type(Type *type) { case TYPE_INT: return 8; case TYPE_PTR: return 8; + case TYPE_CHAR: return 1; case TYPE_ARRAY: return type->array_size * size_for_type(type->ptr); - default: assert(false && "Unreachable type"); + default: { + printf("Unknown type: %d\n", type->type); + assert(false && "Unreachable type"); + } } } @@ -31,13 +37,24 @@ Type *type_new(DataType type) // For the core types, we don't need to allocate any memory, just // return a pointer to a static instance. static Type type_int = {.type = TYPE_INT, .ptr = NULL}; + static Type type_char = {.type = TYPE_CHAR, .ptr = NULL}; + static Type type_any = {.type = TYPE_ANY, .ptr = NULL}; if (type == TYPE_INT) return &type_int; + if (type == TYPE_CHAR) return &type_char; + if (type == TYPE_ANY) return &type_any; Type *self = calloc(sizeof(Type), 1); self->type = type; return self; } +bool is_string_type(Type *type) +{ + return type + && type->type == TYPE_PTR + && type->ptr->type == TYPE_CHAR; +} + static char *data_type_to_str(DataType type) { switch (type) @@ -46,6 +63,7 @@ static char *data_type_to_str(DataType type) case TYPE_INT: return "int"; case TYPE_PTR: return "*"; case TYPE_ARRAY: return "array"; + case TYPE_CHAR: return "char"; default: assert(false && "Unreachable"); } } @@ -115,18 +133,14 @@ Node *handle_binary_expr_types(Node *node, Token *token) // Pointer arithmetic! 
Node *mul = Node_new(OP_MUL); mul->binary.left = node->binary.right; - mul->binary.right = Node_new(AST_LITERAL); - mul->binary.right->literal.type = type_new(TYPE_INT); - mul->binary.right->literal.as_int = size_for_type(left->ptr); + mul->binary.right = Node_from_int_literal(size_for_type(left->ptr)); node->binary.right = mul; } else if (left->type == TYPE_INT && right->type == TYPE_PTR) { node->expr_type = right; // Pointer arithmetic! Node *mul = Node_new(OP_MUL); mul->binary.left = node->binary.left; - mul->binary.right = Node_new(AST_LITERAL); - mul->binary.right->literal.type = type_new(TYPE_INT); - mul->binary.right->literal.as_int = size_for_type(right->ptr); + mul->binary.right = Node_from_int_literal(size_for_type(left->ptr)); node->binary.left = mul; } else { die_location(token->loc, "Cannot add non-integer types"); diff --git a/src/types.h b/src/types.h index 8bbf4e3..7aadd8d 100644 --- a/src/types.h +++ b/src/types.h @@ -5,7 +5,9 @@ typedef enum { TYPE_NONE, + TYPE_ANY, // This is a hack for builtins till we can cast types TYPE_INT, + TYPE_CHAR, TYPE_PTR, TYPE_ARRAY, } DataType; @@ -18,9 +20,11 @@ typedef struct data_type_node { Type *type_new(DataType type); i64 size_for_type(Type *type); -bool type_equals(Type *a, Type *b); char *type_to_str(Type *type); +bool type_equals(Type *a, Type *b); +bool is_string_type(Type *type); + // Type checking / casting expressions to right types typedef struct ast_node Node; |