aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Mustafa Quraish <[email protected]> 2022-02-02 23:49:46 -0500
committer: Mustafa Quraish <[email protected]> 2022-02-02 23:49:46 -0500
commit: 3f083b4286d8e2ed990d72f61febb7f5f4f96626 (patch)
tree: 553680d0ed918853d06e7843f6c40bcb54e911fa /src
parent: Remove default initialization to 0 for variable declarations (diff)
download: cup-3f083b4286d8e2ed990d72f61febb7f5f4f96626.tar.xz
cup-3f083b4286d8e2ed990d72f61febb7f5f4f96626.zip
Add support for `char` type + string/char literals
This commit does a few things in one go:
- Add support for a `char` type, plus some changes to support the new size.
- Add support for character literals. We need some escaping here to be able to use `\n`, `\0`, etc.
- Add support for string literals. These are all stored in the `.data` section. Fortunately, NASM already handles the escape characters.
- Fix some bugs with code generation, specifically using `movsx` to sign-extend the smaller types into 64-bit registers.
Diffstat (limited to 'src')
-rw-r--r-- src/ast.h 2
-rw-r--r-- src/generator.c 54
-rw-r--r-- src/lexer.c 27
-rw-r--r-- src/parser.c 30
-rw-r--r-- src/tokens.c 22
-rw-r--r-- src/tokens.h 5
-rw-r--r-- src/types.c 28
-rw-r--r-- src/types.h 6
8 files changed, 153 insertions, 21 deletions
diff --git a/src/ast.h b/src/ast.h
index 2056505..d1e904c 100644
--- a/src/ast.h
+++ b/src/ast.h
@@ -107,6 +107,8 @@ typedef struct ast_node {
Type *type;
union {
int as_int;
+ char as_char;
+ char *as_string;
};
} literal;
diff --git a/src/generator.c b/src/generator.c
index 4814f4f..f422657 100644
--- a/src/generator.c
+++ b/src/generator.c
@@ -24,7 +24,17 @@ void make_syscall(i64 syscall_no, FILE *out) {
fprintf(out, " syscall\n");
}
-char *specifier_for_type(Type *type) {
+// Name of the part of `rax` whose width equals the size of `type`
+// (1, 2, 4 or 8 bytes). Any other size trips the assertion.
+static char *subregister_for_type(Type *type) {
+    i64 width = size_for_type(type);
+    if (width == 1) return "al";
+    if (width == 2) return "ax";
+    if (width == 4) return "eax";
+    if (width == 8) return "rax";
+    assert(false && "Unreachable");
+}
+
+static char *specifier_for_type(Type *type) {
switch (size_for_type(type)) {
case 1: return "byte";
case 2: return "word";
@@ -34,6 +44,7 @@ char *specifier_for_type(Type *type) {
}
}
+
void generate_expr_into_rax(Node *expr, FILE *out);
void generate_lvalue_into_rax(Node *node, FILE *out)
@@ -70,21 +81,46 @@ void generate_func_call(Node *node, FILE *out)
fprintf(out, " add rsp, %lld\n", total_size);
}
+// Every string literal seen during code generation, in order of appearance.
+// Entry `i` is emitted under the label `global_string_<i>` by generate_asm().
+char **all_string_literals = NULL;
+i64 num_string_literals = 0;
+
+// Emit code that loads the value of the literal `node` into `rax`.
+//  - int literal:     the value is moved in as an immediate.
+//  - char literal:    the character code is moved in as an immediate.
+//  - pointer literal: treated as a string; it is appended to the global
+//                     string table and the address of its label is loaded.
+// Any other literal type trips an assertion.
+void generate_literal_into_rax(Node *node, FILE *out)
+{
+    assert(node->type == AST_LITERAL);
+    if (node->expr_type->type == TYPE_INT) {
+        fprintf(out, " mov rax, %d\n", node->literal.as_int);
+    } else if (node->expr_type->type == TYPE_CHAR) {
+        fprintf(out, " mov rax, %d\n", (int)node->literal.as_char);
+    } else if (node->expr_type->type == TYPE_PTR) {
+        // Add string to global string table
+        char *str = node->literal.as_string;
+        // TODO: Use a hash table here
+        // Grow through a temporary so that a failed realloc neither loses
+        // the existing table nor leads to a write through NULL.
+        char **grown = realloc(all_string_literals, sizeof *grown * (num_string_literals + 1));
+        if (grown == NULL) {
+            fprintf(stderr, "ERROR: out of memory while recording string literal\n");
+            exit(1);
+        }
+        all_string_literals = grown;
+        all_string_literals[num_string_literals] = str;
+        fprintf(out, " mov rax, global_string_%lld\n", num_string_literals);
+        num_string_literals++;
+    } else {
+        assert(false && "Unknown literal type in generate_literal_into_rax");
+    }
+}
+
// The evaluated expression is stored into `rax`
void generate_expr_into_rax(Node *expr, FILE *out)
{
// TODO: Different sized output for different types?
if (expr->type == AST_LITERAL) {
- // TODO: More literal types
- assert(expr->literal.type->type == TYPE_INT);
- fprintf(out, " mov rax, %d\n", expr->literal.as_int);
+ generate_literal_into_rax(expr, out);
} else if (expr->type == AST_FUNCCALL) {
generate_func_call(expr, out);
} else if (is_lvalue(expr->type)) {
generate_lvalue_into_rax(expr, out);
- fprintf(out, " mov rax, [rax]\n");
+ if (size_for_type(expr->expr_type) == 8) {
+ fprintf(out, " mov rax, [rax]\n");
+ } else {
+ fprintf(out, " movsx rax, %s [rax]\n", specifier_for_type(expr->expr_type));
+ }
} else if (expr->type == OP_ADDROF) {
generate_lvalue_into_rax(expr->unary_expr, out);
@@ -95,7 +131,7 @@ void generate_expr_into_rax(Node *expr, FILE *out)
fprintf(out, " push rax\n");
generate_expr_into_rax(expr->assign.value, out);
fprintf(out, " pop rbx\n");
- fprintf(out, " mov %s [rbx], rax\n", specifier_for_type(var->expr_type));
+ fprintf(out, " mov [rbx], %s\n", subregister_for_type(var->expr_type));
} else if (expr->type == OP_NEG) {
generate_expr_into_rax(expr->unary_expr, out);
@@ -420,6 +456,12 @@ void generate_asm(Node *root, FILE *out)
fprintf(out, "section .bss\n");
fprintf(out, " global_vars: resb %lld\n", root->block.locals_size);
+
+ // Global strings
+ fprintf(out, "section .data\n");
+ for (i64 i = 0; i < num_string_literals; i++) {
+ fprintf(out, " global_string_%lld: db `%s`, 0\n", i, all_string_literals[i]);
+ }
}
void generate_builtins(FILE *out)
diff --git a/src/lexer.c b/src/lexer.c
index 3e71343..d1201b2 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -4,6 +4,7 @@
#include <stdlib.h>
#include <stdio.h>
#include "utils.h"
+#include <assert.h>
Lexer *Lexer_new(char *filename, char *src, i64 len)
{
@@ -98,6 +99,18 @@ static Token Lexer_make_token(Lexer *lexer, TokenType type, int inc_amount)
return token;
}
+// Translate the character following a backslash in a character literal
+// into the character it denotes (e.g. 'n' -> newline, '0' -> NUL).
+// An unrecognised escape trips an assertion.
+static char get_escaped(char c) {
+    switch (c)
+    {
+    case 'n': return '\n';
+    case 'r': return '\r';
+    case 't': return '\t';
+    case '\\': return '\\';
+    case '0': return '\0';
+    // Without these a quote character could not be written as a literal.
+    case '\'': return '\'';
+    case '"': return '"';
+    default: break;
+    }
+    assert(false && "Unknown escape sequence");
+    // Only reached when assertions are compiled out: fall back to the raw
+    // character rather than running off the end of a non-void function.
+    return c;
+}
+
Token Lexer_next(Lexer *lexer)
{
while (lexer->pos < lexer->len) {
@@ -227,6 +240,20 @@ Token Lexer_next(Lexer *lexer)
return token;
}
+ if (lexer->src[lexer->pos] == '\'') {
+ i64 pos = lexer->pos + 1;
+ // TODO: Handle malformed / incomplete literals
+ // TODO: Handle escapes
+ char c = lexer->src[pos];
+ if (c == '\\') {
+ pos++;
+ c = get_escaped(lexer->src[pos]);
+ }
+ Token token = Token_from_char(c, Lexer_loc(lexer));
+ advance(lexer, pos - lexer->pos + 2);
+ return token;
+ }
+
die_location(Lexer_loc(lexer), ": ERROR: Unexpected character '%c'\n", lexer->src[lexer->pos]);
advance(lexer, 1);
diff --git a/src/parser.c b/src/parser.c
index 435191d..3ee8abf 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -74,19 +74,20 @@ Node *builtin_putc;
void initialize_builtins()
{
+ // FIXME: The `TYPE_ANY` is a hack
builtin_print = Node_new(AST_BUILTIN);
builtin_print->func.name = "print";
builtin_print->func.return_type = type_new(TYPE_INT);
builtin_print->func.num_args = 1;
builtin_print->func.args = (Variable *)calloc(sizeof(Variable), 1);
- builtin_print->func.args[0] = (Variable){"val", type_new(TYPE_INT), 0};
+ builtin_print->func.args[0] = (Variable){"val", type_new(TYPE_ANY), 0};
builtin_putc = Node_new(AST_BUILTIN);
builtin_putc->func.name = "putc";
builtin_putc->func.return_type = type_new(TYPE_INT);
builtin_putc->func.num_args = 1;
builtin_putc->func.args = (Variable *)calloc(sizeof(Variable), 2);
- builtin_putc->func.args[0] = (Variable){"arg", type_new(TYPE_INT), 0};
+ builtin_putc->func.args[0] = (Variable){"arg", type_new(TYPE_ANY), 0};
}
Node *find_builtin_function(Token *token)
@@ -184,10 +185,13 @@ Type *parse_type(Lexer *lexer)
Type *type;
Token token = Lexer_peek(lexer);
if (token.type == TOKEN_INT) {
+ Lexer_next(lexer);
type = type_new(TYPE_INT);
+ } else if (token.type == TOKEN_CHAR) {
Lexer_next(lexer);
+ type = type_new(TYPE_CHAR);
} else {
- type = type_new(TYPE_NONE);
+ die_location(token.loc, "Unexpected type found: %s", token_type_to_str(token.type));
}
for (;;) {
@@ -218,10 +222,22 @@ Type *parse_type(Lexer *lexer)
Node *parse_literal(Lexer *lexer)
{
Node *node = Node_new(AST_LITERAL);
- Token token = assert_token(Lexer_next(lexer), TOKEN_INTLIT);
- node->literal.type = type_new(TYPE_INT);
+
+ Token token = Lexer_next(lexer);
+ if (token.type == TOKEN_INTLIT) {
+ node->literal.type = type_new(TYPE_INT);
+ node->literal.as_int = token.value.as_int;
+ } else if (token.type == TOKEN_STRINGLIT) {
+ node->literal.type = type_new(TYPE_PTR);
+ node->literal.type->ptr = type_new(TYPE_CHAR);
+ node->literal.as_string = token.value.as_string;
+ } else if (token.type == TOKEN_CHARLIT) {
+ node->literal.type = type_new(TYPE_CHAR);
+ node->literal.as_char = token.value.as_char;
+ } else {
+ assert(false && "Invalid literal type in parse_literal\n");
+ }
node->expr_type = node->literal.type;
- node->literal.as_int = token.value.as_int;
return node;
}
@@ -380,7 +396,7 @@ Node *parse_factor(Lexer *lexer)
Lexer_next(lexer);
expr = parse_expression(lexer);
assert_token(Lexer_next(lexer), TOKEN_CLOSE_PAREN);
- } else if (token.type == TOKEN_INTLIT) {
+ } else if (is_literal_token(token.type)) {
expr = parse_literal(lexer);
} else if (token.type == TOKEN_IDENTIFIER) {
expr = parse_identifier(lexer);
diff --git a/src/tokens.c b/src/tokens.c
index 2d99f71..c90cfd9 100644
--- a/src/tokens.c
+++ b/src/tokens.c
@@ -28,6 +28,15 @@ Token Token_from_string(char *value, Location loc)
return token;
}
+// Build a character-literal token carrying `value`, located at `loc`.
+// Every field not set here is zero.
+Token Token_from_char(char value, Location loc)
+{
+    Token result = {.type = TOKEN_CHARLIT, .loc = loc};
+    result.value.as_char = value;
+    return result;
+}
+
Token Token_from_identifier(char *value, Location loc)
{
Token token = {0};
@@ -68,4 +77,17 @@ void Token_print(FILE *f, Token *token)
}
fprintf(f, "%s", token_type_to_str(token->type));
+}
+
+// True when `type` is one of the literal token kinds:
+// integer, string or character literal.
+bool is_literal_token(TokenType type)
+{
+    return type == TOKEN_INTLIT
+        || type == TOKEN_STRINGLIT
+        || type == TOKEN_CHARLIT;
} \ No newline at end of file
diff --git a/src/tokens.h b/src/tokens.h
index f076b89..f02cfb0 100644
--- a/src/tokens.h
+++ b/src/tokens.h
@@ -8,6 +8,7 @@
F(TOKEN_AND, "&&") \
F(TOKEN_ASSIGN, "=") \
F(TOKEN_BAR, "|") \
+ F(TOKEN_CHARLIT, "char literal") \
F(TOKEN_CLOSE_BRACE, "}") \
F(TOKEN_CLOSE_BRACKET, "]") \
F(TOKEN_CLOSE_PAREN, ")") \
@@ -45,6 +46,7 @@
F(TOKEN_XOR, "^")
#define ENUM_KEYWORDS(F) \
+ F(TOKEN_CHAR, "char") \
F(TOKEN_ELSE, "else") \
F(TOKEN_DEFER, "defer") \
F(TOKEN_FN, "fn") \
@@ -83,9 +85,12 @@ typedef struct {
char *token_type_to_str(TokenType type);
+bool is_literal_token(TokenType type);
+
Token Token_from_type(TokenType type, Location loc);
Token Token_from_int(i64 value, Location loc);
Token Token_from_string(char *value, Location loc);
+Token Token_from_char(char value, Location loc);
Token Token_from_identifier(char *value, Location loc);
void Token_print(FILE *f, Token *token); \ No newline at end of file
diff --git a/src/types.c b/src/types.c
index 8876b22..9699f20 100644
--- a/src/types.c
+++ b/src/types.c
@@ -12,6 +12,8 @@ bool type_equals(Type *a, Type *b)
return true;
if (a == NULL || b == NULL)
return false;
+ if (a->type == TYPE_ANY || b->type == TYPE_ANY)
+ return true;
return a->type == b->type && type_equals(a->ptr, b->ptr);
}
@@ -21,8 +23,12 @@ i64 size_for_type(Type *type)
{
case TYPE_INT: return 8;
case TYPE_PTR: return 8;
+ case TYPE_CHAR: return 1;
case TYPE_ARRAY: return type->array_size * size_for_type(type->ptr);
- default: assert(false && "Unreachable type");
+ default: {
+ printf("Unknown type: %d\n", type->type);
+ assert(false && "Unreachable type");
+ }
}
}
@@ -31,13 +37,24 @@ Type *type_new(DataType type)
// For the core types, we don't need to allocate any memory, just
// return a pointer to a static instance.
static Type type_int = {.type = TYPE_INT, .ptr = NULL};
+ static Type type_char = {.type = TYPE_CHAR, .ptr = NULL};
+ static Type type_any = {.type = TYPE_ANY, .ptr = NULL};
if (type == TYPE_INT) return &type_int;
+ if (type == TYPE_CHAR) return &type_char;
+ if (type == TYPE_ANY) return &type_any;
Type *self = calloc(sizeof(Type), 1);
self->type = type;
return self;
}
+// Whether `type` is a pointer to `char`, the type given to string literals.
+// Returns false for NULL and for a pointer type whose pointee is not set.
+bool is_string_type(Type *type)
+{
+    if (type == NULL || type->type != TYPE_PTR) {
+        return false;
+    }
+    // type_new(TYPE_PTR) leaves `ptr` NULL until the caller fills it in,
+    // so the pointee must be checked before it is dereferenced.
+    return type->ptr != NULL && type->ptr->type == TYPE_CHAR;
+}
+
static char *data_type_to_str(DataType type)
{
switch (type)
@@ -46,6 +63,7 @@ static char *data_type_to_str(DataType type)
case TYPE_INT: return "int";
case TYPE_PTR: return "*";
case TYPE_ARRAY: return "array";
+ case TYPE_CHAR: return "char";
default: assert(false && "Unreachable");
}
}
@@ -115,18 +133,14 @@ Node *handle_binary_expr_types(Node *node, Token *token)
// Pointer arithmetic!
Node *mul = Node_new(OP_MUL);
mul->binary.left = node->binary.right;
- mul->binary.right = Node_new(AST_LITERAL);
- mul->binary.right->literal.type = type_new(TYPE_INT);
- mul->binary.right->literal.as_int = size_for_type(left->ptr);
+ mul->binary.right = Node_from_int_literal(size_for_type(left->ptr));
node->binary.right = mul;
} else if (left->type == TYPE_INT && right->type == TYPE_PTR) {
node->expr_type = right;
// Pointer arithmetic!
Node *mul = Node_new(OP_MUL);
mul->binary.left = node->binary.left;
- mul->binary.right = Node_new(AST_LITERAL);
- mul->binary.right->literal.type = type_new(TYPE_INT);
- mul->binary.right->literal.as_int = size_for_type(right->ptr);
+ mul->binary.right = Node_from_int_literal(size_for_type(left->ptr));
node->binary.left = mul;
} else {
die_location(token->loc, "Cannot add non-integer types");
diff --git a/src/types.h b/src/types.h
index 8bbf4e3..7aadd8d 100644
--- a/src/types.h
+++ b/src/types.h
@@ -5,7 +5,9 @@
typedef enum {
TYPE_NONE,
+ TYPE_ANY, // This is a hack for builtins till we can cast types
TYPE_INT,
+ TYPE_CHAR,
TYPE_PTR,
TYPE_ARRAY,
} DataType;
@@ -18,9 +20,11 @@ typedef struct data_type_node {
Type *type_new(DataType type);
i64 size_for_type(Type *type);
-bool type_equals(Type *a, Type *b);
char *type_to_str(Type *type);
+bool type_equals(Type *a, Type *b);
+bool is_string_type(Type *type);
+
// Type checking / casting expressions to right types
typedef struct ast_node Node;