diff options
| author | Mustafa Quraish <[email protected]> | 2022-01-24 11:41:31 -0500 |
|---|---|---|
| committer | Mustafa Quraish <[email protected]> | 2022-01-24 11:41:31 -0500 |
| commit | 8fdfe1150a2155da5ea9d458adc347c4a6de5ce1 (patch) | |
| tree | e0383491fac4c2d9f65799270fba4e596f8852aa | |
| download | cup-8fdfe1150a2155da5ea9d458adc347c4a6de5ce1.tar.xz cup-8fdfe1150a2155da5ea9d458adc347c4a6de5ce1.zip | |
Initial commit + start of lexer
| -rw-r--r-- | .gitignore | 5 | ||||
| -rw-r--r-- | README.md | 6 | ||||
| -rwxr-xr-x | compile.sh | 10 | ||||
| -rw-r--r-- | cup/lexer.c | 190 | ||||
| -rw-r--r-- | cup/lexer.h | 20 | ||||
| -rw-r--r-- | cup/main.c | 30 | ||||
| -rw-r--r-- | cup/tokens.c | 85 | ||||
| -rw-r--r-- | cup/tokens.h | 70 | ||||
| -rw-r--r-- | cup/utils.c | 24 | ||||
| -rw-r--r-- | cup/utils.h | 6 | ||||
| -rw-r--r-- | examples/return-0.cup | 3 |
11 files changed, 449 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..86a05e7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +cupcc +*.dSYM +*.vscode +*.idea +*.DS_Store
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f315b41 --- /dev/null +++ b/README.md @@ -0,0 +1,6 @@ +# CUP: C U, Python! + +A badly named, in-progress programming language just to learn how these things work. +Wait, doesn't everyone write a compiler when they're bored? + +Why am I writing this in C? Good question.
\ No newline at end of file
diff --git a/compile.sh b/compile.sh
new file mode 100755
index 0000000..faeaf52
--- /dev/null
+++ b/compile.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+CC=gcc
+CFLAGS="-Wall -Wextra -Werror -ggdb3"
+SRCS=cup/*.c
+
+set -xe
+
+$CC $CFLAGS $SRCS -o cupcc
+./cupcc "$@" # quoted so arguments containing spaces reach cupcc intact
diff --git a/cup/lexer.c b/cup/lexer.c
new file mode 100644
index 0000000..1109294
--- /dev/null
+++ b/cup/lexer.c
@@ -0,0 +1,247 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+/*
+ * <ctype.h> classifiers require an argument representable as unsigned char
+ * (or EOF); passing a negative plain char is undefined behaviour. These
+ * wrappers do the cast once so call sites cannot forget it.
+ */
+static bool is_space(char c) { return isspace((unsigned char)c) != 0; }
+static bool is_digit(char c) { return isdigit((unsigned char)c) != 0; }
+static bool is_ident_start(char c) { return c == '_' || isalpha((unsigned char)c) != 0; }
+static bool is_ident_char(char c) { return c == '_' || isalnum((unsigned char)c) != 0; }
+
+/* Creates a lexer over `src` (`len` bytes). The buffer is borrowed, not copied. */
+Lexer Lexer_new(char *filename, char *src, i64 len)
+{
+    Lexer self = {0};
+    self.src = src;
+    self.len = len;
+    self.filename = filename;
+    return self;
+}
+
+/* Returns the lexer's current (zero-based) line/column as a Location. */
+Location Lexer_loc(Lexer *lexer)
+{
+    Location loc = {0};
+    loc.filename = lexer->filename;
+    loc.line = lexer->line;
+    loc.col = lexer->col;
+    return loc;
+}
+
+/* Consumes one byte, starting a new line when that byte is '\n'. */
+static void Lexer_step(Lexer *lexer)
+{
+    if (lexer->src[lexer->pos] == '\n') {
+        lexer->line++;
+        lexer->col = 0;
+    } else {
+        lexer->col++;
+    }
+    lexer->pos++;
+}
+
+/* Consumes whitespace up to the next significant byte or end of input. */
+static void Lexer_skip_whitespace(Lexer *lexer)
+{
+    while (lexer->pos < lexer->len && is_space(lexer->src[lexer->pos]))
+        Lexer_step(lexer);
+}
+
+/* Skips whitespace and reports whether any input remains to be tokenized. */
+bool Lexer_has_more(Lexer *lexer)
+{
+    Lexer_skip_whitespace(lexer);
+    return lexer->pos < lexer->len;
+}
+
+/*
+ * Returns true when the input at the current position is the keyword `str`:
+ * the bytes match and are not followed by another identifier character.
+ */
+static bool Lexer_starts_with(Lexer *lexer, char *str)
+{
+    i64 len = strlen(str);
+    if (lexer->len - lexer->pos < len)
+        return false;
+    if (memcmp(lexer->src + lexer->pos, str, (size_t)len) != 0)
+        return false;
+    i64 end_pos = lexer->pos + len;
+    if (end_pos == lexer->len)
+        return true;
+    return !is_ident_char(lexer->src[end_pos]);
+}
+
+/* Consumes `amount` bytes that are known not to contain a newline. */
+static void advance(Lexer *lexer, i64 amount)
+{
+    lexer->pos += amount;
+    lexer->col += amount;
+}
+
+/* Returns the byte `amount` positions ahead, or '\0' past the end of input. */
+static char peek(Lexer *lexer, int amount)
+{
+    if (lexer->pos + amount >= lexer->len)
+        return '\0';
+    return lexer->src[lexer->pos + amount];
+}
+
+/* Builds a payload-free token at the current location, then consumes `inc_amount` bytes. */
+static Token Lexer_make_token(Lexer *lexer, TokenType type, int inc_amount)
+{
+    Token token = {0};
+    token.type = type;
+    token.loc = Lexer_loc(lexer);
+    advance(lexer, inc_amount);
+    return token;
+}
+
+/* Returns a heap-allocated, NUL-terminated copy of the `len` bytes at `start`. */
+static char *copy_text(const char *start, i64 len)
+{
+    char *str = calloc((size_t)len + 1, 1);
+    if (str == NULL)
+        die("ERROR: Out of memory while lexing\n");
+    memcpy(str, start, (size_t)len);
+    return str;
+}
+
+/* Lexes a run of decimal digits into a TOKEN_INTLIT. */
+static Token Lexer_lex_int(Lexer *lexer)
+{
+    // TODO: Parse hex and octal numbers
+    Location loc = Lexer_loc(lexer);
+    i64 value = 0;
+    i64 pos = lexer->pos;
+    while (pos < lexer->len && is_digit(lexer->src[pos])) {
+        int digit = lexer->src[pos] - '0';
+        // Accumulate by hand: atoi() only yields an int and is undefined on
+        // overflow, whereas the token payload is a full i64.
+        if (value > (INT64_MAX - digit) / 10)
+            die_location(loc, ": ERROR: Integer literal does not fit in 64 bits.\n");
+        value = value * 10 + digit;
+        pos++;
+    }
+    advance(lexer, pos - lexer->pos);
+    return Token_from_int(value, loc);
+}
+
+/* Lexes a letter or '_' followed by letters, digits or '_' into a TOKEN_IDENTIFIER. */
+static Token Lexer_lex_identifier(Lexer *lexer)
+{
+    Location loc = Lexer_loc(lexer);
+    i64 pos = lexer->pos;
+    while (pos < lexer->len && is_ident_char(lexer->src[pos]))
+        pos++;
+    char *str = copy_text(lexer->src + lexer->pos, pos - lexer->pos);
+    advance(lexer, pos - lexer->pos);
+    return Token_from_identifier(str, loc);
+}
+
+/* Lexes a double-quoted string into a TOKEN_STRINGLIT holding the text between the quotes. */
+static Token Lexer_lex_string(Lexer *lexer)
+{
+    // TODO: Handle escapes
+    Location loc = Lexer_loc(lexer);
+    i64 close = lexer->pos + 1;
+    while (close < lexer->len && lexer->src[close] != '"')
+        close++;
+    if (close == lexer->len)
+        die_location(loc, ": ERROR: Reached end-of-file while parsing string literal beginning here.\n");
+    // Careful with indexing here, because we want to skip opening and closing quotes
+    char *str = copy_text(lexer->src + lexer->pos + 1, close - lexer->pos - 1);
+    // Step byte by byte so a newline inside the literal still bumps the line counter.
+    while (lexer->pos <= close)
+        Lexer_step(lexer);
+    return Token_from_string(str, loc);
+}
+
+/*
+ * Returns the next token, or a TOKEN_EOF token once the input is exhausted.
+ * Identifier and string tokens carry a heap-allocated string that the caller
+ * is responsible for freeing.
+ */
+Token Lexer_get_next_token(Lexer *lexer)
+{
+    while (Lexer_has_more(lexer)) {
+        char c = lexer->src[lexer->pos];
+        switch (c)
+        {
+        case '(': return Lexer_make_token(lexer, TOKEN_OPEN_PAREN, 1);
+        case ')': return Lexer_make_token(lexer, TOKEN_CLOSE_PAREN, 1);
+        case '{': return Lexer_make_token(lexer, TOKEN_OPEN_BRACE, 1);
+        case '}': return Lexer_make_token(lexer, TOKEN_CLOSE_BRACE, 1);
+        case ';': return Lexer_make_token(lexer, TOKEN_SEMICOLON, 1);
+        case ':': return Lexer_make_token(lexer, TOKEN_COLON, 1);
+        case '&': return Lexer_make_token(lexer, TOKEN_AMPERSAND, 1);
+
+        case '<': {
+            if (peek(lexer, 1) == '=')
+                return Lexer_make_token(lexer, TOKEN_LEQ, 2);
+            return Lexer_make_token(lexer, TOKEN_LT, 1);
+        }
+
+        case '>': {
+            if (peek(lexer, 1) == '=')
+                return Lexer_make_token(lexer, TOKEN_GEQ, 2);
+            return Lexer_make_token(lexer, TOKEN_GT, 1);
+        }
+
+        case '=': {
+            if (peek(lexer, 1) == '=')
+                return Lexer_make_token(lexer, TOKEN_EQ, 2);
+            return Lexer_make_token(lexer, TOKEN_ASSIGN, 1);
+        }
+
+        case '+': {
+            if (peek(lexer, 1) == '+')
+                return Lexer_make_token(lexer, TOKEN_PLUSPLUS, 2);
+            return Lexer_make_token(lexer, TOKEN_PLUS, 1);
+        }
+
+        case '-': {
+            if (peek(lexer, 1) == '-')
+                return Lexer_make_token(lexer, TOKEN_MINUSMINUS, 2);
+            return Lexer_make_token(lexer, TOKEN_MINUS, 1);
+        }
+
+        case '*': return Lexer_make_token(lexer, TOKEN_STAR, 1);
+        case '/': return Lexer_make_token(lexer, TOKEN_SLASH, 1);
+        case '%': return Lexer_make_token(lexer, TOKEN_PERCENT, 1);
+
+        default: {
+            if (Lexer_starts_with(lexer, "fn")) return Lexer_make_token(lexer, TOKEN_FN, 2);
+            if (Lexer_starts_with(lexer, "return")) return Lexer_make_token(lexer, TOKEN_RETURN, 6);
+            if (Lexer_starts_with(lexer, "int")) return Lexer_make_token(lexer, TOKEN_INT, 3);
+
+            if (is_digit(c)) return Lexer_lex_int(lexer);
+            if (is_ident_start(c)) return Lexer_lex_identifier(lexer);
+            if (c == '"') return Lexer_lex_string(lexer);
+
+            printf("Shouldn't have gotten here... char is '%c'\n", c);
+            advance(lexer, 1);
+        }
+        }
+    }
+
+    return Token_from_type(TOKEN_EOF, Lexer_loc(lexer));
+}
+
+/*
+ * Returns the next token without consuming it. The entire lexer state is
+ * restored (not just `pos`) so line/col stay in step with the read position.
+ */
+Token Lexer_peek_next_token(Lexer *lexer)
+{
+    Lexer saved = *lexer;
+    Token token = Lexer_get_next_token(lexer);
+    *lexer = saved;
+    return token;
+}
\ No newline at end of file
diff --git a/cup/lexer.h b/cup/lexer.h
new file mode 100644
index 0000000..c1adb85
--- /dev/null
+++ b/cup/lexer.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "tokens.h"
+#include <stdbool.h>
+
+/* Cursor over an in-memory source buffer. */
+typedef struct {
+    char *src;      // source text; the lexer stores this pointer, it does not copy
+    i64 len;        // number of bytes in `src`
+    i64 pos;        // index of the next byte to read
+
+    char *filename; // name reported in Locations
+    i64 line;       // zero-based line of `pos`
+    i64 col;        // zero-based column of `pos`
+} Lexer;
+
+/* Creates a lexer positioned at the start of `src`. */
+Lexer Lexer_new(char *filename, char *src, i64 len);
+/* Returns the lexer's current position as a Location. */
+Location Lexer_loc(Lexer *lexer);
+
+/* Skips leading whitespace and returns true if any input remains. */
+bool Lexer_has_more(Lexer *lexer);
+
+/* Returns the next token and consumes it; yields TOKEN_EOF at end of input. */
+Token Lexer_get_next_token(Lexer *lexer);
+/* Returns the next token and rewinds the read position so it is not consumed. */
+Token Lexer_peek_next_token(Lexer *lexer);
\ No newline at end of file
diff --git a/cup/main.c b/cup/main.c
new file mode 100644
index 0000000..6a521b0
--- /dev/null
+++ b/cup/main.c
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "lexer.h"
+#include "unistd.h"
+
+/*
+ * Reads the entire file at `filename` into a newly allocated, NUL-terminated
+ * buffer and stores the number of bytes read in `*out_len`.
+ * Returns NULL after printing a diagnostic to stderr if anything fails.
+ * The caller owns (and must free) the returned buffer.
+ */
+static char *read_file(const char *filename, long *out_len)
+{
+    FILE *fp = fopen(filename, "r");
+    if (fp == NULL) {
+        perror(filename);
+        return NULL;
+    }
+
+    char *source = NULL;
+    long fsize = -1;
+    if (fseek(fp, 0, SEEK_END) == 0)
+        fsize = ftell(fp);
+
+    if (fsize < 0 || fseek(fp, 0, SEEK_SET) != 0) {
+        perror(filename);
+    } else if ((source = malloc((size_t)fsize + 1)) == NULL) {
+        perror("malloc");
+    } else {
+        // Trust the byte count fread() reports rather than the ftell() size.
+        size_t nread = fread(source, 1, (size_t)fsize, fp);
+        if (ferror(fp)) {
+            perror(filename);
+            free(source);
+            source = NULL;
+        } else {
+            source[nread] = '\0';
+            *out_len = (long)nread;
+        }
+    }
+
+    fclose(fp);
+    return source;
+}
+
+int main(int argc, char **argv)
+{
+    char *filename = argc > 1 ? argv[1] : "./examples/return-0.cup";
+
+    // Read entire file into memory
+    long fsize = 0;
+    char *source = read_file(filename, &fsize);
+    if (source == NULL)
+        return 1;
+
+    // Lexer
+    Lexer lexer = Lexer_new(filename, source, fsize);
+    Token token;
+    while ((token = Lexer_get_next_token(&lexer)).type != TOKEN_EOF) {
+        Token_print(stdout, &token);
+        printf("\n");
+        // The lexer heap-allocates the text of identifier and string tokens.
+        if (token.type == TOKEN_IDENTIFIER || token.type == TOKEN_STRINGLIT)
+            free(token.value.as_string);
+    }
+
+    free(source);
+    return 0;
+}
\ No newline at end of file
diff --git a/cup/tokens.c b/cup/tokens.c
new file mode 100644
index 0000000..95a0700
--- /dev/null
+++ b/cup/tokens.c
@@ -0,0 +1,104 @@
+#include "tokens.h"
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+/* Builds a token that carries no payload (punctuation, keywords, EOF). */
+Token Token_from_type(TokenType type, Location loc)
+{
+    Token token = {0};
+    token.type = type;
+    token.loc = loc;
+    return token;
+}
+
+/* Builds a TOKEN_INTLIT whose payload is `value`. */
+Token Token_from_int(i64 value, Location loc)
+{
+    Token token = {0};
+    token.type = TOKEN_INTLIT;
+    token.value.as_int = value;
+    token.loc = loc;
+    return token;
+}
+
+/* Builds a TOKEN_STRINGLIT; the token stores the `value` pointer, not a copy. */
+Token Token_from_string(char *value, Location loc)
+{
+    Token token = {0};
+    token.type = TOKEN_STRINGLIT;
+    token.value.as_string = value;
+    token.loc = loc;
+    return token;
+}
+
+/* Builds a TOKEN_IDENTIFIER; the token stores the `value` pointer, not a copy. */
+Token Token_from_identifier(char *value, Location loc)
+{
+    Token token = {0};
+    token.type = TOKEN_IDENTIFIER;
+    token.value.as_string = value;
+    token.loc = loc;
+    return token;
+}
+
+/* Writes `loc` to `f` as "file:line:col", adding 1 to the stored line and col. */
+void Location_print(FILE *f, Location loc)
+{
+    fprintf(f, "%s:%d:%d", loc.filename, loc.line+1, loc.col+1);
+}
+
+/*
+ * Writes "<location>: <token text>" to `f`. Every TokenType enumerator has a
+ * case here; the default branch asserts so that a newly added enumerator
+ * without one is caught in debug builds.
+ */
+void Token_print(FILE *f, Token *token)
+{
+    Location_print(f, token->loc);
+    fprintf(f, ": ");
+    switch (token->type)
+    {
+    case TOKEN_OPEN_PAREN: fprintf(f, "("); break;
+    case TOKEN_CLOSE_PAREN: fprintf(f, ")"); break;
+    case TOKEN_OPEN_BRACE: fprintf(f, "{"); break;
+    case TOKEN_CLOSE_BRACE: fprintf(f, "}"); break;
+
+    case TOKEN_LT: fprintf(f, "<"); break;
+    case TOKEN_GT: fprintf(f, ">"); break;
+    case TOKEN_EQ: fprintf(f, "=="); break;
+    case TOKEN_NEQ: fprintf(f, "!="); break;
+    case TOKEN_LEQ: fprintf(f, "<="); break;
+    case TOKEN_GEQ: fprintf(f, ">="); break;
+
+    case TOKEN_ASSIGN: fprintf(f, "="); break; // =
+    case TOKEN_AMPERSAND: fprintf(f, "&"); break;
+    case TOKEN_BAR: fprintf(f, "|"); break;
+    case TOKEN_XOR: fprintf(f, "^"); break;
+    case TOKEN_AND: fprintf(f, "&&"); break;
+    case TOKEN_OR: fprintf(f, "||"); break;
+    case TOKEN_LSHIFT: fprintf(f, "<<"); break;
+    case TOKEN_RSHIFT: fprintf(f, ">>"); break;
+    case TOKEN_QUESTION: fprintf(f, "?"); break;
+
+    case TOKEN_PLUS: fprintf(f, "+"); break;
+    case TOKEN_MINUS: fprintf(f, "-"); break;
+    case TOKEN_STAR: fprintf(f, "*"); break;
+    case TOKEN_SLASH: fprintf(f, "/"); break;
+    case TOKEN_PERCENT: fprintf(f, "%%"); break;
+    case TOKEN_PLUSPLUS: fprintf(f, "++"); break;
+    case TOKEN_MINUSMINUS: fprintf(f, "--"); break;
+
+    case TOKEN_COLON: fprintf(f, ":"); break;
+    case TOKEN_SEMICOLON: fprintf(f, ";"); break;
+    case TOKEN_FN: fprintf(f, "<fn>"); break;
+    case TOKEN_RETURN: fprintf(f, "<return>"); break;
+    case TOKEN_INT: fprintf(f, "<int>"); break;
+    case TOKEN_EOF: fprintf(f, "<EOF>"); break;
+    // i64 is int64_t, whose printf conversion is PRId64 (it is not long long on every platform).
+    case TOKEN_INTLIT: fprintf(f, "%" PRId64, token->value.as_int); break;
+    case TOKEN_STRINGLIT: fprintf(f, "\"%s\"", token->value.as_string); break;
+    case TOKEN_IDENTIFIER: fprintf(f, "%s", token->value.as_string); break;
+    default: assert(false && "Unknown token type");
+    }
+}
\ No newline at end of file
diff --git a/cup/tokens.h b/cup/tokens.h
new file mode 100644
index 0000000..d087845
--- /dev/null
+++ b/cup/tokens.h
@@ -0,0 +1,70 @@
+#pragma once
+// Token kinds, source locations and token values shared by the lexer and its helpers.
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+typedef int64_t i64; // signed 64-bit integer used for lengths, offsets and literal values
+
+typedef enum { // the comment on each enumerator shows the text it stands for
+    TOKEN_AMPERSAND,  // &
+    TOKEN_AND,        // &&
+    TOKEN_ASSIGN,     // =
+    TOKEN_BAR,        // |
+    TOKEN_CLOSE_BRACE, // }
+    TOKEN_CLOSE_PAREN, // )
+    TOKEN_COLON,      // :
+    TOKEN_EOF,        // EOF
+    TOKEN_EQ,         // ==
+    TOKEN_FN,         // fn
+    TOKEN_GEQ,        // >=
+    TOKEN_GT,         // >
+    TOKEN_IDENTIFIER, // identifier
+    TOKEN_INT,        // int
+    TOKEN_INTLIT,     // integer literal
+    TOKEN_LEQ,        // <=
+    TOKEN_LSHIFT,     // <<
+    TOKEN_LT,         // <
+    TOKEN_MINUS,      // -
+    TOKEN_MINUSMINUS, // --
+    TOKEN_NEQ,        // !=
+    TOKEN_OPEN_BRACE, // {
+    TOKEN_OPEN_PAREN, // (
+    TOKEN_OR,         // ||
+    TOKEN_PERCENT,    // %
+    TOKEN_PLUS,       // +
+    TOKEN_PLUSPLUS,   // ++
+    TOKEN_QUESTION,   // ?
+    TOKEN_RETURN,     // return
+    TOKEN_RSHIFT,     // >>
+    TOKEN_SEMICOLON,  // ;
+    TOKEN_SLASH,      // /
+    TOKEN_STAR,       // *
+    TOKEN_STRINGLIT,  // string literal
+    TOKEN_XOR,        // ^
+} TokenType;
+
+typedef struct { // a position in a source file
+    char *filename;
+    int line; // Location_print() adds 1 to this when printing
+    int col;  // Location_print() adds 1 to this when printing
+} Location;
+
+void Location_print(FILE *f, Location loc); // writes "filename:line:col" to `f`
+
+typedef struct { // a token: its kind, where it starts, and an optional payload
+    TokenType type;
+    Location loc;
+    union tokens {
+        i64 as_int;      // set by Token_from_int()
+        char *as_string; // set by Token_from_string() and Token_from_identifier()
+        char as_char;
+    } value;
+} Token;
+
+Token Token_from_type(TokenType type, Location loc); // no payload
+Token Token_from_int(i64 value, Location loc); // type is TOKEN_INTLIT
+Token Token_from_string(char *value, Location loc); // type is TOKEN_STRINGLIT; pointer is stored, not copied
+Token Token_from_identifier(char *value, Location loc); // type is TOKEN_IDENTIFIER; pointer is stored, not copied
+
+void Token_print(FILE *f, Token *token); // writes the location, then ": ", then the token's text
\ No newline at end of file
diff --git a/cup/utils.c b/cup/utils.c
new file mode 100644
index 0000000..3a586cd
--- /dev/null
+++ b/cup/utils.c
@@ -0,0 +1,36 @@
+#include "utils.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Formats `fmt` with an already-started argument list and writes it to stderr. */
+static void write_message(const char *fmt, va_list ap)
+{
+    vfprintf(stderr, fmt, ap);
+}
+
+/* Prints a printf-style message to stderr, then exits with status 1. */
+void die(const char *fmt, ...)
+{
+    va_list ap;
+
+    va_start(ap, fmt);
+    write_message(fmt, ap);
+    va_end(ap);
+
+    exit(1);
+}
+
+/* Like die(), but writes `loc` (via Location_print) immediately before the message. */
+void die_location(Location loc, const char *fmt, ...)
+{
+    va_list ap;
+
+    Location_print(stderr, loc);
+    va_start(ap, fmt);
+    write_message(fmt, ap);
+    va_end(ap);
+
+    exit(1);
+}
\ No newline at end of file
diff --git a/cup/utils.h b/cup/utils.h
new file mode 100644
index 0000000..df6f56a
--- /dev/null
+++ b/cup/utils.h
@@ -0,0 +1,6 @@
+#pragma once
+// Fatal-error helpers: each prints a printf-style message to stderr and calls exit(1).
+#include "tokens.h"
+
+void die(const char *fmt, ...);
+void die_location(Location loc, const char *fmt, ...); // prints `loc` before the message
\ No newline at end of file diff --git a/examples/return-0.cup b/examples/return-0.cup new file mode 100644 index 0000000..5cc8b30 --- /dev/null +++ b/examples/return-0.cup @@ -0,0 +1,3 @@ +fn main(): int { + return 0; +}
\ No newline at end of file |