aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMustafa Quraish <[email protected]>2022-01-24 11:41:31 -0500
committerMustafa Quraish <[email protected]>2022-01-24 11:41:31 -0500
commit8fdfe1150a2155da5ea9d458adc347c4a6de5ce1 (patch)
treee0383491fac4c2d9f65799270fba4e596f8852aa
downloadcup-8fdfe1150a2155da5ea9d458adc347c4a6de5ce1.tar.xz
cup-8fdfe1150a2155da5ea9d458adc347c4a6de5ce1.zip
Initial commit + start of lexer
-rw-r--r--.gitignore5
-rw-r--r--README.md6
-rwxr-xr-xcompile.sh10
-rw-r--r--cup/lexer.c190
-rw-r--r--cup/lexer.h20
-rw-r--r--cup/main.c30
-rw-r--r--cup/tokens.c85
-rw-r--r--cup/tokens.h70
-rw-r--r--cup/utils.c24
-rw-r--r--cup/utils.h6
-rw-r--r--examples/return-0.cup3
11 files changed, 449 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..86a05e7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+cupcc
+*.dSYM
+*.vscode
+*.idea
+*.DS_Store \ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f315b41
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# CUP: C U, Python!
+
+A badly named, in-progress programming language just to learn how these things work.
+Wait, doesn't everyone write a compiler when they're bored?
+
+Why am I writing this in C? Good question. \ No newline at end of file
diff --git a/compile.sh b/compile.sh
new file mode 100755
index 0000000..faeaf52
--- /dev/null
+++ b/compile.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+CC=gcc
+CFLAGS="-Wall -Wextra -Werror -ggdb3"
+SRCS=cup/*.c
+
+set -xe
+
+$CC $CFLAGS $SRCS -o cupcc
+./cupcc $@
diff --git a/cup/lexer.c b/cup/lexer.c
new file mode 100644
index 0000000..1109294
--- /dev/null
+++ b/cup/lexer.c
@@ -0,0 +1,190 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+/* Build a lexer positioned at the start of `src`.
+ * `src` holds `len` bytes of source text and `filename` is kept for
+ * locations; both pointers are stored, not copied. pos, line and col all
+ * start at zero. */
+Lexer Lexer_new(char *filename, char *src, i64 len)
+{
+  Lexer self = {
+    .src = src,
+    .len = len,
+    .filename = filename,
+  };
+  return self;
+}
+
+/* Snapshot the lexer's current filename, line and column as a Location. */
+Location Lexer_loc(Lexer *lexer)
+{
+  Location here = {
+    .filename = lexer->filename,
+    .line = lexer->line,
+    .col = lexer->col,
+  };
+  return here;
+}
+
+/* Advance past any run of whitespace at the current position, keeping the
+ * line/column counters in sync: a newline bumps the line and resets the
+ * column to 0, any other whitespace character moves the column by one. */
+static void Lexer_skip_whitespace(Lexer *lexer)
+{
+  while (lexer->pos < lexer->len) {
+    // <ctype.h> functions require a value representable as unsigned char
+    // (or EOF); a plain `char` holding a byte >= 0x80 may be negative.
+    unsigned char c = (unsigned char)lexer->src[lexer->pos];
+    if (!isspace(c))
+      break;
+    if (c == '\n') {
+      lexer->line++;
+      lexer->col = 0;
+    } else {
+      lexer->col++;
+    }
+    lexer->pos++;
+  }
+}
+
+/* Skip leading whitespace, then report whether any unread input remains. */
+bool Lexer_has_more(Lexer *lexer)
+{
+  Lexer_skip_whitespace(lexer);
+  bool exhausted = lexer->pos >= lexer->len;
+  return !exhausted;
+}
+
+/* Report whether the input at the current position is the whole word `str`:
+ * the bytes must match `str` exactly and must be followed either by the end
+ * of input or by a character that cannot continue an identifier (anything
+ * other than a letter, digit or '_'). Does not move the lexer. */
+static bool Lexer_starts_with(Lexer *lexer, char *str)
+{
+  i64 len = (i64)strlen(str);
+  if (lexer->len - lexer->pos < len)
+    return false;
+  if (memcmp(lexer->src + lexer->pos, str, (size_t)len) != 0)
+    return false;
+
+  i64 end_pos = lexer->pos + len;
+  if (end_pos == lexer->len)
+    return true;
+
+  // <ctype.h> functions require a value representable as unsigned char;
+  // passing a negative plain `char` is undefined behaviour.
+  unsigned char end_char = (unsigned char)lexer->src[end_pos];
+  return !(isalnum(end_char) || end_char == '_');
+}
+
+/* Consume `amount` characters: both the byte offset and the column move by
+ * `amount`; the line counter is left untouched. */
+static void advance(Lexer *lexer, i64 amount)
+{
+  lexer->col = lexer->col + amount;
+  lexer->pos = lexer->pos + amount;
+}
+
+/* Look at the character `amount` positions ahead of the current one without
+ * consuming anything. Returns '\0' when that position is at or past the end
+ * of the input. */
+static char peek(Lexer *lexer, int amount)
+{
+  i64 target = lexer->pos + amount;
+  return (target < lexer->len) ? lexer->src[target] : '\0';
+}
+
+/* Produce a payload-free token of kind `type` located at the lexer's current
+ * position, then consume `inc_amount` characters. The location is captured
+ * before advancing, so it points at the token's first character. */
+static Token Lexer_make_token(Lexer *lexer, TokenType type, int inc_amount)
+{
+  Token token = Token_from_type(type, Lexer_loc(lexer));
+  advance(lexer, inc_amount);
+  return token;
+}
+
+/* Lex an operator that is either the single current character or that
+ * character followed by `second`: yields `pair` (2 chars) or `single` (1). */
+static Token Lexer_make_op(Lexer *lexer, char second, TokenType pair, TokenType single)
+{
+  if (peek(lexer, 1) == second)
+    return Lexer_make_token(lexer, pair, 2);
+  return Lexer_make_token(lexer, single, 1);
+}
+
+/* Lex a run of decimal digits starting at the current position into a
+ * TOKEN_INTLIT. Dies with a located error if the value does not fit in i64. */
+static Token Lexer_lex_int(Lexer *lexer)
+{
+  // TODO: Parse hex and octal numbers
+  Location start = Lexer_loc(lexer);
+  i64 end = lexer->pos;
+  i64 value = 0;
+  while (end < lexer->len && isdigit((unsigned char)lexer->src[end])) {
+    int digit = lexer->src[end] - '0';
+    // Accumulate by hand rather than via atoi(): atoi() only yields an int
+    // and its behaviour on overflow is undefined.
+    if (value > (INT64_MAX - digit) / 10)
+      die_location(start, ": ERROR: Integer literal is too large.\n");
+    value = value * 10 + digit;
+    end++;
+  }
+  Token token = Token_from_int(value, start);
+  advance(lexer, end - lexer->pos);
+  return token;
+}
+
+/* Lex an identifier ([A-Za-z_][A-Za-z0-9_]*) starting at the current
+ * position. The token owns a freshly allocated, NUL-terminated copy of the
+ * name; the caller is responsible for freeing it. */
+static Token Lexer_lex_identifier(Lexer *lexer)
+{
+  Location start = Lexer_loc(lexer);
+  i64 end = lexer->pos;
+  while (end < lexer->len) {
+    unsigned char c = (unsigned char)lexer->src[end];
+    if (!(isalnum(c) || c == '_'))
+      break;
+    end++;
+  }
+  size_t name_len = (size_t)(end - lexer->pos);
+  char *name = calloc(name_len + 1, 1);
+  if (name == NULL)
+    die("ERROR: Out of memory while lexing identifier\n");
+  memcpy(name, lexer->src + lexer->pos, name_len);
+  Token token = Token_from_identifier(name, start);
+  advance(lexer, end - lexer->pos);
+  return token;
+}
+
+/* Lex a double-quoted string literal starting at the opening quote. The
+ * token owns a freshly allocated, NUL-terminated copy of the text between
+ * the quotes; the caller is responsible for freeing it. Dies with a located
+ * error if the closing quote is missing. */
+static Token Lexer_lex_string(Lexer *lexer)
+{
+  // TODO: Handle escapes
+  Location start = Lexer_loc(lexer);
+  i64 begin = lexer->pos + 1; // first character after the opening quote
+  i64 end = begin;            // will index the closing quote
+  while (end < lexer->len && lexer->src[end] != '"')
+    end++;
+  if (end == lexer->len)
+    die_location(start, ": ERROR: Reached end-of-file while parsing string literal beginning here.\n");
+
+  size_t text_len = (size_t)(end - begin);
+  char *text = calloc(text_len + 1, 1);
+  if (text == NULL)
+    die("ERROR: Out of memory while lexing string literal\n");
+  memcpy(text, lexer->src + begin, text_len);
+  Token token = Token_from_string(text, start);
+
+  // Consume both quotes and the text. Walk character by character so that a
+  // newline inside the literal keeps line/col correct for later tokens.
+  while (lexer->pos <= end) {
+    if (lexer->src[lexer->pos] == '\n') {
+      lexer->line++;
+      lexer->col = 0;
+      lexer->pos++;
+    } else {
+      advance(lexer, 1);
+    }
+  }
+  return token;
+}
+
+/* Consume and return the next token from the input.
+ * Spaces, tabs, carriage returns and newlines are skipped (newlines update
+ * the line/column counters). Returns a TOKEN_EOF token once the input is
+ * exhausted. Identifier and string-literal tokens carry a heap-allocated
+ * string that the caller must free. A character that starts no known token
+ * is reported on stdout and skipped. */
+Token Lexer_get_next_token(Lexer *lexer)
+{
+  while (lexer->pos < lexer->len) {
+    char c = lexer->src[lexer->pos];
+    switch (c)
+    {
+    case ' ': case '\t': case '\r': advance(lexer, 1); break;
+    case '\n': lexer->line++; lexer->col = 0; lexer->pos++; break;
+    case '(': return Lexer_make_token(lexer, TOKEN_OPEN_PAREN, 1);
+    case ')': return Lexer_make_token(lexer, TOKEN_CLOSE_PAREN, 1);
+    case '{': return Lexer_make_token(lexer, TOKEN_OPEN_BRACE, 1);
+    case '}': return Lexer_make_token(lexer, TOKEN_CLOSE_BRACE, 1);
+    case ';': return Lexer_make_token(lexer, TOKEN_SEMICOLON, 1);
+    case ':': return Lexer_make_token(lexer, TOKEN_COLON, 1);
+    case '&': return Lexer_make_token(lexer, TOKEN_AMPERSAND, 1);
+
+    case '<': return Lexer_make_op(lexer, '=', TOKEN_LEQ, TOKEN_LT);
+    case '>': return Lexer_make_op(lexer, '=', TOKEN_GEQ, TOKEN_GT);
+    case '=': return Lexer_make_op(lexer, '=', TOKEN_EQ, TOKEN_ASSIGN);
+    case '+': return Lexer_make_op(lexer, '+', TOKEN_PLUSPLUS, TOKEN_PLUS);
+    case '-': return Lexer_make_op(lexer, '-', TOKEN_MINUSMINUS, TOKEN_MINUS);
+
+    case '*': return Lexer_make_token(lexer, TOKEN_STAR, 1);
+    case '/': return Lexer_make_token(lexer, TOKEN_SLASH, 1);
+    case '%': return Lexer_make_token(lexer, TOKEN_PERCENT, 1);
+
+    default: {
+      // Keywords are tried before identifiers; Lexer_starts_with only
+      // matches whole words, so e.g. "fnord" is still an identifier.
+      if (Lexer_starts_with(lexer, "fn")) return Lexer_make_token(lexer, TOKEN_FN, 2);
+      if (Lexer_starts_with(lexer, "return")) return Lexer_make_token(lexer, TOKEN_RETURN, 6);
+      if (Lexer_starts_with(lexer, "int")) return Lexer_make_token(lexer, TOKEN_INT, 3);
+
+      // <ctype.h> functions need an unsigned char value, not a plain char.
+      unsigned char uc = (unsigned char)c;
+      if (isdigit(uc))
+        return Lexer_lex_int(lexer);
+      if (isalpha(uc) || uc == '_')
+        return Lexer_lex_identifier(lexer);
+      if (c == '"')
+        return Lexer_lex_string(lexer);
+
+      printf("Shouldn't have gotten here... char is '%c'\n", lexer->src[lexer->pos]);
+      advance(lexer, 1);
+    }
+    }
+  }
+
+  return Token_from_type(TOKEN_EOF, Lexer_loc(lexer));
+}
+
+/* Return the next token without consuming it.
+ * The whole lexer state is saved and restored around the call: restoring
+ * only `pos` would leave `line` and `col` advanced past the peeked token and
+ * corrupt the location of every later token. If the peeked token is an
+ * identifier or string literal, its heap-allocated string belongs to the
+ * caller. */
+Token Lexer_peek_next_token(Lexer *lexer)
+{
+  Lexer saved = *lexer;
+  Token token = Lexer_get_next_token(lexer);
+  *lexer = saved;
+  return token;
+} \ No newline at end of file
diff --git a/cup/lexer.h b/cup/lexer.h
new file mode 100644
index 0000000..c1adb85
--- /dev/null
+++ b/cup/lexer.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "tokens.h"
+#include <stdbool.h>
+
+typedef struct { // Lexer state: a cursor over an in-memory source buffer
+ char *src; // source text being tokenized (pointer is stored, not copied)
+ i64 len; // number of bytes of `src` the lexer will read
+ i64 pos; // index into `src` of the next unread character
+
+ char *filename; // name copied into every Location the lexer produces
+ i64 line; // current line, counted from 0
+ i64 col; // current column, counted from 0
+} Lexer;
+
+// Create a lexer over `len` bytes of `src`; `filename` is used for locations.
+Lexer Lexer_new(char *filename, char *src, i64 len);
+// Current position of the lexer, expressed as a Location.
+Location Lexer_loc(Lexer *lexer);
+// Skip whitespace, then report whether any input remains. Defined with
+// external linkage in lexer.c, so it is declared here for callers.
+bool Lexer_has_more(Lexer *lexer);
+
+// Consume and return the next token; yields TOKEN_EOF at end of input.
+Token Lexer_get_next_token(Lexer *lexer);
+// Return the next token, rewinding the read position afterwards.
+Token Lexer_peek_next_token(Lexer *lexer); \ No newline at end of file
diff --git a/cup/main.c b/cup/main.c
new file mode 100644
index 0000000..6a521b0
--- /dev/null
+++ b/cup/main.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "lexer.h"
+#include "unistd.h"
+
+/* Entry point: lex the file named by argv[1] (or the bundled example when no
+ * argument is given) and print every token, one per line, to stdout.
+ * Returns 0 on success and 1 if the file cannot be read into memory. */
+int main(int argc, char**argv) {
+  char *filename = argc > 1 ? argv[1] : "./examples/return-0.cup";
+
+  // Read entire file into memory
+  FILE *fp = fopen(filename, "r");
+  if (fp == NULL) {
+    perror(filename);
+    return 1;
+  }
+  if (fseek(fp, 0, SEEK_END) != 0) {
+    perror(filename);
+    fclose(fp);
+    return 1;
+  }
+  long fsize = ftell(fp);
+  if (fsize < 0 || fseek(fp, 0, SEEK_SET) != 0) {
+    perror(filename);
+    fclose(fp);
+    return 1;
+  }
+  char *source = malloc((size_t)fsize + 1);
+  if (source == NULL) {
+    fprintf(stderr, "%s: out of memory reading %ld bytes\n", filename, fsize);
+    fclose(fp);
+    return 1;
+  }
+  // Read byte-wise so the return value is the number of bytes actually
+  // stored; that count (not fsize) is the real length of the buffer.
+  size_t nread = fread(source, 1, (size_t)fsize, fp);
+  if (ferror(fp)) {
+    perror(filename);
+    free(source);
+    fclose(fp);
+    return 1;
+  }
+  source[nread] = 0;
+  fclose(fp);
+
+  // Lexer
+  Lexer lexer = Lexer_new(filename, source, (i64)nread);
+  Token token;
+  while ( (token = Lexer_get_next_token(&lexer)).type != TOKEN_EOF) {
+    Token_print(stdout, &token);
+    printf("\n");
+    // Identifier and string tokens own a heap-allocated string; release it
+    // once the token has been printed.
+    if (token.type == TOKEN_IDENTIFIER || token.type == TOKEN_STRINGLIT)
+      free(token.value.as_string);
+  }
+
+  free(source);
+  return 0;
+} \ No newline at end of file
diff --git a/cup/tokens.c b/cup/tokens.c
new file mode 100644
index 0000000..95a0700
--- /dev/null
+++ b/cup/tokens.c
@@ -0,0 +1,85 @@
+#include "tokens.h"
+#include <assert.h>
+#include <stdio.h>
+
+/* Make a token of kind `type` at `loc` with a zeroed payload. */
+Token Token_from_type(TokenType type, Location loc)
+{
+  Token token = { .type = type, .loc = loc };
+  return token;
+}
+
+/* Make a TOKEN_INTLIT token at `loc` whose payload is the integer `value`. */
+Token Token_from_int(i64 value, Location loc)
+{
+  Token token = {
+    .type = TOKEN_INTLIT,
+    .loc = loc,
+    .value.as_int = value,
+  };
+  return token;
+}
+
+/* Make a TOKEN_STRINGLIT token at `loc`. The pointer `value` is stored as-is;
+ * the string is not copied. */
+Token Token_from_string(char *value, Location loc)
+{
+  Token token = {
+    .type = TOKEN_STRINGLIT,
+    .loc = loc,
+    .value.as_string = value,
+  };
+  return token;
+}
+
+/* Make a TOKEN_IDENTIFIER token at `loc`. The pointer `value` is stored
+ * as-is; the name is not copied. */
+Token Token_from_identifier(char *value, Location loc)
+{
+  Token token = {
+    .type = TOKEN_IDENTIFIER,
+    .loc = loc,
+    .value.as_string = value,
+  };
+  return token;
+}
+
+/* Write `loc` to `f` as "filename:line:col". The stored line and column are
+ * each incremented by one for display. */
+void Location_print(FILE *f, Location loc)
+{
+  int shown_line = loc.line + 1;
+  int shown_col = loc.col + 1;
+  fprintf(f, "%s:%d:%d", loc.filename, shown_line, shown_col);
+}
+
+/* Write a human-readable form of `token` to `f`: its location, ": ", then
+ * the token's spelling. Keywords and EOF are shown in angle brackets,
+ * integer literals as decimal, string literals in double quotes and
+ * identifiers verbatim. No trailing newline is written. A token type outside
+ * the TokenType enum trips the assertion. */
+void Token_print(FILE *f, Token *token)
+{
+  Location_print(f, token->loc);
+  fprintf(f, ": ");
+  switch (token->type)
+  {
+  case TOKEN_OPEN_PAREN: fprintf(f, "("); break;
+  case TOKEN_CLOSE_PAREN: fprintf(f, ")"); break;
+  case TOKEN_OPEN_BRACE: fprintf(f, "{"); break;
+  case TOKEN_CLOSE_BRACE: fprintf(f, "}"); break;
+
+  case TOKEN_LT: fprintf(f, "<"); break;
+  case TOKEN_GT: fprintf(f, ">"); break;
+  case TOKEN_EQ: fprintf(f, "=="); break;
+  case TOKEN_NEQ: fprintf(f, "!="); break;
+  case TOKEN_LEQ: fprintf(f, "<="); break;
+  case TOKEN_GEQ: fprintf(f, ">="); break;
+  case TOKEN_LSHIFT: fprintf(f, "<<"); break;
+  case TOKEN_RSHIFT: fprintf(f, ">>"); break;
+
+  case TOKEN_ASSIGN: fprintf(f, "="); break;
+  case TOKEN_AMPERSAND: fprintf(f, "&"); break;
+  case TOKEN_BAR: fprintf(f, "|"); break;
+  case TOKEN_XOR: fprintf(f, "^"); break;
+  case TOKEN_AND: fprintf(f, "&&"); break;
+  case TOKEN_OR: fprintf(f, "||"); break;
+  case TOKEN_QUESTION: fprintf(f, "?"); break;
+
+  case TOKEN_PLUS: fprintf(f, "+"); break;
+  case TOKEN_MINUS: fprintf(f, "-"); break;
+  case TOKEN_STAR: fprintf(f, "*"); break;
+  case TOKEN_SLASH: fprintf(f, "/"); break;
+  case TOKEN_PERCENT: fprintf(f, "%%"); break;
+  case TOKEN_PLUSPLUS: fprintf(f, "++"); break;
+  case TOKEN_MINUSMINUS: fprintf(f, "--"); break;
+
+  case TOKEN_COLON: fprintf(f, ":"); break;
+  case TOKEN_SEMICOLON: fprintf(f, ";"); break;
+  case TOKEN_FN: fprintf(f, "<fn>"); break;
+  case TOKEN_RETURN: fprintf(f, "<return>"); break;
+  case TOKEN_INT: fprintf(f, "<int>"); break;
+  case TOKEN_EOF: fprintf(f, "<EOF>"); break;
+  // int64_t is `long` on some platforms and `long long` on others; cast so
+  // the argument always matches %lld.
+  case TOKEN_INTLIT: fprintf(f, "%lld", (long long)token->value.as_int); break;
+  case TOKEN_STRINGLIT: fprintf(f, "\"%s\"", token->value.as_string); break;
+  case TOKEN_IDENTIFIER: fprintf(f, "%s", token->value.as_string); break;
+  default: assert(false && "Unknown token type");
+  }
+} \ No newline at end of file
diff --git a/cup/tokens.h b/cup/tokens.h
new file mode 100644
index 0000000..d087845
--- /dev/null
+++ b/cup/tokens.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+typedef int64_t i64;
+
+typedef enum { // Every kind of token; entries are listed alphabetically, each with its spelling
+ TOKEN_AMPERSAND, // &
+ TOKEN_AND, // &&
+ TOKEN_ASSIGN, // =
+ TOKEN_BAR, // |
+ TOKEN_CLOSE_BRACE, // }
+ TOKEN_CLOSE_PAREN, // )
+ TOKEN_COLON, // :
+ TOKEN_EOF, // EOF
+ TOKEN_EQ, // ==
+ TOKEN_FN, // fn
+ TOKEN_GEQ, // >=
+ TOKEN_GT, // >
+ TOKEN_IDENTIFIER, // identifier
+ TOKEN_INT, // int
+ TOKEN_INTLIT, // integer literal
+ TOKEN_LEQ, // <=
+ TOKEN_LSHIFT, // <<
+ TOKEN_LT, // <
+ TOKEN_MINUS, // -
+ TOKEN_MINUSMINUS, // --
+ TOKEN_NEQ, // !=
+ TOKEN_OPEN_BRACE, // {
+ TOKEN_OPEN_PAREN, // (
+ TOKEN_OR, // ||
+ TOKEN_PERCENT, // %
+ TOKEN_PLUS, // +
+ TOKEN_PLUSPLUS, // ++
+ TOKEN_QUESTION, // ?
+ TOKEN_RETURN, // return
+ TOKEN_RSHIFT, // >>
+ TOKEN_SEMICOLON, // ;
+ TOKEN_SLASH, // /
+ TOKEN_STAR, // *
+ TOKEN_STRINGLIT, // string literal
+ TOKEN_XOR, // ^
+} TokenType;
+
+typedef struct { // A position inside a named source file
+ char *filename; // file the position refers to
+ int line; // stored line number; Location_print shows it plus one
+ int col; // stored column number; Location_print shows it plus one
+} Location;
+
+void Location_print(FILE *f, Location loc); // writes "filename:line:col" to f
+
+typedef struct { // A single lexed token
+ TokenType type; // which kind of token this is
+ Location loc; // source position recorded when the token was created
+ union tokens { // payload; which member is meaningful depends on `type`
+ i64 as_int; // set by Token_from_int (TOKEN_INTLIT)
+ char *as_string; // set by Token_from_string and Token_from_identifier
+ char as_char; // NOTE(review): nothing visible here writes this member
+ } value;
+} Token;
+
+Token Token_from_type(TokenType type, Location loc); // token of the given kind with a zeroed payload
+Token Token_from_int(i64 value, Location loc); // TOKEN_INTLIT carrying `value`
+Token Token_from_string(char *value, Location loc); // TOKEN_STRINGLIT; stores the pointer, does not copy
+Token Token_from_identifier(char *value, Location loc); // TOKEN_IDENTIFIER; stores the pointer, does not copy
+
+void Token_print(FILE *f, Token *token); \ No newline at end of file
diff --git a/cup/utils.c b/cup/utils.c
new file mode 100644
index 0000000..3a586cd
--- /dev/null
+++ b/cup/utils.c
@@ -0,0 +1,24 @@
+#include "utils.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Print a printf-style message built from `fmt` and the variadic arguments
+ * to stderr, then terminate the process with exit status 1. */
+void die(const char *fmt, ...)
+{
+  va_list ap;
+
+  va_start(ap, fmt);
+  vfprintf(stderr, fmt, ap);
+  va_end(ap);
+
+  exit(1);
+}
+
+/* Like die(), but the message is prefixed with `loc` as printed by
+ * Location_print. Writes to stderr and terminates with exit status 1. */
+void die_location(Location loc, const char *fmt, ...)
+{
+  va_list ap;
+
+  Location_print(stderr, loc);
+
+  va_start(ap, fmt);
+  vfprintf(stderr, fmt, ap);
+  va_end(ap);
+
+  exit(1);
+} \ No newline at end of file
diff --git a/cup/utils.h b/cup/utils.h
new file mode 100644
index 0000000..df6f56a
--- /dev/null
+++ b/cup/utils.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "tokens.h"
+
+void die(const char *fmt, ...); // print a printf-style message to stderr, then exit(1)
+void die_location(Location loc, const char *fmt, ...); \ No newline at end of file
diff --git a/examples/return-0.cup b/examples/return-0.cup
new file mode 100644
index 0000000..5cc8b30
--- /dev/null
+++ b/examples/return-0.cup
@@ -0,0 +1,3 @@
+fn main(): int {
+ return 0;
+} \ No newline at end of file