Initial commit + start of lexer

author: Mustafa Quraish <[email protected]> 2022-01-24 11:41:31 -0500
committer: Mustafa Quraish <[email protected]> 2022-01-24 11:41:31 -0500
commit: 8fdfe1150a2155da5ea9d458adc347c4a6de5ce1 (patch)
tree: e0383491fac4c2d9f65799270fba4e596f8852aa
download: cup-8fdfe1150a2155da5ea9d458adc347c4a6de5ce1.tar.xz
cup-8fdfe1150a2155da5ea9d458adc347c4a6de5ce1.zip
11 files changed, 449 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..86a05e7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+cupcc
+*.dSYM
+*.vscode
+*.idea
+*.DS_Store
+\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f315b41
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# CUP: C U, Python!
+
+A badly named, in-progress programming language just to learn how these things work.
+Wait, doesn't everyone write a compiler when they're bored?
+
+Why am I writing this in C? Good question.
+\ No newline at end of file
diff --git a/compile.sh b/compile.sh
new file mode 100755
index 0000000..faeaf52
--- /dev/null
+++ b/compile.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Build the cupcc compiler from cup/*.c, then run it with this script's arguments.
+
+CC=gcc
+CFLAGS="-Wall -Wextra -Werror -ggdb3"
+SRCS=cup/*.c
+
+# -x echoes each command, -e aborts on the first failure (e.g. a compile error).
+set -xe
+
+# CFLAGS and SRCS are left unquoted on purpose: they must word-split / glob-expand.
+$CC $CFLAGS $SRCS -o cupcc
+./cupcc "$@"
diff --git a/cup/lexer.c b/cup/lexer.c
new file mode 100644
index 0000000..1109294
--- /dev/null
+++ b/cup/lexer.c
@@ -0,0 +1,245 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+// <ctype.h> classifiers are undefined for negative arguments, and plain `char`
+// may be signed, so every source byte goes through unsigned char first.
+static bool is_space_char(char c) { return isspace((unsigned char)c) != 0; }
+static bool is_digit_char(char c) { return isdigit((unsigned char)c) != 0; }
+static bool is_ident_start(char c) { return isalpha((unsigned char)c) || c == '_'; }
+static bool is_ident_char(char c) { return isalnum((unsigned char)c) || c == '_'; }
+
+// Create a lexer over `src` (`len` bytes), starting at offset 0, line 0, col 0.
+// `filename` is only used for locations; neither pointer is copied or freed.
+Lexer Lexer_new(char *filename, char *src, i64 len)
+{
+    Lexer self = {0};
+    self.src = src;
+    self.len = len;
+    self.filename = filename;
+    return self;
+}
+
+// Snapshot the lexer's current filename/line/col as a Location.
+Location Lexer_loc(Lexer *lexer)
+{
+    Location loc = {0};
+    loc.filename = lexer->filename;
+    loc.line = lexer->line;
+    loc.col = lexer->col;
+    return loc;
+}
+
+// Advance past whitespace, bumping `line` (and resetting `col`) at each '\n'.
+static void Lexer_skip_whitespace(Lexer *lexer)
+{
+    while (lexer->pos < lexer->len && is_space_char(lexer->src[lexer->pos])) {
+        if (lexer->src[lexer->pos] == '\n') {
+            lexer->line++;
+            lexer->col = 0;
+        } else {
+            lexer->col++;
+        }
+        lexer->pos++;
+    }
+}
+
+// Skip whitespace, then report whether any input remains.
+bool Lexer_has_more(Lexer *lexer)
+{
+    Lexer_skip_whitespace(lexer);
+    return lexer->pos < lexer->len;
+}
+
+// True when the input at `pos` begins with keyword `str` and the keyword is not
+// merely the prefix of a longer identifier (e.g. "int" must not match "integer").
+static bool Lexer_starts_with(Lexer *lexer, char *str)
+{
+    i64 len = strlen(str);
+    if (lexer->len - lexer->pos < len)
+        return false;
+    if (memcmp(lexer->src + lexer->pos, str, (size_t)len) != 0)
+        return false;
+    i64 end_pos = lexer->pos + len;
+    if (end_pos == lexer->len)
+        return true;
+    return !is_ident_char(lexer->src[end_pos]);
+}
+
+// Consume `amount` bytes on the current line: `col` moves with `pos`, `line` is untouched.
+static void advance(Lexer *lexer, i64 amount)
+{
+    lexer->pos += amount;
+    lexer->col += amount;
+}
+
+// Look `amount` bytes ahead of `pos`; yields '\0' at or past the end of input.
+static char peek(Lexer *lexer, int amount)
+{
+    if (lexer->pos + amount >= lexer->len)
+        return '\0';
+    return lexer->src[lexer->pos + amount];
+}
+
+// Build a payload-free token at the current location, then consume
+// `inc_amount` bytes of input.
+static Token Lexer_make_token(Lexer *lexer, TokenType type, int inc_amount)
+{
+    Token token = {0};
+    token.type = type;
+    token.loc = Lexer_loc(lexer);
+    advance(lexer, inc_amount);
+    return token;
+}
+
+// Copy `count` bytes starting at `start` into a fresh NUL-terminated buffer.
+// Dies if the allocation fails. The caller owns the result.
+static char *copy_text(const char *start, i64 count)
+{
+    char *str = calloc((size_t)count + 1, 1);
+    if (str == NULL)
+        die("ERROR: Out of memory\n");
+    memcpy(str, start, (size_t)count);
+    return str;
+}
+
+// Lex a run of decimal digits into a TOKEN_INTLIT; dies if it overflows i64.
+static Token Lexer_lex_number(Lexer *lexer)
+{
+    // TODO: Parse hex and octal numbers
+    Location loc = Lexer_loc(lexer);
+    i64 pos = lexer->pos;
+    i64 value = 0;
+    while (pos < lexer->len && is_digit_char(lexer->src[pos])) {
+        int digit = lexer->src[pos] - '0';
+        // Reject the literal before value * 10 + digit could overflow i64.
+        if (value > (INT64_MAX - digit) / 10)
+            die_location(loc, ": ERROR: Integer literal does not fit in 64 bits.\n");
+        value = value * 10 + digit;
+        pos++;
+    }
+    Token token = Token_from_int(value, loc);
+    advance(lexer, pos - lexer->pos);
+    return token;
+}
+
+// Lex an identifier: a letter or '_' followed by letters, digits or '_'.
+// The token carries a heap-allocated copy of the text.
+static Token Lexer_lex_identifier(Lexer *lexer)
+{
+    i64 pos = lexer->pos;
+    while (pos < lexer->len && is_ident_char(lexer->src[pos]))
+        pos++;
+    char *str = copy_text(lexer->src + lexer->pos, pos - lexer->pos);
+    Token token = Token_from_identifier(str, Lexer_loc(lexer));
+    advance(lexer, pos - lexer->pos);
+    return token;
+}
+
+// Lex a double-quoted string literal into a TOKEN_STRINGLIT whose text excludes
+// the quotes. Dies if the closing quote is missing.
+static Token Lexer_lex_string(Lexer *lexer)
+{
+    // TODO: Handle escapes
+    i64 pos = lexer->pos + 1;
+    while (pos < lexer->len && lexer->src[pos] != '"')
+        pos++;
+    if (pos == lexer->len) {
+        die_location(Lexer_loc(lexer), ": ERROR: Reached end-of-file while parsing string literal beginning here.\n");
+    }
+    // Careful with indexing here, because we want to skip opening and closing quotes
+    char *str = copy_text(lexer->src + lexer->pos + 1, pos - lexer->pos - 1);
+    Token token = Token_from_string(str, Lexer_loc(lexer));
+    advance(lexer, pos - lexer->pos + 1);
+    return token;
+}
+
+// Consume and return the next token, skipping whitespace first. Returns a
+// TOKEN_EOF token once the input is exhausted. Bytes that start no known token
+// are reported on stdout and skipped.
+Token Lexer_get_next_token(Lexer *lexer)
+{
+    while (lexer->pos < lexer->len) {
+        char here = lexer->src[lexer->pos];
+        switch (here)
+        {
+        case ' ': case '\t': case '\r': advance(lexer, 1); break;
+        case '\n': lexer->line++; lexer->col = 0; lexer->pos ++; break;
+        case '(': return Lexer_make_token(lexer, TOKEN_OPEN_PAREN, 1);
+        case ')': return Lexer_make_token(lexer, TOKEN_CLOSE_PAREN, 1);
+        case '{': return Lexer_make_token(lexer, TOKEN_OPEN_BRACE, 1);
+        case '}': return Lexer_make_token(lexer, TOKEN_CLOSE_BRACE, 1);
+        case ';': return Lexer_make_token(lexer, TOKEN_SEMICOLON, 1);
+        case ':': return Lexer_make_token(lexer, TOKEN_COLON, 1);
+        case '&': return Lexer_make_token(lexer, TOKEN_AMPERSAND, 1);
+
+        case '<': {
+            if (peek(lexer, 1) == '=')
+                return Lexer_make_token(lexer, TOKEN_LEQ, 2);
+            return Lexer_make_token(lexer, TOKEN_LT, 1);
+        }
+
+        case '>': {
+            if (peek(lexer, 1) == '=')
+                return Lexer_make_token(lexer, TOKEN_GEQ, 2);
+            return Lexer_make_token(lexer, TOKEN_GT, 1);
+        }
+
+        case '=': {
+            if (peek(lexer, 1) == '=')
+                return Lexer_make_token(lexer, TOKEN_EQ, 2);
+            return Lexer_make_token(lexer, TOKEN_ASSIGN, 1);
+        }
+
+        case '+': {
+            if (peek(lexer, 1) == '+')
+                return Lexer_make_token(lexer, TOKEN_PLUSPLUS, 2);
+            return Lexer_make_token(lexer, TOKEN_PLUS, 1);
+        }
+
+        case '-': {
+            if (peek(lexer, 1) == '-')
+                return Lexer_make_token(lexer, TOKEN_MINUSMINUS, 2);
+            return Lexer_make_token(lexer, TOKEN_MINUS, 1);
+        }
+
+        case '*': return Lexer_make_token(lexer, TOKEN_STAR, 1);
+        case '/': return Lexer_make_token(lexer, TOKEN_SLASH, 1);
+        case '%': return Lexer_make_token(lexer, TOKEN_PERCENT, 1);
+
+        default: {
+            // "!=" maps to TOKEN_NEQ. A lone '!' has no token type yet, so it
+            // falls through to the unknown-character report below.
+            if (here == '!' && peek(lexer, 1) == '=')
+                return Lexer_make_token(lexer, TOKEN_NEQ, 2);
+
+            if (Lexer_starts_with(lexer, "fn"))     return Lexer_make_token(lexer, TOKEN_FN,     2);
+            if (Lexer_starts_with(lexer, "return")) return Lexer_make_token(lexer, TOKEN_RETURN, 6);
+            if (Lexer_starts_with(lexer, "int"))    return Lexer_make_token(lexer, TOKEN_INT,    3);
+
+            if (is_digit_char(here))  return Lexer_lex_number(lexer);
+            if (is_ident_start(here)) return Lexer_lex_identifier(lexer);
+            if (here == '"')          return Lexer_lex_string(lexer);
+
+            printf("Shouldn't have gotten here... char is '%c'\n", here);
+            advance(lexer, 1);
+        }
+        }
+    }
+
+    return Token_from_type(TOKEN_EOF, Lexer_loc(lexer));
+}
+
+// Return the next token without consuming it. The whole lexer state is saved
+// and restored: rewinding `pos` alone would leave `line`/`col` advanced and
+// corrupt the locations of every later token.
+Token Lexer_peek_next_token(Lexer *lexer)
+{
+    Lexer saved = *lexer;
+    Token token = Lexer_get_next_token(lexer);
+    *lexer = saved;
+    return token;
+}
+\ No newline at end of file
diff --git a/cup/lexer.h b/cup/lexer.h
new file mode 100644
index 0000000..c1adb85
--- /dev/null
+++ b/cup/lexer.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "tokens.h"
+#include <stdbool.h>
+
+// Cursor over an in-memory source buffer. The lexer only reads through `src`
+// and `filename`; it never frees them.
+typedef struct {
+    char *src;       // source text being tokenized
+    i64 len;         // number of bytes of `src` to consider
+    i64 pos;         // offset of the next unread byte
+
+    char *filename;  // name reported in token locations
+    i64 line;        // 0-based line of `pos`
+    i64 col;         // 0-based column of `pos`
+} Lexer;
+
+// Create a lexer positioned at the start of `src` (offset 0, line 0, col 0).
+Lexer Lexer_new(char *filename, char *src, i64 len);
+// Snapshot the current filename/line/col as a Location.
+Location Lexer_loc(Lexer *lexer);
+
+// Skip whitespace, then report whether any input remains.
+// Defined in lexer.c; declared here so other translation units can call it.
+bool Lexer_has_more(Lexer *lexer);
+// Consume and return the next token; returns TOKEN_EOF once input is exhausted.
+Token Lexer_get_next_token(Lexer *lexer);
+// Return the token Lexer_get_next_token would produce, then rewind the read position.
+Token Lexer_peek_next_token(Lexer *lexer);
+\ No newline at end of file
diff --git a/cup/main.c b/cup/main.c
new file mode 100644
index 0000000..6a521b0
--- /dev/null
+++ b/cup/main.c
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "lexer.h"
+#include "unistd.h"
+
+// Read the file named by argv[1] (or a bundled example when no argument is
+// given), tokenize it, and print one token per line to stdout.
+// Returns 0 on success and 1 if the file cannot be read into memory.
+int main(int argc, char**argv) {
+    char *filename = argc > 1 ? argv[1] : "./examples/return-0.cup";
+
+    // Read entire file into memory
+    FILE *fp = fopen(filename, "r");
+    if (fp == NULL) {
+        perror(filename);
+        return 1;
+    }
+    // Seek to the end to learn the size; ftell reports -1 on failure.
+    long fsize = -1;
+    if (fseek(fp, 0, SEEK_END) == 0)
+        fsize = ftell(fp);
+    if (fsize < 0 || fseek(fp, 0, SEEK_SET) != 0) {
+        perror(filename);
+        fclose(fp);
+        return 1;
+    }
+    char *source = malloc((size_t)fsize + 1);
+    if (source == NULL) {
+        fprintf(stderr, "%s: out of memory\n", filename);
+        fclose(fp);
+        return 1;
+    }
+    // Trust the byte count fread reports rather than fsize: a short read must
+    // not leave uninitialized bytes inside the buffer handed to the lexer.
+    size_t nread = fread(source, 1, (size_t)fsize, fp);
+    if (ferror(fp)) {
+        perror(filename);
+        free(source);
+        fclose(fp);
+        return 1;
+    }
+    source[nread] = 0;
+    fclose(fp);
+
+    // Lexer
+    Lexer lexer = Lexer_new(filename, source, (i64)nread);
+    Token token;
+    while ( (token = Lexer_get_next_token(&lexer)).type != TOKEN_EOF) {
+        Token_print(stdout, &token);
+        printf("\n");
+    }
+
+    free(source);
+    return 0;
+}
+\ No newline at end of file
diff --git a/cup/tokens.c b/cup/tokens.c
new file mode 100644
index 0000000..95a0700
--- /dev/null
+++ b/cup/tokens.c
@@ -0,0 +1,98 @@
+#include "tokens.h"
+#include <assert.h>
+#include <stdio.h>
+
+// Build a token that carries no payload (punctuation, keywords, EOF).
+Token Token_from_type(TokenType type, Location loc)
+{
+    Token token = {0};
+    token.type = type;
+    token.loc = loc;
+    return token;
+}
+
+// Build a TOKEN_INTLIT holding `value`.
+Token Token_from_int(i64 value, Location loc)
+{
+    Token token = {0};
+    token.type = TOKEN_INTLIT;
+    token.value.as_int = value;
+    token.loc = loc;
+    return token;
+}
+
+// Build a TOKEN_STRINGLIT. The token stores the `value` pointer itself; the
+// text is not copied.
+Token Token_from_string(char *value, Location loc)
+{
+    Token token = {0};
+    token.type = TOKEN_STRINGLIT;
+    token.value.as_string = value;
+    token.loc = loc;
+    return token;
+}
+
+// Build a TOKEN_IDENTIFIER. The token stores the `value` pointer itself; the
+// text is not copied.
+Token Token_from_identifier(char *value, Location loc)
+{
+    Token token = {0};
+    token.type = TOKEN_IDENTIFIER;
+    token.value.as_string = value;
+    token.loc = loc;
+    return token;
+}
+
+// Write `loc` as "filename:line:col". Stored line/col are 0-based, so both are
+// printed with +1.
+void Location_print(FILE *f, Location loc)
+{
+    fprintf(f, "%s:%d:%d", loc.filename, loc.line+1, loc.col+1);
+}
+
+// Write "<location>: <token text>" to `f` (no trailing newline). Keywords and
+// EOF are shown in angle brackets, string literals in double quotes.
+// Token types without a case below trip the assert.
+void Token_print(FILE *f, Token *token)
+{
+    Location_print(f, token->loc);
+    fprintf(f, ": ");
+    switch (token->type)
+    {
+        case TOKEN_OPEN_PAREN: fprintf(f, "("); break;
+        case TOKEN_CLOSE_PAREN: fprintf(f, ")"); break;
+        case TOKEN_OPEN_BRACE: fprintf(f, "{"); break;
+        case TOKEN_CLOSE_BRACE: fprintf(f, "}"); break;
+
+        case TOKEN_LT: fprintf(f, "<"); break;
+        case TOKEN_GT: fprintf(f, ">"); break;
+        case TOKEN_EQ: fprintf(f, "=="); break;
+        case TOKEN_NEQ: fprintf(f, "!="); break;
+        case TOKEN_LEQ: fprintf(f, "<="); break;
+        case TOKEN_GEQ: fprintf(f, ">="); break;
+
+        case TOKEN_ASSIGN: fprintf(f, "="); break; // =
+        case TOKEN_AMPERSAND: fprintf(f, "&"); break;
+
+        case TOKEN_PLUS: fprintf(f, "+"); break;
+        case TOKEN_MINUS: fprintf(f, "-"); break;
+        case TOKEN_STAR: fprintf(f, "*"); break;
+        case TOKEN_SLASH: fprintf(f, "/"); break;
+        case TOKEN_PERCENT: fprintf(f, "%%"); break;
+        case TOKEN_PLUSPLUS: fprintf(f, "++"); break;
+        case TOKEN_MINUSMINUS: fprintf(f, "--"); break;
+
+        case TOKEN_COLON: fprintf(f, ":"); break;
+        case TOKEN_SEMICOLON: fprintf(f, ";"); break;
+        case TOKEN_FN: fprintf(f, "<fn>"); break;
+        case TOKEN_RETURN: fprintf(f, "<return>"); break;
+        case TOKEN_INT: fprintf(f, "<int>"); break;
+        case TOKEN_EOF: fprintf(f, "<EOF>"); break;
+        // int64_t is `long` on LP64 Linux, so passing it to %lld is a -Wformat
+        // error under -Werror; the cast makes the argument match everywhere.
+        case TOKEN_INTLIT: fprintf(f, "%lld", (long long)token->value.as_int); break;
+        case TOKEN_STRINGLIT: fprintf(f, "\"%s\"", token->value.as_string); break;
+        case TOKEN_IDENTIFIER: fprintf(f, "%s", token->value.as_string); break;
+        default: assert(false && "Unknown token type");
+    }
+}
+\ No newline at end of file
diff --git a/cup/tokens.h b/cup/tokens.h
new file mode 100644
index 0000000..d087845
--- /dev/null
+++ b/cup/tokens.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+typedef int64_t i64;  // 64-bit signed integer; used for integer-literal payloads
+
+typedef enum {  // kinds of token; members are listed in alphabetical order
+    TOKEN_AMPERSAND,    // &
+    TOKEN_AND,          // &&
+    TOKEN_ASSIGN,       // =
+    TOKEN_BAR,          // |
+    TOKEN_CLOSE_BRACE,  // }
+    TOKEN_CLOSE_PAREN,  // )
+    TOKEN_COLON,        // :
+    TOKEN_EOF,          // EOF
+    TOKEN_EQ,           // ==
+    TOKEN_FN,           // fn
+    TOKEN_GEQ,          // >=
+    TOKEN_GT,           // >
+    TOKEN_IDENTIFIER,   // identifier
+    TOKEN_INT,          // int
+    TOKEN_INTLIT,       // integer literal
+    TOKEN_LEQ,          // <=
+    TOKEN_LSHIFT,       // <<
+    TOKEN_LT,           // <
+    TOKEN_MINUS,        // -
+    TOKEN_MINUSMINUS,   // --
+    TOKEN_NEQ,          // !=
+    TOKEN_OPEN_BRACE,   // {
+    TOKEN_OPEN_PAREN,   // (
+    TOKEN_OR,           // ||
+    TOKEN_PERCENT,      // %
+    TOKEN_PLUS,         // +
+    TOKEN_PLUSPLUS,     // ++
+    TOKEN_QUESTION,     // ?
+    TOKEN_RETURN,       // return
+    TOKEN_RSHIFT,       // >>
+    TOKEN_SEMICOLON,    // ;
+    TOKEN_SLASH,        // /
+    TOKEN_STAR,         // *
+    TOKEN_STRINGLIT,    // string literal
+    TOKEN_XOR,          // ^
+} TokenType;
+
+typedef struct {  // a position inside a source file
+    char *filename;  // printed as-is by Location_print
+    int line;  // 0-based; Location_print shows line+1
+    int col;  // 0-based; Location_print shows col+1
+} Location;
+
+void Location_print(FILE *f, Location loc);  // writes "filename:line:col" to f
+
+typedef struct {
+    TokenType type;  // selects which `value` member, if any, is meaningful
+    Location loc;  // where the token was found
+    union tokens {  // payload; left zeroed by Token_from_type
+        i64 as_int;  // set by Token_from_int (TOKEN_INTLIT)
+        char *as_string;  // set by Token_from_string / Token_from_identifier
+        char as_char;  // NOTE(review): no constructor in tokens.c sets this member
+    } value;
+} Token;
+
+Token Token_from_type(TokenType type, Location loc);  // payload-free token of the given type
+Token Token_from_int(i64 value, Location loc);  // TOKEN_INTLIT carrying value
+Token Token_from_string(char *value, Location loc);  // TOKEN_STRINGLIT; stores the pointer, no copy
+Token Token_from_identifier(char *value, Location loc);  // TOKEN_IDENTIFIER; stores the pointer, no copy
+
+void Token_print(FILE *f, Token *token);  // writes "<location>: <token text>" to f
+\ No newline at end of file
diff --git a/cup/utils.c b/cup/utils.c
new file mode 100644
index 0000000..3a586cd
--- /dev/null
+++ b/cup/utils.c
@@ -0,0 +1,30 @@
+#include "utils.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Print a printf-style message to stderr and terminate with exit status 1.
+void die(const char *fmt, ...)
+{
+    va_list ap;
+
+    va_start(ap, fmt);
+    vfprintf(stderr, fmt, ap);
+    va_end(ap);
+    exit(1);
+}
+
+// Like die(), but the message is preceded by `loc` as written by
+// Location_print. No separator is inserted between the two, so `fmt`
+// has to supply its own.
+void die_location(Location loc, const char *fmt, ...)
+{
+    va_list ap;
+
+    va_start(ap, fmt);
+    Location_print(stderr, loc);
+    vfprintf(stderr, fmt, ap);
+    va_end(ap);
+    exit(1);
+}
+\ No newline at end of file
diff --git a/cup/utils.h b/cup/utils.h
new file mode 100644
index 0000000..df6f56a
--- /dev/null
+++ b/cup/utils.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "tokens.h"  // for Location
+
+void die(const char *fmt, ...);  // print a printf-style message to stderr, then exit(1)
+void die_location(Location loc, const char *fmt, ...);  // same, with `loc` printed before the message
+\ No newline at end of file
diff --git a/examples/return-0.cup b/examples/return-0.cup
new file mode 100644
index 0000000..5cc8b30
--- /dev/null
+++ b/examples/return-0.cup
@@ -0,0 +1,3 @@
+fn main(): int {
+    return 0;
+}
+\ No newline at end of file
author	Mustafa Quraish <[email protected]>	2022-01-24 11:41:31 -0500
committer	Mustafa Quraish <[email protected]>	2022-01-24 11:41:31 -0500
commit	8fdfe1150a2155da5ea9d458adc347c4a6de5ce1 (patch)
tree	e0383491fac4c2d9f65799270fba4e596f8852aa
download	cup-8fdfe1150a2155da5ea9d458adc347c4a6de5ce1.tar.xz cup-8fdfe1150a2155da5ea9d458adc347c4a6de5ce1.zip