aboutsummaryrefslogtreecommitdiff
path: root/src/lexer.c
diff options
context:
space:
mode:
author: Mustafa Quraish <[email protected]> 2022-01-30 01:19:54 -0500
committer: Mustafa Quraish <[email protected]> 2022-01-30 01:19:54 -0500
commit: 5b87ec6ef2b84f1d319e5376bcf9eedea9829d79 (patch)
tree: f1afa80f034a1cd73192510046978435252e1160 /src/lexer.c
parent: Remove return-0 example (diff)
download: cup-5b87ec6ef2b84f1d319e5376bcf9eedea9829d79.tar.xz
download: cup-5b87ec6ef2b84f1d319e5376bcf9eedea9829d79.zip
Rename `cup` directory to `src`
Diffstat (limited to 'src/lexer.c')
-rw-r--r-- src/lexer.c 238
1 files changed, 238 insertions, 0 deletions
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..ea095ba
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,238 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+/*
+ * Build a lexer over the buffer `src` of `len` bytes, reporting `filename`
+ * in the locations it produces. Every field not named here (including the
+ * position, line and column counters) starts out zeroed.
+ */
+Lexer Lexer_new(char *filename, char *src, i64 len)
+{
+  Lexer lexer = {
+    .filename = filename,
+    .src = src,
+    .len = len,
+  };
+  return lexer;
+}
+
+/* Snapshot the lexer's filename, line and column into a Location value. */
+static Location Lexer_loc(Lexer *lexer)
+{
+  Location here = {
+    .filename = lexer->filename,
+    .line = lexer->line,
+    .col = lexer->col,
+  };
+  return here;
+}
+
+/*
+ * Consume the run of whitespace (as classified by isspace) at the current
+ * position, keeping the counters in step: a newline starts a new line at
+ * column 0, any other whitespace character moves one column to the right.
+ */
+static void Lexer_skip_whitespace(Lexer *lexer)
+{
+  while (lexer->pos < lexer->len) {
+    // <ctype.h> classifiers require a value representable as unsigned char.
+    // Passing a plain char is undefined behaviour for bytes >= 0x80 on
+    // platforms where char is signed, so convert first.
+    unsigned char c = (unsigned char)lexer->src[lexer->pos];
+    if (!isspace(c))
+      break;
+
+    if (c == '\n') {
+      lexer->line++;
+      lexer->col = 0;
+    } else {
+      lexer->col++;
+    }
+    lexer->pos++;
+  }
+}
+
+/*
+ * Report whether any input remains once leading whitespace is consumed.
+ * Side effect: the lexer is moved past that whitespace. Comments are not
+ * skipped here, only whitespace.
+ */
+bool Lexer_has_more(Lexer *lexer)
+{
+  Lexer_skip_whitespace(lexer);
+  bool exhausted = lexer->pos >= lexer->len;
+  return !exhausted;
+}
+
+/*
+ * Return true when the input at the current position begins with `str` AND
+ * the match ends on a word boundary, i.e. the character following the match
+ * (if there is one) is not a letter, digit or underscore. The boundary check
+ * keeps a keyword such as "if" from matching the front of an identifier such
+ * as "iffy". Nothing is consumed.
+ */
+static bool Lexer_starts_with(Lexer *lexer, char *str)
+{
+  i64 len = strlen(str);
+  if (lexer->len - lexer->pos < len)
+    return false; // not enough input left to hold `str`
+  if (memcmp(lexer->src + lexer->pos, str, len) != 0)
+    return false;
+
+  i64 end_pos = lexer->pos + len;
+  if (end_pos == lexer->len)
+    return true; // match runs exactly to the end of the input
+
+  // Convert to unsigned char before calling <ctype.h>: a negative plain char
+  // (byte >= 0x80 with signed char) would be undefined behaviour.
+  unsigned char end_char = (unsigned char)lexer->src[end_pos];
+  return !(isalnum(end_char) || end_char == '_');
+}
+
+/*
+ * Step the lexer `amount` characters along the current line: both the offset
+ * and the column move by `amount`. The line counter is left untouched, so
+ * callers stepping over a newline must account for it themselves.
+ */
+static void advance(Lexer *lexer, i64 amount)
+{
+  lexer->col = lexer->col + amount;
+  lexer->pos = lexer->pos + amount;
+}
+
+/*
+ * Look at the character `amount` positions ahead of the current one without
+ * consuming anything. Yields '\0' when that position is at or past the end
+ * of the input.
+ */
+static char peek(Lexer *lexer, int amount)
+{
+  i64 target = lexer->pos + amount;
+  return (target < lexer->len) ? lexer->src[target] : '\0';
+}
+
+/*
+ * Produce a token of `type` located at the current position, then consume
+ * `inc_amount` characters. The location is captured before advancing, so the
+ * token points at its own first character. All other token fields are zero.
+ */
+static Token Lexer_make_token(Lexer *lexer, TokenType type, int inc_amount)
+{
+  Token result = {
+    .type = type,
+    .loc = Lexer_loc(lexer),
+  };
+  advance(lexer, inc_amount);
+  return result;
+}
+
+// If the input at the current position is the keyword `str` (as decided by
+// Lexer_starts_with), return the corresponding token from the ENCLOSING
+// function; otherwise do nothing. Requires a `Lexer *lexer` in scope.
+// Wrapped in do { } while (0) so that `LEX_KEYWORD(...);` is exactly one
+// statement and cannot capture a following `else`.
+#define LEX_KEYWORD(str, token_type)                                      \
+  do {                                                                    \
+    if (Lexer_starts_with(lexer, (str)))                                  \
+      return Lexer_make_token(lexer, (token_type), (int)strlen(str));     \
+  } while (0)
+
+// Copy `len` bytes of source text starting at offset `start` into a fresh,
+// NUL-terminated heap buffer. The buffer is handed to the token that is
+// built from it; nothing in this file frees it.
+static char *Lexer_copy_text(Lexer *lexer, i64 start, i64 len)
+{
+  char *text = calloc(len + 1, 1);
+  if (text == NULL) {
+    die_location(Lexer_loc(lexer), ": ERROR: Out of memory while lexing\n");
+    exit(1); // in case die_location returns; never write through NULL
+  }
+  memcpy(text, lexer->src + start, len);
+  return text;
+}
+
+// Lex a run of decimal digits starting at the current position.
+static Token Lexer_lex_number(Lexer *lexer)
+{
+  // TODO: Parse hex and octal numbers
+  i64 end = lexer->pos;
+  while (end < lexer->len && isdigit((unsigned char)lexer->src[end]))
+    end++;
+  // strtoll instead of atoi: atoi has undefined behaviour when the value does
+  // not fit in an int, whereas strtoll saturates on overflow. Like the atoi
+  // call it replaces, this relies on the digit run being followed by a
+  // non-digit (or a NUL terminator) in the buffer.
+  Token token = Token_from_int(strtoll(lexer->src + lexer->pos, NULL, 10), Lexer_loc(lexer));
+  advance(lexer, end - lexer->pos);
+  return token;
+}
+
+// Lex an identifier: letters, digits and underscores starting at the current
+// position (the caller has checked that the first character is valid).
+static Token Lexer_lex_identifier(Lexer *lexer)
+{
+  i64 end = lexer->pos;
+  while (end < lexer->len && (isalnum((unsigned char)lexer->src[end]) || lexer->src[end] == '_'))
+    end++;
+  i64 name_len = end - lexer->pos;
+  char *name = Lexer_copy_text(lexer, lexer->pos, name_len);
+  Token token = Token_from_identifier(name, Lexer_loc(lexer));
+  advance(lexer, name_len);
+  return token;
+}
+
+// Lex a double-quoted string literal whose opening quote is at the current
+// position. The token text excludes both quotes.
+static Token Lexer_lex_string(Lexer *lexer)
+{
+  // TODO: Handle escapes
+  Location start_loc = Lexer_loc(lexer);
+  i64 end = lexer->pos + 1;
+  while (end < lexer->len && lexer->src[end] != '"')
+    end++;
+  if (end == lexer->len) {
+    die_location(start_loc, ": ERROR: Reached end-of-file while parsing string literal beginning here.\n");
+  }
+
+  // Careful with indexing here, because we want to skip opening and closing quotes
+  char *text = Lexer_copy_text(lexer, lexer->pos + 1, end - lexer->pos - 1);
+  // NOTE(review): string literals are wrapped with Token_from_identifier, the
+  // same constructor used for identifiers -- confirm this is intended.
+  Token token = Token_from_identifier(text, start_loc);
+
+  // Consume opening quote, contents and closing quote one character at a
+  // time so that a newline inside the literal bumps the line counter instead
+  // of being counted as a column.
+  i64 stop = (end < lexer->len) ? end + 1 : lexer->len;
+  for (i64 i = lexer->pos; i < stop; i++) {
+    if (lexer->src[i] == '\n') {
+      lexer->line++;
+      lexer->col = 0;
+    } else {
+      lexer->col++;
+    }
+  }
+  lexer->pos = end + 1;
+  return token;
+}
+
+/*
+ * Consume and return the next token from the input.
+ *
+ * Spaces, tabs, carriage returns, newlines and `//` line comments are
+ * skipped. Punctuation and operators are matched first (two-character
+ * operators take priority over their one-character prefixes), then keywords,
+ * decimal integer literals, identifiers and double-quoted string literals.
+ * Identifier and string tokens carry a heap-allocated copy of their text.
+ * Any other character is reported through die_location. Once the input is
+ * exhausted a TOKEN_EOF token located at the final position is returned.
+ */
+Token Lexer_next(Lexer *lexer)
+{
+  while (lexer->pos < lexer->len) {
+    switch (lexer->src[lexer->pos])
+    {
+      case ' ': case '\t': case '\r': advance(lexer, 1); continue;
+      case '\n': lexer->line++; lexer->col = 0; lexer->pos++; continue;
+      case '(': return Lexer_make_token(lexer, TOKEN_OPEN_PAREN, 1);
+      case ')': return Lexer_make_token(lexer, TOKEN_CLOSE_PAREN, 1);
+      case '{': return Lexer_make_token(lexer, TOKEN_OPEN_BRACE, 1);
+      case '}': return Lexer_make_token(lexer, TOKEN_CLOSE_BRACE, 1);
+      case ';': return Lexer_make_token(lexer, TOKEN_SEMICOLON, 1);
+      case ':': return Lexer_make_token(lexer, TOKEN_COLON, 1);
+      case '~': return Lexer_make_token(lexer, TOKEN_TILDE, 1);
+      case '?': return Lexer_make_token(lexer, TOKEN_QUESTION, 1);
+      case ',': return Lexer_make_token(lexer, TOKEN_COMMA, 1);
+      case '*': return Lexer_make_token(lexer, TOKEN_STAR, 1);
+      case '%': return Lexer_make_token(lexer, TOKEN_PERCENT, 1);
+
+      case '&': {
+        if (peek(lexer, 1) == '&')
+          return Lexer_make_token(lexer, TOKEN_AND, 2);
+        return Lexer_make_token(lexer, TOKEN_AMPERSAND, 1);
+      }
+
+      case '!': {
+        if (peek(lexer, 1) == '=')
+          return Lexer_make_token(lexer, TOKEN_NEQ, 2);
+        return Lexer_make_token(lexer, TOKEN_EXCLAMATION, 1);
+      }
+
+      case '<': {
+        if (peek(lexer, 1) == '=')
+          return Lexer_make_token(lexer, TOKEN_LEQ, 2);
+        return Lexer_make_token(lexer, TOKEN_LT, 1);
+      }
+
+      case '>': {
+        if (peek(lexer, 1) == '=')
+          return Lexer_make_token(lexer, TOKEN_GEQ, 2);
+        return Lexer_make_token(lexer, TOKEN_GT, 1);
+      }
+
+      case '=': {
+        if (peek(lexer, 1) == '=')
+          return Lexer_make_token(lexer, TOKEN_EQ, 2);
+        return Lexer_make_token(lexer, TOKEN_ASSIGN, 1);
+      }
+
+      case '|': {
+        if (peek(lexer, 1) == '|')
+          return Lexer_make_token(lexer, TOKEN_OR, 2);
+        return Lexer_make_token(lexer, TOKEN_BAR, 1);
+      }
+
+      case '+': {
+        if (peek(lexer, 1) == '+')
+          return Lexer_make_token(lexer, TOKEN_PLUSPLUS, 2);
+        if (peek(lexer, 1) == '=')
+          return Lexer_make_token(lexer, TOKEN_PLUSEQUALS, 2);
+        return Lexer_make_token(lexer, TOKEN_PLUS, 1);
+      }
+
+      case '-': {
+        if (peek(lexer, 1) == '-')
+          return Lexer_make_token(lexer, TOKEN_MINUSMINUS, 2);
+        if (peek(lexer, 1) == '=')
+          return Lexer_make_token(lexer, TOKEN_MINUSEQUALS, 2);
+        return Lexer_make_token(lexer, TOKEN_MINUS, 1);
+      }
+
+      case '/': {
+        if (peek(lexer, 1) == '/') {
+          // Line comment: find the terminating newline (or end of input) and
+          // step up to it with advance() so the column stays in sync with
+          // the offset. The newline itself is handled by the '\n' case.
+          i64 end = lexer->pos + 2; // skip the '//'
+          while (end < lexer->len && lexer->src[end] != '\n')
+            end++;
+          advance(lexer, end - lexer->pos);
+          continue;
+        }
+        return Lexer_make_token(lexer, TOKEN_SLASH, 1);
+      }
+
+      default: {
+        // Handle keywords explicitly
+        LEX_KEYWORD("fn", TOKEN_FN);
+        LEX_KEYWORD("if", TOKEN_IF);
+        LEX_KEYWORD("int", TOKEN_INT);
+        LEX_KEYWORD("let", TOKEN_LET);
+        LEX_KEYWORD("for", TOKEN_FOR);
+        LEX_KEYWORD("else", TOKEN_ELSE);
+        LEX_KEYWORD("while", TOKEN_WHILE);
+        LEX_KEYWORD("return", TOKEN_RETURN);
+
+        // <ctype.h> classifiers need an unsigned char value; a negative plain
+        // char would be undefined behaviour.
+        unsigned char c = (unsigned char)lexer->src[lexer->pos];
+
+        if (isdigit(c))
+          return Lexer_lex_number(lexer);
+
+        if (isalpha(c) || c == '_')
+          return Lexer_lex_identifier(lexer);
+
+        if (c == '"')
+          return Lexer_lex_string(lexer);
+
+        die_location(Lexer_loc(lexer), ": ERROR: Unexpected character '%c'\n", lexer->src[lexer->pos]);
+        // Only reached if die_location returns: step over the bad character
+        // so the loop makes progress.
+        advance(lexer, 1);
+      }
+    }
+  }
+
+  return Token_from_type(TOKEN_EOF, Lexer_loc(lexer));
+}
+
+/*
+ * Return the token Lexer_next would produce, without consuming it: the
+ * lexer state is snapshotted, Lexer_next is run in full (including any
+ * allocation it performs for the token's text), and the state is restored.
+ */
+Token Lexer_peek(Lexer *lexer)
+{
+  Lexer saved = *lexer;
+  Token upcoming = Lexer_next(lexer);
+  *lexer = saved;
+  return upcoming;
+}
\ No newline at end of file