From: momoyon
Date: Wed, 28 May 2025 19:23:46 +0000 (+0500)
Subject: [main.c] Refactor line lexing and error reporting while lexing.
X-Git-Url: https://www.git.momoyon.org/?a=commitdiff_plain;h=d19a079bd430e5f2716124131618770faf43b277;p=lang.git

[main.c] Refactor line lexing and error reporting while lexing.
---

diff --git a/main.c b/main.c
index c7aa755..e6d8a25 100644
--- a/main.c
+++ b/main.c
@@ -123,7 +123,7 @@ static bool DEBUG_PRINT = false;
 /// NOTE: Location
 typedef struct {
     const char *filename;
-    int line;
+    int line; // NOTE: Lines start from 1!!!!!!!
     int col;
 } Location;

@@ -131,6 +131,15 @@ void print_loc(FILE *f, Location loc);

 ///

+/// NOTE: Error
+#define ERROR_BUF_CAP (1024)
+typedef struct {
+    char buf[ERROR_BUF_CAP];
+    Location loc;
+} Error;
+
+///
+
 /// NOTE: Token

 typedef enum {
@@ -225,8 +234,8 @@ typedef struct {
 /// NOTE: Lexer

 typedef struct {
-    size_t offset;
-    size_t count;
+    size_t bol;
+    size_t eol;
 } Line;

 typedef struct {
@@ -240,26 +249,27 @@ typedef struct {
     String_view src;
     size_t cur;
     size_t bol; // Beginning of Line
-    size_t line;
+    size_t current_line;
     Lines lines;
     const char *filename;
 } Lexer;

 Lexer make_lexer(const char *filename);
 Tokens lex(Lexer *l);
-bool next_token(Lexer *l, Token *t_out);
+void init_lexer_lines(Lexer *l);
+bool next_token(Lexer *l, Token *t_out, Error *err);
 String_view get_src_copy(Lexer *l);
 bool eof(Lexer *l);
 int col(Lexer *l);
 char current_char(Lexer *l);
 char next_char(Lexer *l);
 char consume_char(Lexer *l);
-void consume_ident(Lexer *l, String_view *ident_sv_out, Location *loc_out);
-void consume_string(Lexer *l, String_view *string_sv_out, Location *loc_out);
-void consume_character(Lexer *l, String_view *char_sv_out, Location *loc_out);
-void consume_single_char(Lexer *l, String_view *sv_out, Location *loc_out);
-void consume_number(Lexer *l, String_view *sv_out, Location *loc_out);
-void consume_comment(Lexer *l, String_view *sv_out, Location *loc_out);
+bool consume_ident(Lexer *l, String_view *ident_sv_out, Location *loc_out, Error *err);
+bool consume_string(Lexer *l, String_view *string_sv_out, Location *loc_out, Error *err);
+bool consume_character(Lexer *l, String_view *char_sv_out, Location *loc_out, Error *err);
+bool consume_single_char(Lexer *l, String_view *sv_out, Location *loc_out, Error *err);
+bool consume_number(Lexer *l, String_view *sv_out, Location *loc_out, Error *err);
+bool consume_comment(Lexer *l, String_view *sv_out, Location *loc_out, Error *err);
 void left_trim(Lexer *l);
 void free_lexer(Lexer *l);

@@ -488,7 +498,7 @@ void print_loc(FILE *f, Location loc) {
     putc(' ', stderr);\
     ASSERT(0 <= ((loc).line-1) && (size_t)((loc).line-1) <= (lexer).lines.count-1, "Should be in range");\
     Line line = (lexer).lines.items[(loc).line-1];\
-    String_view line_sv = sv_get_part((lexer).src, line.offset, line.offset + line.count);\
+    String_view line_sv = sv_get_part((lexer).src, line.bol, line.eol);\
     error(fmt, ##__VA_ARGS__);\
     printf(SV_FMT"\n", SV_ARG(line_sv));\
     printf("%*s^\n", (loc).col, "");\
@@ -1288,6 +1298,17 @@ AST *parse(Arena *arena, Parser *p) {
 Lexer make_lexer(const char *filename) {
     int file_size = -1;
     const char *buf = read_file(filename, &file_size);
+
+    int cr_count = 0;
+    for (int i = 0; i < file_size; ++i) {
+        if (buf[i] == '\r') {
+            cr_count++;
+        }
+    }
+    if (cr_count > 0) {
+        log_warning("%d Carriage Returns found!", cr_count);
+        exit(1);
+    }
     if (file_size == -1) {
         exit(1);
     }
@@ -1295,7 +1316,7 @@ Lexer make_lexer(const char *filename) {
         .src = sv_from_cstr(buf),
         .cur = 0,
         .bol = 0,
-        .line = 1,
+        .current_line = 1,
         .filename = filename,
     };

@@ -1377,7 +1398,8 @@ int not_number_or_ident_predicate(int ch) {
     return !(isalpha(ch) || ch == '_' || isdigit(ch));
 }

-void consume_ident(Lexer *l, String_view *ident_sv_out, Location *loc_out) {
+bool consume_ident(Lexer *l, String_view *ident_sv_out, Location *loc_out, Error *err) {
+    (void)err;
     // Identifiers can start with [a-z][A-Z]_ and contain [0-9] after the first char
     ASSERT(isalpha(current_char(l)) || current_char(l) == '_', "Called consume_identifier() at the wrong character!");
     // NOTE: Since sv operations modify the sv
@@ -1390,16 +1412,17 @@ void consume_ident(Lexer *l, String_view *ident_sv_out, Location *loc_out) {
         ident_sv_out->count += rest_of_the_ident.count;
     }

-    loc_out->filename = l->filename;
-    loc_out->line = l->line;
+    loc_out->line = l->current_line;
     loc_out->col = col(l);

     // Advance by the len of ident
     l->cur += ident_sv_out->count;
+
+    return true;
 }

-void consume_string(Lexer *l, String_view *string_sv_out, Location *loc_out) {
+bool consume_string(Lexer *l, String_view *string_sv_out, Location *loc_out, Error *err) {
     ASSERT(current_char(l) == '"', "We except '\"' to be the current_char here...");

     // Eat "
@@ -1409,22 +1432,25 @@ void consume_string(Lexer *l, String_view *string_sv_out, Location *loc_out) {
     *string_sv_out = sv_lpop_until_char(&src_copy, '"');

     loc_out->filename = l->filename;
-    loc_out->line = l->line;
+    loc_out->line = l->current_line;
     loc_out->col = col(l);

     // Advance by the len of sv
     l->cur += string_sv_out->count;

     if (eof(l)) {
-        error_pretty((*loc_out), (*l), "Unterminated string!");
-        exit(1);
+        err->loc = *loc_out;
+        stbsp_snprintf(err->buf, ERROR_BUF_CAP, "Unterminated String");
+        return false;
     }

     // Eat "
     consume_char(l);
+
+    return true;
 }

-void consume_character(Lexer *l, String_view *char_sv_out, Location *loc_out) {
+bool consume_character(Lexer *l, String_view *char_sv_out, Location *loc_out, Error *err) {
     ASSERT(current_char(l) == '\'', "We except '\'' to be the current_char here...");

     // Eat '
@@ -1433,41 +1459,53 @@ void consume_character(Lexer *l, String_view *char_sv_out, Location *loc_out) {
     consume_char(l);

     String_view src_copy = get_src_copy(l);

     loc_out->filename = l->filename;
-    loc_out->line = l->line;
+    loc_out->line = l->current_line;
     loc_out->col = col(l);

     *char_sv_out = sv_lpop(&src_copy, 1);
     l->cur += 1;

-    if (current_char(l) != '\'') {
-        error_pretty(*loc_out, *l, "Expected `'`, but got `%ch`", current_char(l));
-        exit(1);
-    }
     if (eof(l)) {
-        error_pretty(*loc_out, *l, "Unterminated char!");
-        exit(1);
+        err->loc = *loc_out;
+        stbsp_snprintf(err->buf, ERROR_BUF_CAP, "Unterminated char!");
+        return false;
+    }
+    if (current_char(l) != '\'') {
+        err->loc = *loc_out;
+        char c = current_char(l);
+        if (c == '\n')
+            stbsp_snprintf(err->buf, ERROR_BUF_CAP, "Expected `'`, but got `EOF`");
+        else
+            stbsp_snprintf(err->buf, ERROR_BUF_CAP, "Expected `'`, but got `%c`", current_char(l));
+        return false;
     }
+
     // Eat '
     consume_char(l);
-}

-void consume_single_char(Lexer *l, String_view *sv_out, Location *loc_out) {
+    return true;
+}
+bool consume_single_char(Lexer *l, String_view *sv_out, Location *loc_out, Error *err) {
+    (void)err;
     String_view src_copy = get_src_copy(l);

     *sv_out = sv_lpop(&src_copy, 1);

     loc_out->filename = l->filename;
-    loc_out->line = l->line;
+    loc_out->line = l->current_line;
     loc_out->col = col(l);

     // Advance by the len of sv
     l->cur += sv_out->count;
+
+    return true;
 }

-void consume_number(Lexer *l, String_view *sv_out, Location *loc_out) {
+bool consume_number(Lexer *l, String_view *sv_out, Location *loc_out, Error *err) {
+    (void)err;
     ASSERT(isdigit(current_char(l)), "We expect a number bro...");

     String_view src_copy = get_src_copy(l);
@@ -1475,7 +1513,7 @@ void consume_number(Lexer *l, String_view *sv_out, Location *loc_out) {
     *sv_out = sv_lpop_until_predicate(&src_copy, not_number_predicate);

     loc_out->filename = l->filename;
-    loc_out->line = l->line;
+    loc_out->line = l->current_line;
     loc_out->col = col(l);

     if (src_copy.data[0] == '.') {
@@ -1493,9 +1531,11 @@ void consume_number(Lexer *l, String_view *sv_out, Location *loc_out) {

     // Advance by the len of sv
     l->cur += sv_out->count;
+
+    return true;
 }

-void consume_comment(Lexer *l, String_view *sv_out, Location *loc_out) {
+bool consume_comment(Lexer *l, String_view *sv_out, Location *loc_out, Error *err) {
     ASSERT(current_char(l) == '/', "We expect a comment to start with '/'...");

     // 0 means its after EOF
@@ -1507,11 +1547,12 @@ void consume_comment(Lexer *l, String_view *sv_out, Location *loc_out) {
         case '\n': {
             Location loc = {
                 .filename = l->filename,
-                .line = l->line,
+                .line = l->current_line,
                 .col = col(l),
             };
-            error_pretty(loc, *l, "Unterminated comment!");
-            exit(1);
+            err->loc = loc;
+            stbsp_snprintf(err->buf, ERROR_BUF_CAP, "Unterminated comment!");
+            return false;
         } break;
         case '/': {
             // Eat /
@@ -1522,7 +1563,7 @@ void consume_comment(Lexer *l, String_view *sv_out, Location *loc_out) {
             *sv_out = sv_lpop_until_char(&src_copy, '\n');

             loc_out->filename = l->filename;
-            loc_out->line = l->line;
+            loc_out->line = l->current_line;
             loc_out->col = col(l);

             // Advance by the len of sv
@@ -1538,7 +1579,7 @@ void consume_comment(Lexer *l, String_view *sv_out, Location *loc_out) {
             *sv_out = sv_lpop_until_string(&src_copy, "*/");

             loc_out->filename = l->filename;
-            loc_out->line = l->line;
+            loc_out->line = l->current_line;
             loc_out->col = col(l);

             // Eat */
@@ -1552,20 +1593,13 @@ void consume_comment(Lexer *l, String_view *sv_out, Location *loc_out) {
             ASSERT(false, "This shouldnt happen; if it did, you fucked up");
         } break;
     }
+
+    return true;
 }

-// TODO: Somehow remove '\r's because that is causing the col of EOF to be one off...
 void left_trim(Lexer *l) {
     while (!eof(l) && isspace(current_char(l))) {
-        if (current_char(l) == '\n') {
-            Line line = {
-                .offset = l->bol,
-                .count = col(l),
-            };
-            da_append(l->lines, line);
-            l->line += 1;
-            l->bol = l->cur + 1;
-        }
+        if (current_char(l) == '\n') l->current_line++;
         consume_char(l);
     }
 }
@@ -1578,7 +1612,7 @@ void left_trim(Lexer *l) {
     t_out->type = token_type;\
     t_out->loc = (Location) {\
         .filename = l->filename,\
-        .line = l->line,\
+        .line = l->current_line,\
         .col = col(l),\
     };\
     if (DEBUG_PRINT) {\
@@ -1590,7 +1624,42 @@ void left_trim(Lexer *l) {
     }\
     return true

-bool next_token(Lexer *l, Token *t_out) {
+void init_lexer_lines(Lexer *l) {
+    int nl_count = 0;
+    int bol = 0;
+    while (!eof(l)) {
+        char c = consume_char(l);
+        if (c == '\n') {
+            nl_count++;
+            Line line = {
+                .bol = bol,
+                .eol = l->cur-1, // NOTE: l->cur is the _NEXT_ char here because we consume the char above!!!
+            };
+
+            // log_debug("L %zu ~ %zu", line.bol, line.eol);
+            da_append(l->lines, line);
+
+            bol = line.eol + 1;
+        }
+    }
+    // log_debug("Newline count: %d", nl_count);
+    // log_debug("lines count: %zu", l->lines.count);
+
+    // log_debug("SRC: `"SV_FMT"`", SV_ARG(l->src));
+    // for (size_t i = 0; i < l->lines.count; ++i) {
+    //     char eol = l->src.data[l->lines.items[i].eol];
+    //     if (eol != '\n') log_debug("EOL expected but got: %c", eol);
+    //     ASSERT(eol == '\n', "This should be a newline!");
+    //     String_view line_sv = sv_get_part(l->src, l->lines.items[i].bol, l->lines.items[i].eol);
+    //     log_debug("L%zu: `"SV_FMT"`", i, SV_ARG(line_sv));
+    // }
+
+    // NOTE: Reset the cursor so we can lex
+    ASSERT(nl_count == l->lines.count, "Newline count and lines count should match!");
+    l->cur = 0;
+}
+
+bool next_token(Lexer *l, Token *t_out, Error *err) {
     left_trim(l);

     if (eof(l)) return false;
@@ -1600,7 +1669,7 @@ bool next_token(Lexer *l, Token *t_out) {
     if (isalpha(ch) || ch == '_') {
         String_view ident_sv = {0};
         Location ident_loc = {0};
-        consume_ident(l, &ident_sv, &ident_loc);
+        consume_ident(l, &ident_sv, &ident_loc, err);

         t_out->lexeme = ident_sv;
         t_out->loc = ident_loc;
@@ -1621,7 +1690,7 @@ bool next_token(Lexer *l, Token *t_out) {

         String_view number_sv = {0};
         Location number_loc = {0};
-        consume_number(l, &number_sv, &number_loc);
+        consume_number(l, &number_sv, &number_loc, err);

         t_out->lexeme = number_sv;
         t_out->loc = number_loc;
@@ -1645,7 +1714,7 @@ bool next_token(Lexer *l, Token *t_out) {

             String_view comment_sv = {0};
             Location comment_loc = {0};
-            consume_comment(l, &comment_sv, &comment_loc);
+            consume_comment(l, &comment_sv, &comment_loc, err);

             t_out->lexeme = comment_sv;
             t_out->loc = comment_loc;
@@ -1667,7 +1736,7 @@ bool next_token(Lexer *l, Token *t_out) {
         case '"': {
             String_view string_sv = {0};
             Location string_loc = {0};
-            consume_string(l, &string_sv, &string_loc);
+            consume_string(l, &string_sv, &string_loc, err);

             t_out->lexeme = string_sv;
             t_out->loc = string_loc;
@@ -1682,7 +1751,7 @@ bool next_token(Lexer *l, Token *t_out) {
         case '\'': {
             String_view char_sv = {0};
             Location char_loc = {0};
-            consume_character(l, &char_sv, &char_loc);
+            consume_character(l, &char_sv, &char_loc, err);
             t_out->lexeme = char_sv;
             t_out->loc = char_loc;
             t_out->type = TK_CHAR;
@@ -1899,17 +1968,25 @@ bool next_token(Lexer *l, Token *t_out) {
 Tokens lex(Lexer *l) {
     Tokens tokens = {0};
     Token t = {0};
-    while (next_token(l, &t)) {
+    Error err = {0};
+
+    while (next_token(l, &t, &err)) {
         da_append(tokens, t);
     }
+
     Line last_line = l->lines.items[l->lines.count-1];
+    t.loc.col = last_line.eol - last_line.bol;
+    t.loc.line = l->current_line-1;
     t.lexeme = SV("EOF");
     t.loc.filename = l->filename;
-    t.loc.line = l->line-1;
-    t.loc.col = last_line.count;
     t.type = TK_EOF;
     da_append(tokens, t);

+    if (*err.buf != '\0') {
+        error_pretty(err.loc, *l, "%s", err.buf);
+        exit(1);
+    }
+
     return tokens;
 }

@@ -1993,6 +2070,7 @@ int main(int argc, char **argv) {
         return 1;
     }

+    init_lexer_lines(&l);
     Tokens tokens = lex(&l);

     if (dump_tokens) {
@@ -2005,7 +2083,6 @@ int main(int argc, char **argv) {

     Parser p = make_parser(&l, tokens);

-    // TODO: Reallocing arena is causing memory issues
     Arena arena = arena_make(32*1024);

     AST_refs ast_refs = {0};
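
The heart of this patch is the new out-parameter error convention: instead of calling error_pretty() and exit(1) deep inside the consume_* helpers, each helper now fills an Error (a fixed ERROR_BUF_CAP buffer plus a Location) and returns false, and lex() checks the error buffer once after the token loop and reports it through error_pretty(). The snippet below is a minimal, self-contained sketch of that convention, not code from the repository: it uses the standard snprintf() in place of stbsp_snprintf(), and consume_demo(), the file name, and the sample input are made up for illustration.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define ERROR_BUF_CAP (1024)

    /* Same shape as the Location/Error structs added by the patch. */
    typedef struct {
        const char *filename;
        int line; // NOTE: Lines start from 1
        int col;
    } Location;

    typedef struct {
        char buf[ERROR_BUF_CAP];
        Location loc;
    } Error;

    /* Hypothetical consumer: on failure it records the error and returns false
       instead of printing and exiting, like the refactored consume_* helpers. */
    static bool consume_demo(const char *src, Location loc, Error *err) {
        if (strchr(src, '"') == NULL) {
            err->loc = loc;
            snprintf(err->buf, ERROR_BUF_CAP, "Unterminated string!");
            return false;
        }
        return true;
    }

    int main(void) {
        Error err = {0};
        Location loc = { .filename = "example.txt", .line = 1, .col = 5 };

        consume_demo("no closing quote here", loc, &err);

        /* The caller reports once, the same way lex() checks *err.buf != '\0'. */
        if (*err.buf != '\0') {
            fprintf(stderr, "%s:%d:%d: error: %s\n",
                    err.loc.filename, err.loc.line, err.loc.col, err.buf);
            return 1;
        }
        return 0;
    }

Deferring the exit(1) to lex() keeps the helpers free of process-terminating side effects and lets the caller print the offending source line through error_pretty(), which now reads line boundaries from the Lines table built up front by init_lexer_lines() rather than incrementally inside left_trim().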