From: ahmedsamyh Date: Fri, 28 Feb 2025 03:58:06 +0000 (+0500) Subject: [main.c] Can lex strings and keywords. X-Git-Url: https://www.git.momoyon.org/?a=commitdiff_plain;h=6af5b31d270cb5a2f1da6197f34810da85e842cb;p=lang.git [main.c] Can lex strings and keywords. --- diff --git a/main.c b/main.c index cb39307..ff94ca2 100644 --- a/main.c +++ b/main.c @@ -23,6 +23,12 @@ void print_loc(FILE *f, Location loc) { fprintf(f, "%s:%d:%d", loc.filename, loc.line, loc.col); } +#define compiler_error(loc, fmt, ...) do { \ + print_loc(stderr, loc);\ + putc(' ', stderr);\ + error(fmt, ##__VA_ARGS__);\ + } while (0) + typedef struct { // NOTE: src gets data from a heap allocated string!!! String_view src; @@ -33,50 +39,53 @@ typedef struct { } Lexer; typedef enum { - TK_IDENT, - TK_STRING, - - TK_LEFT_PAREN, - TK_RIGHT_PAREN, - TK_MINUS, - TK_RETURNER, - TK_LEFT_BRACE, - TK_RIGHT_BRACE, - TK_PLUS, - TK_DIVIDE, - TK_MULTIPLY, - TK_MODULUS, - TK_EQUAL, - TK_NOT, - TK_NOT_EQUAL, - TK_EQUAL_EQUAL, - TK_GT, - TK_LT, - TK_GTE, - TK_LTE, - TK_COMMA, - TK_COLON, - TK_SEMICOLON, - TK_DOT, - TK_HASH, - TK_LEFT_SQUARE_BRACE, - TK_RIGHT_SQUARE_BRACE, - - TK_INT, - TK_FLOAT, - - TK_BINARY_AND, - TK_BINARY_NOT, - TK_BINARY_OR, - TK_LOGICAL_AND, - TK_LOGICAL_OR, - - TK_COUNT, + TK_IDENT, + TK_KEYWORD, + + TK_STRING, + + TK_LEFT_PAREN, + TK_RIGHT_PAREN, + TK_MINUS, + TK_RETURNER, + TK_LEFT_BRACE, + TK_RIGHT_BRACE, + TK_PLUS, + TK_DIVIDE, + TK_MULTIPLY, + TK_MODULUS, + TK_EQUAL, + TK_NOT, + TK_NOT_EQUAL, + TK_EQUAL_EQUAL, + TK_GT, + TK_LT, + TK_GTE, + TK_LTE, + TK_COMMA, + TK_COLON, + TK_SEMICOLON, + TK_DOT, + TK_HASH, + TK_LEFT_SQUARE_BRACE, + TK_RIGHT_SQUARE_BRACE, + + TK_INT, + TK_FLOAT, + + TK_BINARY_AND, + TK_BINARY_NOT, + TK_BINARY_OR, + TK_LOGICAL_AND, + TK_LOGICAL_OR, + + TK_COUNT, } Token_type; const char *token_type_as_str(Token_type t) { switch (t) { case TK_IDENT: return "IDENT"; + case TK_KEYWORD: return "KEYWORD"; case TK_STRING: return "STRING"; case TK_LEFT_PAREN: return "LEFT_PAREN"; case TK_RIGHT_PAREN: return "RIGHT_PAREN"; @@ -117,6 +126,49 @@ const char *token_type_as_str(Token_type t) { } } +const char *keywords[] = { + "int", + "int8", + "int16", + "int32", + "int64", + + "uint", + "uint8", + "uint16", + "uint32", + "uint64", + + "float", + "float32", + "float64", + "char", + "string", + "bool", + + "if", + "else", + + "for", + "while", + + "fun", + + "enum", + "struct", + "union", + + // Yes include will be part of the language, not part of a preprocessor + "include" +}; + +bool is_keyword(String_view ident) { + for (size_t i = 0; i < ARRAY_LEN(keywords); ++i) { + if (sv_equals(ident, SV(keywords[i]))) return true; + } + return false; +} + typedef struct { String_view lexeme; Location loc; @@ -156,6 +208,14 @@ void free_lexer(Lexer *l) { free(l->src.data); } +String_view get_src_copy(Lexer *l) { + String_view src_copy = { + .data = l->src.data + l->cur, + .count = l->src.count - l->cur, + }; + return src_copy; +} + bool eof(Lexer *l) { return l->cur >= l->src.count; } @@ -183,11 +243,7 @@ void consume_ident(Lexer *l, String_view *ident_sv_out, Location *loc_out) { // Identifiers can start with [a-z][A-Z]_ and contain [0-9] after the first char ASSERT(isalpha(current_char(l)) || current_char(l) == '_', "Called consume_identifier() at the wrong character!"); // NOTE: Since sv operations modify the sv - String_view src_copy = { - .data = l->src.data + l->cur, - .count = l->src.count - l->cur, - }; - + String_view src_copy = get_src_copy(l); *ident_sv_out = sv_lpop_until_predicate(&src_copy, ident_predicate); loc_out->filename = l->filename; @@ -198,6 +254,45 @@ void consume_ident(Lexer *l, String_view *ident_sv_out, Location *loc_out) { l->cur += ident_sv_out->count; } +void consume_string(Lexer *l, String_view *string_sv_out, Location *loc_out) { + ASSERT(current_char(l) == '"', "We except '\"' to be the current_char here..."); + + // Eat " + consume_char(l); + String_view src_copy = get_src_copy(l); + + *string_sv_out = sv_lpop_until_char(&src_copy, '"'); + + loc_out->filename = l->filename; + loc_out->line = l->line; + loc_out->col = col(l); + + // Advance by the len of sv + l->cur += string_sv_out->count; + + if (eof(l)) { + compiler_error(*loc_out, "Unterminated string!"); + exit(1); + } + + // Eat " + consume_char(l); +} + +void consume_single_char(Lexer *l, String_view *sv_out, Location *loc_out) { + + String_view src_copy = get_src_copy(l); + + *sv_out = sv_lpop(&src_copy, 1); + + loc_out->filename = l->filename; + loc_out->line = l->line; + loc_out->col = col(l); + + // Advance by the len of sv + l->cur += sv_out->count; +} + void left_trim(Lexer *l) { while (!eof(l) && isspace(current_char(l))) { // TODO: Care about window's \r\n.... @@ -223,7 +318,7 @@ bool next_token(Lexer *l, Token *t_out) { t_out->lexeme = ident_sv; t_out->loc = ident_loc; - t_out->type = TK_IDENT; + t_out->type = (is_keyword(ident_sv) ? TK_KEYWORD : TK_IDENT); print_token(stdout, *t_out); putc('\n', stdout); return true; @@ -231,6 +326,32 @@ bool next_token(Lexer *l, Token *t_out) { switch (ch) { case '"': { + String_view string_sv = {0}; + Location string_loc = {0}; + consume_string(l, &string_sv, &string_loc); + + t_out->lexeme = string_sv; + t_out->loc = string_loc; + t_out->type = TK_STRING; + print_token(stdout, *t_out); + putc('\n', stdout); + + return true; + } break; + case ':': { + String_view sv = {0}; + Location loc = {0}; + + consume_single_char(l, &sv, &loc); + + t_out->lexeme = sv; + t_out->loc = loc; + t_out->type = TK_COLON; + print_token(stdout, *t_out); + putc('\n', stdout); + + return true; + } break; // NOTE: Sanity check case ' ': {