From 3211280414faa41ccaed98cb6400554240ed2ff5 Mon Sep 17 00:00:00 2001 From: ahmedsamyh Date: Fri, 28 Feb 2025 11:08:50 +0500 Subject: [PATCH] [main.c] Can lex numbers, comments, some singlechar tokens... - WIP: Lex operators. --- include/commonlib.h | 23 +++ main.c | 338 +++++++++++++++++++++++++++++++++++++++++--- main.momo | 31 +++- 3 files changed, 370 insertions(+), 22 deletions(-) diff --git a/include/commonlib.h b/include/commonlib.h index 12a63c6..916ac3c 100644 --- a/include/commonlib.h +++ b/include/commonlib.h @@ -50,6 +50,7 @@ #define sv_from_cstr c_sv_from_cstr #define sv_lpop c_sv_lpop #define sv_lpop_until_predicate c_sv_lpop_until_predicate +#define sv_lpop_until_string c_sv_lpop_until_string #define sv_rpop_until_predicate c_sv_rpop_until_predicate #define sv_lpop_until_char c_sv_lpop_until_char #define sv_rpop_until_char c_sv_rpop_until_char @@ -264,6 +265,7 @@ void c_sv_print_dumb(c_String_view sv); c_String_view c_sv_from_cstr(const char* cstr); // Actually just use SV(cstr) macro... c_String_view c_sv_lpop(c_String_view* sv, uint32 n); c_String_view c_sv_lpop_until_predicate(c_String_view* sv, int(*predicate)(int)); +c_String_view c_sv_lpop_until_string(c_String_view* sv, const char *string); c_String_view c_sv_rpop_until_predicate(c_String_view* sv, int(*predicate)(int)); c_String_view c_sv_lpop_until_char(c_String_view* sv, char ch); c_String_view c_sv_rpop_until_char(c_String_view* sv, char ch); @@ -486,6 +488,27 @@ c_String_view c_sv_lpop_until_predicate(c_String_view* sv, int(*predicate)(int)) }; } +c_String_view c_sv_lpop_until_string(c_String_view* sv, const char *string) { + size_t string_len = strlen(string); + + char *old_sv_data = sv->data; + + while (sv->count > string_len) { + bool matched = true; + for (size_t i = 0; i < string_len; ++i) { + if (sv->data[i] != string[i]) matched = false; + } + if (matched) break; + sv->data++; + sv->count--; + } + + return (c_String_view) { + .data = old_sv_data, + .count = (sv->data - old_sv_data), + }; +} + c_String_view c_sv_rpop_until_predicate(c_String_view* sv, int(*predicate)(int)){ size_t old_sv_count = sv->count; while (sv->count > 0 && !predicate(*(sv->data+sv->count-1))){ diff --git a/main.c b/main.c index ff94ca2..fbd0877 100644 --- a/main.c +++ b/main.c @@ -42,18 +42,29 @@ typedef enum { TK_IDENT, TK_KEYWORD, + TK_COMMENT, + TK_MULTILINE_COMMENT, + TK_STRING, TK_LEFT_PAREN, TK_RIGHT_PAREN, TK_MINUS, + TK_MINUS_MINUS, + TK_MINUS_EQUAL, + TK_PLUS, + TK_PLUS_PLUS, + TK_PLUS_EQUAL, TK_RETURNER, TK_LEFT_BRACE, TK_RIGHT_BRACE, - TK_PLUS, TK_DIVIDE, + TK_DIVIDE_EQUAL, TK_MULTIPLY, + TK_MULTIPLY_EQUAL, TK_MODULUS, + TK_MODULUS_EQUAL, + TK_POWER, TK_EQUAL, TK_NOT, TK_NOT_EQUAL, @@ -73,9 +84,12 @@ typedef enum { TK_INT, TK_FLOAT, - TK_BINARY_AND, - TK_BINARY_NOT, - TK_BINARY_OR, + TK_BITWISE_AND, + TK_BITWISE_AND_EQUAL, + TK_BITWISE_NOT, + TK_BITWISE_NOT_EQUAL, + TK_BITWISE_OR, + TK_BITWISE_OR_EQUAL, TK_LOGICAL_AND, TK_LOGICAL_OR, @@ -86,17 +100,27 @@ const char *token_type_as_str(Token_type t) { switch (t) { case TK_IDENT: return "IDENT"; case TK_KEYWORD: return "KEYWORD"; + case TK_COMMENT: return "COMMENT"; + case TK_MULTILINE_COMMENT: return "MULTILINE_COMMENT"; case TK_STRING: return "STRING"; case TK_LEFT_PAREN: return "LEFT_PAREN"; case TK_RIGHT_PAREN: return "RIGHT_PAREN"; case TK_MINUS: return "MINUS"; + case TK_MINUS_MINUS: return "MINUS_MINUS"; + case TK_MINUS_EQUAL: return "MINUS_EQUAL"; + case TK_PLUS: return "PLUS"; + case TK_PLUS_PLUS: return "PLUS_PLUS"; + case TK_PLUS_EQUAL: return "PLUS_EQUAL"; case TK_RETURNER: return "RETURNER"; case TK_LEFT_BRACE: return "LEFT_BRACE"; case TK_RIGHT_BRACE: return "RIGHT_BRACE"; - case TK_PLUS: return "PLUS"; case TK_DIVIDE: return "DIVIDE"; + case TK_DIVIDE_EQUAL: return "DIVIDE_EQUAL"; case TK_MULTIPLY: return "MULTIPLY"; + case TK_MULTIPLY_EQUAL: return "MULTIPLY_EQUAL"; case TK_MODULUS: return "MODULUS"; + case TK_MODULUS_EQUAL: return "MODULUS_EQUAL"; + case TK_POWER: return "POWER"; case TK_EQUAL: return "EQUAL"; case TK_NOT: return "NOT"; case TK_NOT_EQUAL: return "NOT_EQUAL"; @@ -114,9 +138,12 @@ const char *token_type_as_str(Token_type t) { case TK_RIGHT_SQUARE_BRACE: return "RIGHT_SQUARE_BRACE"; case TK_INT: return "INT"; case TK_FLOAT: return "FLOAT"; - case TK_BINARY_AND: return "BINARY_AND"; - case TK_BINARY_NOT: return "BINARY_NOT"; - case TK_BINARY_OR: return "BINARY_OR"; + case TK_BITWISE_AND: return "BITWISE_AND"; + case TK_BITWISE_AND_EQUAL: return "BITWISE_AND_EQUAL"; + case TK_BITWISE_NOT: return "BITWISE_NOT"; + case TK_BITWISE_NOT_EQUAL: return "BITWISE_NOT_EQUAL"; + case TK_BITWISE_OR: return "BITWISE_OR"; + case TK_BITWISE_OR_EQUAL: return "BITWISE_OR_EQUAL"; case TK_LOGICAL_AND: return "LOGICAL_AND"; case TK_LOGICAL_OR: return "LOGICAL_OR"; case TK_COUNT: @@ -160,6 +187,13 @@ const char *keywords[] = { // Yes include will be part of the language, not part of a preprocessor "include" + + "return", + "continue", + "switch", + "break", + "case", + "default", }; bool is_keyword(String_view ident) { @@ -169,6 +203,21 @@ bool is_keyword(String_view ident) { return false; } +// TODO: Should be differentiate hex, octal and binary here too? +Token_type number_token_type(String_view number) { + if (sv_contains_char(number, '.')) { + return TK_FLOAT; + } + + return TK_INT; +} + +// NOTE: We assume only multiline comments contain newlines +Token_type comment_token_type(String_view comment) { + if (sv_contains_char(comment, '\n')) return TK_MULTILINE_COMMENT; + return TK_COMMENT; +} + typedef struct { String_view lexeme; Location loc; @@ -229,22 +278,32 @@ char current_char(Lexer *l) { return l->src.data[l->cur]; } +// NOTE: returns 0 if next char is after EOF +char next_char(Lexer *l) { + if (l->cur+1 >= l->src.count) return 0; + return l->src.data[l->cur+1]; +} + char consume_char(Lexer *l) { char ch = current_char(l); l->cur += 1; return ch; } -int ident_predicate(int ch) { +int not_ident_predicate(int ch) { return !(isalpha(ch) || ch == '_'); } +int not_number_predicate(int ch) { + return !isdigit(ch); +} + void consume_ident(Lexer *l, String_view *ident_sv_out, Location *loc_out) { // Identifiers can start with [a-z][A-Z]_ and contain [0-9] after the first char ASSERT(isalpha(current_char(l)) || current_char(l) == '_', "Called consume_identifier() at the wrong character!"); // NOTE: Since sv operations modify the sv String_view src_copy = get_src_copy(l); - *ident_sv_out = sv_lpop_until_predicate(&src_copy, ident_predicate); + *ident_sv_out = sv_lpop_until_predicate(&src_copy, not_ident_predicate); loc_out->filename = l->filename; loc_out->line = l->line; @@ -293,6 +352,93 @@ void consume_single_char(Lexer *l, String_view *sv_out, Location *loc_out) { l->cur += sv_out->count; } +void consume_number(Lexer *l, String_view *sv_out, Location *loc_out) { + ASSERT(isdigit(current_char(l)), "We expect a number bro..."); + + String_view src_copy = get_src_copy(l); + + *sv_out = sv_lpop_until_predicate(&src_copy, not_number_predicate); + + loc_out->filename = l->filename; + loc_out->line = l->line; + loc_out->col = col(l); + + if (src_copy.data[0] == '.') { + String_view dot_sv = sv_lpop(&src_copy, 1); + + /*info("dot_sv: '"SV_FMT"'", SV_ARG(dot_sv));*/ + + String_view float_sv = sv_lpop_until_predicate(&src_copy, not_number_predicate); + + /*info("float_sv: '"SV_FMT"'", SV_ARG(float_sv));*/ + + // NOTE: We can do this because dot_sv and float_sv is right after sv_out! + sv_out->count += dot_sv.count + float_sv.count; + } + + // Advance by the len of sv + l->cur += sv_out->count; +} + +void consume_comment(Lexer *l, String_view *sv_out, Location *loc_out) { + ASSERT(current_char(l) == '/', "We expect a comment to start with '/'..."); + + // 0 means its after EOF + char next = next_char(l); + consume_char(l); + + switch (next) { + case 0: + case '\n': { + Location loc = { + .filename = l->filename, + .line = l->line, + .col = col(l), + }; + compiler_error(loc, "Unterminated comment!"); + exit(1); + } break; + case '/': { + // Eat / + consume_char(l); + + String_view src_copy = get_src_copy(l); + + *sv_out = sv_lpop_until_char(&src_copy, '\n'); + + loc_out->filename = l->filename; + loc_out->line = l->line; + loc_out->col = col(l); + + // Advance by the len of sv + l->cur += sv_out->count; + + } break; + case '*': { + // Eat * + consume_char(l); + + String_view src_copy = get_src_copy(l); + + *sv_out = sv_lpop_until_string(&src_copy, "*/"); + + loc_out->filename = l->filename; + loc_out->line = l->line; + loc_out->col = col(l); + + // Eat */ + consume_char(l); + consume_char(l); + + // Advance by the len of sv + l->cur += sv_out->count; + } break; + default: { + ASSERT(false, "This shouldnt happen; if it did, you fucked up"); + } break; + } +} + void left_trim(Lexer *l) { while (!eof(l) && isspace(current_char(l))) { // TODO: Care about window's \r\n.... @@ -304,6 +450,24 @@ void left_trim(Lexer *l) { } } +#define LEX_N_CHAR_TOKEN(token_type, lexeme_len) \ + t_out->lexeme = (String_view) {\ + .data = &(l->src.data[l->cur]),\ + .count = lexeme_len,\ + };\ + t_out->type = token_type;\ + t_out->loc = (Location) {\ + .filename = l->filename,\ + .line = l->line,\ + .col = col(l),\ + };\ + print_token(stdout, *t_out);\ + putc('\n', stdout);\ + for (int i = 0; i < lexeme_len; ++i) {\ + consume_char(l);\ + }\ + return true + bool next_token(Lexer *l, Token *t_out) { left_trim(l); @@ -324,7 +488,49 @@ bool next_token(Lexer *l, Token *t_out) { return true; } + if (isdigit(ch)) { + String_view number_sv = {0}; + Location number_loc = {0}; + + consume_number(l, &number_sv, &number_loc); + + t_out->lexeme = number_sv; + t_out->loc = number_loc; + t_out->type = number_token_type(number_sv); + print_token(stdout, *t_out); + putc('\n', stdout); + return true; + } + switch (ch) { + case '/': { + // / could be // /**/ or /= + + char next = next_char(l); + + switch (next) { + case '*': + case '/': { + String_view comment_sv = {0}; + Location comment_loc = {0}; + + consume_comment(l, &comment_sv, &comment_loc); + + t_out->lexeme = comment_sv; + t_out->loc = comment_loc; + t_out->type = comment_token_type(comment_sv); + print_token(stdout, *t_out); + putc('\n', stdout); + + return true; + } break; + case '=': { + LEX_N_CHAR_TOKEN(TK_DIVIDE_EQUAL, 2); + } break; + } + + LEX_N_CHAR_TOKEN(TK_DIVIDE, 1); + } break; case '"': { String_view string_sv = {0}; Location string_loc = {0}; @@ -339,19 +545,109 @@ bool next_token(Lexer *l, Token *t_out) { return true; } break; case ':': { - String_view sv = {0}; - Location loc = {0}; - - consume_single_char(l, &sv, &loc); - - t_out->lexeme = sv; - t_out->loc = loc; - t_out->type = TK_COLON; - print_token(stdout, *t_out); - putc('\n', stdout); + LEX_N_CHAR_TOKEN(TK_COLON, 1); + } break; + case '=': { + LEX_N_CHAR_TOKEN(TK_EQUAL, 1); + } break; + case ';': { + LEX_N_CHAR_TOKEN(TK_SEMICOLON, 1); + } break; + case '#': { + LEX_N_CHAR_TOKEN(TK_HASH, 1); + } break; + case '<': { + LEX_N_CHAR_TOKEN(TK_LT, 1); + } break; + case '>': { + LEX_N_CHAR_TOKEN(TK_GT, 1); + } break; + case '(': { + LEX_N_CHAR_TOKEN(TK_LEFT_PAREN, 1); + } break; + case ')': { + LEX_N_CHAR_TOKEN(TK_RIGHT_PAREN, 1); + } break; + case '{': { + LEX_N_CHAR_TOKEN(TK_LEFT_BRACE, 1); + } break; + case '}': { + LEX_N_CHAR_TOKEN(TK_RIGHT_BRACE, 1); + } break; + case '-': { + // - could be --, -= or -> + char next = next_char(l); + + switch (next) { + case '-': { + LEX_N_CHAR_TOKEN(TK_MINUS_MINUS, 2); + } break; + case '=': { + LEX_N_CHAR_TOKEN(TK_MINUS_EQUAL, 2); + } break; + case '>': { + LEX_N_CHAR_TOKEN(TK_RETURNER, 2); + } break; + } + + LEX_N_CHAR_TOKEN(TK_MINUS, 1); + } break; + case '+': { + // + could be ++ or += + char next = next_char(l); + + switch (next) { + case '+': { + LEX_N_CHAR_TOKEN(TK_PLUS_PLUS, 2); + } break; + case '=': { + LEX_N_CHAR_TOKEN(TK_PLUS_EQUAL, 2); + } break; + } + + LEX_N_CHAR_TOKEN(TK_PLUS, 1); + } break; + case '*': { + // * could be ** or *= + char next = next_char(l); + + switch (next) { + case '*': { + LEX_N_CHAR_TOKEN(TK_POWER, 2); + } break; + case '=': { + LEX_N_CHAR_TOKEN(TK_MULTIPLY_EQUAL, 2); + } break; + } + + LEX_N_CHAR_TOKEN(TK_MULTIPLY, 1); + } break; + case '%': { + // % could be %= + char next = next_char(l); - return true; + switch (next) { + case '=': { + LEX_N_CHAR_TOKEN(TK_MODULUS_EQUAL, 2); + } break; + } + LEX_N_CHAR_TOKEN(TK_MODULUS, 1); + } break; + case '&': { + // & could be && or &= + char next = next_char(l); + + switch (next) { + case '&': { + LEX_N_CHAR_TOKEN(TK_LOGICAL_AND, 2); + } break; + case '=': { + LEX_N_CHAR_TOKEN(TK_BITWISE_AND_EQUAL, 2); + } break; + } + + LEX_N_CHAR_TOKEN(TK_BITWISE_AND, 1); } break; // NOTE: Sanity check case ' ': { diff --git a/main.momo b/main.momo index ee884b9..9540d7d 100644 --- a/main.momo +++ b/main.momo @@ -1 +1,30 @@ -int a = 0; +- +-- +-> ++ +++ ++= +/ +/= +* +** +*= +% +%= + +& +&= +^ +^= +~ +~= +| +|= + +! +!= +&& +|| + += +== -- 2.39.5