diff --git a/compiler/common.hpp b/compiler/common.hpp index 03dcbfa..5d28d0b 100644 --- a/compiler/common.hpp +++ b/compiler/common.hpp @@ -26,13 +26,14 @@ namespace lmx { std::cerr << "Failed to open file " << path << std::endl; return {}; } - return std::string(std::istreambuf_iterator{file}, std::istreambuf_iterator{}); + return {std::istreambuf_iterator{file}, std::istreambuf_iterator{}}; } inline std::shared_ptr compile(const std::string &path) { std::string code = read_file(path); if (code.empty()) return nullptr; Lexer lexer(code); auto tks = lexer.tokenize(code); + if (lexer.has_err) return nullptr; Parser parser(tks); if (auto node = parser.parse_program(); node && !parser.has_error()) return node; diff --git a/compiler/lexer.cpp b/compiler/lexer.cpp index d7d9b57..c17e416 100644 --- a/compiler/lexer.cpp +++ b/compiler/lexer.cpp @@ -3,127 +3,104 @@ // #include "lexer.hpp" +#include "../tools/lm/debug.hpp" +#include +#include #include #include +#include namespace lmx { std::ostream& operator<<(std::ostream& os, const Token& t) { - os << "Token("; - switch (t.type) { - case TokenType::END_OF_FILE: os << "END_OF_FILE"; break; - case TokenType::IDENTIFIER: os << "IDENTIFIER"; break; - case TokenType::NUM_LITERAL: os << "INT_LITERAL"; break; - case TokenType::STRING_LITERAL: os << "STRING_LITERAL"; break; - case TokenType::COMMA: os << "COMMA"; break; - case TokenType::TRUE_LITERAL: os << "TRUE_LITERAL"; break; - case TokenType::FALSE_LITERAL: os << "FALSE_LITERAL"; break; - case TokenType::OPER_PLUS: os << "OPER_PLUS"; break; - case TokenType::OPER_MINUS: os << "OPER_MINUS"; break; - case TokenType::OPER_MUL: os << "OPER_MUL"; break; - case TokenType::OPER_DIV: os << "OPER_DIV"; break; - case TokenType::OPER_MOD: os << "OPER_MOD"; break; - case TokenType::EQ: os << "EQ"; break; - case TokenType::GE: os << "GE"; break; - case TokenType::GT: os << "GT"; break; - case TokenType::LE: os << "LE"; break; - case TokenType::LT: os << "LT"; break; - case TokenType::COLON: os << "COLON"; break; - case TokenType::COL_COLON: os << "COL_COLON"; break; - case TokenType::OPER_POW: os << "OPER_POW"; break; - case TokenType::ASSIGN: os << "ASSIGN"; break; - case TokenType::NOT: os << "NOT"; break; - case TokenType::NE: os << "NE"; break; - case TokenType::LPAREN: os << "LPAREN"; break; - case TokenType::RPAREN: os << "RPAREN"; break; - case TokenType::LBRACK: os << "LBRACK"; break; - case TokenType::RBRACK: os << "RBRACK"; break; - case TokenType::LBRACE: os << "LBRACE"; break; - case TokenType::RBRACE: os << "RBRACE"; break; - - case TokenType::UNKNOWN: os << "UNKNOWN"; break; - case TokenType::KW_FUNC: os << "KEYWORD_FUNC"; break; - case TokenType::KW_RETURN: os << "KEYWORD_RETURN"; break; - default: os << "UNKNOWN"; - } - os << ", " << t.text << ", " << t.line << ", " << t.col << ')'; + os << "Token(" << to_string(t.type) + << ", " << t.text << ", " << t.line << ", " << t.col << ')'; + LOG(ITIS(t.text) << ", " << ITIS(t.line) << ", " << ITIS(t.col)); return os; } void Lexer::advance() { pos++; - if (src[pos] == '\n') { + if (content[pos] == '\n') { line++; col = 1; } else col++; } +bool Lexer::valid_pos() const { + return pos < content.size(); +} + Token Lexer::next() { - while (isspace(src[pos])) { + while (isspace(content[pos])) { advance(); + LOG("advance!"); + } + if (pos >= content.size()) { + LOG("Directly EOF!"); + return {TokenType::END_OF_FILE,"", line, col}; } - if (pos >= src.size()) return {TokenType::END_OF_FILE,"", line, col}; - switch (src[pos]) { + switch (content[pos]) { case '+': { advance(); - return {TokenType::OPER_PLUS, "+", line, col}; + return {TokenType::OPER_PLUS, "+", line, col - 1}; } case '-': { advance(); - return {TokenType::OPER_MINUS, "-", line, col}; + return {TokenType::OPER_MINUS, "-", line, col - 1}; } case '*': { advance(); - return {TokenType::OPER_MUL, "*", line, col}; + return {TokenType::OPER_MUL, "*", line, col - 1}; } case '/': { advance(); - return {TokenType::OPER_DIV, "/", line, col}; + return {TokenType::OPER_DIV, "/", line, col - 1}; } case '%': { advance(); - return {TokenType::OPER_MOD, "%", line, col}; + return {TokenType::OPER_MOD, "%", line, col - 1}; } case '=': { advance(); - if (src[pos] == '=') { + if (content[pos] == '=') { advance(); - return {TokenType::EQ, "==", line, col}; + return {TokenType::EQ, "==", line, col - 1}; } - return {TokenType::ASSIGN, "=", line, col}; + return {TokenType::ASSIGN, "=", line, col - 1}; } case '>': { advance(); - if (src[pos] == '=') { + if (content[pos] == '=') { advance(); - return {TokenType::GE, ">=", line, col}; + return {TokenType::GE, ">=", line, col - 1}; } - return {TokenType::GT, ">", line, col}; + return {TokenType::GT, ">", line, col - 1}; } case '<': { advance(); - if (src[pos] == '=') { + if (content[pos] == '=') { advance(); - return {TokenType::LE, "<=", line, col}; + return {TokenType::LE, "<=", line, col - 1}; } - return {TokenType::LT, "<", line, col}; + return {TokenType::LT, "<", line, col - 1}; } case ':': { advance(); - if (src[pos] == ':') { + if (content[pos] == ':') { advance(); - return {TokenType::COL_COLON, "::", line, col}; + return {TokenType::COL_COLON, "::", line, col - 1}; } - return {TokenType::COLON, ":", line, col}; + return {TokenType::COLON, ":", line, col - 1}; } case '^': { advance(); - return {TokenType::OPER_POW, "^", line, col}; + return {TokenType::OPER_POW, "^", line, col - 1}; } case '#': { - while (pos <= src.size() && src[pos] != '\n' ) + while (pos <= content.size() && content[pos] != '\n' ) advance(); advance(); return {TokenType::COMMENT, {}, line, col}; @@ -131,10 +108,13 @@ Token Lexer::next() { case '"': { advance(); std::string str; - while (src[pos] != '"') { - if (src[pos] == '\\') { + if (!valid_pos()) return {TokenType::UNKNOWN, "", line, col - 1}; + + while (valid_pos() && content[pos] != '"') { + if (content[pos] == '\\') { advance(); - switch (src[pos]) { + if (!valid_pos()) return {TokenType::UNKNOWN, "", line, col - 1}; + switch (content[pos]) { case 'n': str += '\n'; break; case 't': str += '\t'; break; case 'r': str += '\r'; break; @@ -142,97 +122,100 @@ Token Lexer::next() { case 'f': str += '\f'; break; case 'v': str += '\v'; break; case '0': str += '\0'; break; - default: str += src[pos]; break; + default: str += content[pos]; break; } advance(); continue; } - str += src[pos]; + str += content[pos]; advance(); } - advance(); + if (!valid_pos() || content[pos] != '"') + return {TokenType::UNKNOWN, str, line, col - str.size() - 1}; - return {TokenType::STRING_LITERAL, str, line, col - str.size()}; + advance(); + LOG("Content of the STRING_LITERAL: " << str); + return {TokenType::STRING_LITERAL, str, line, col - str.size() - 1}; } case '(': { advance(); - return {TokenType::LPAREN, "(", line, col}; + return {TokenType::LPAREN, "(", line, col - 1}; } case ')': { advance(); - return {TokenType::RPAREN, ")", line, col}; + return {TokenType::RPAREN, ")", line, col - 1}; } case '{': { advance(); - return {TokenType::LBRACE, "{", line, col}; + return {TokenType::LBRACE, "{", line, col - 1}; } case '}': { advance(); - return {TokenType::RBRACE, "}", line, col}; + return {TokenType::RBRACE, "}", line, col - 1}; } case '[': { advance(); - return {TokenType::LBRACK, "[", line, col}; + return {TokenType::LBRACK, "[", line, col - 1}; } case ']': { advance(); - return {TokenType::RBRACK, "]", line, col}; + return {TokenType::RBRACK, "]", line, col - 1}; } case ',': { advance(); - return {TokenType::COMMA, ", ", line, col}; + return {TokenType::COMMA, ", ", line, col - 1}; } case '!': { advance(); - if (src[pos] == '=') { + if (content[pos] == '=') { advance(); - return {TokenType::NE, "!=", line, col}; + return {TokenType::NE, "!=", line, col - 1}; } - return {TokenType::NOT, "!", line, col}; + return {TokenType::NOT, "!", line, col - 1}; } case '|': { advance(); - if (src[pos] == '>') { + if (content[pos] == '>') { advance(); - return {TokenType::PIPE, "|>", line, col}; + return {TokenType::PIPE, "|>", line, col - 1}; } - if (src[pos] == '|') { + if (content[pos] == '|') { advance(); - return {TokenType::OR, "||", line, col}; + return {TokenType::OR, "||", line, col - 1}; } - return {TokenType::UNKNOWN, std::string(1, src[pos]), line, col}; + return {TokenType::UNKNOWN, std::string(1, content[pos]), line, col - 1}; } case '&': { advance(); - if (src[pos] == '&') { + if (content[pos] == '&') { advance(); - return {TokenType::AND, "&&", line, col}; + return {TokenType::AND, "&&", line, col - 1}; } - return {TokenType::UNKNOWN, std::string(1, src[pos]), line, col}; + return {TokenType::UNKNOWN, std::string(1, content[pos]), line, col - 1}; } case '.': { advance(); - return {TokenType::DOT, ".", line, col}; + return {TokenType::DOT, ".", line, col - 1}; } default: { - if (isdigit(src[pos])) { - auto cur_line = line, cur_col = col; + if (isdigit(content[pos])) { + const auto cur_line = line, cur_col = col; std::string num; - while (isdigit(src[pos]) || src[pos] == '_') { - if (src[pos] == '_') { + while (isdigit(content[pos]) || content[pos] == '_') { + if (content[pos] == '_') { advance(); continue; } - num += src[pos]; + num += content[pos]; advance(); } return {TokenType::NUM_LITERAL, num, cur_line, cur_col}; } - if (isalpha(src[pos]) || src[pos] == '_') { + if (isalpha(content[pos]) || content[pos] == '_') { std::string id; auto cur_line = line, cur_col = col; - while (isalnum(src[pos])|| src[pos] == '_') { - id += src[pos]; + while (isalnum(content[pos])|| content[pos] == '_') { + id += content[pos]; advance(); } static const std::unordered_map keywords = { @@ -256,23 +239,79 @@ Token Lexer::next() { } } - auto token = Token{TokenType::UNKNOWN, std::string(1, src[pos]), line, col}; + auto token = Token{TokenType::UNKNOWN, std::string(1, content[pos]), line, col}; advance(); + LOG("Will be UNKNOWN: Token: " << ITIS(token.col) << ", " << ITIS(token.line)); return token; } -std::vector Lexer::tokenize(const std::string& new_src) { - src = new_src; +std::vector Lexer::tokenize(const std::string& code) { + content = code; + has_err = false; pos = 0; - line = 1; + LOG(ITIS(line)); + const auto orig_line = line; + line += [&]() -> size_t { + if (content.empty()) return 0; + return std::ranges::count(content, '\n') + 1; + }(); + LOG("now: " << ITIS(line) << ", " << ITIS(orig_line) << ", " << ITIS(col)); col = 1; std::vector tokens; - while (pos < src.size()) { + while (pos < content.size()) { tokens.push_back(next()); + LOG("Pushing..."); } - // Add EOF token at the end - tokens.push_back({TokenType::END_OF_FILE, "", line, col}); - return tokens; + if (tokens.empty() || tokens.back().type != TokenType::END_OF_FILE) tokens.push_back({TokenType::END_OF_FILE, "", line, col}); + for ([[maybe_unused]] auto const &token : tokens) { + LOG(ITIS(token)); + } + const std::string res = error(tokens, orig_line); + if (res.empty()) return tokens; + LOG("Error!"); + has_err = true; + std::cerr << res << std::endl; + return {}; +} + +std::string Lexer::error(const std::vector& tokens, const size_t origin_lineno) { + std::string k; + + std::unordered_map> line_errors; + for (const auto& token : tokens) { + if (token.type == TokenType::UNKNOWN) { + line_errors[token.line].push_back(token.col); + } + } + + for (const auto& [lineno, cols] : line_errors) { + std::string line_content = [&]{ + std::istringstream iss(content); + std::string line_con; + for (size_t i = 0; i < lineno - origin_lineno - 1; i++) { + if (!std::getline(iss, line_con)) return std::string(""); + } + std::getline(iss, line_con); + return line_con; + }(); + + auto caret_line = [&]{ + if (cols.empty()) return std::string{}; + std::string str(cols.back() + 1, ' '); + for (const size_t col : cols) str[col - 1] = '^'; + return str; + }(); + + k += std::format( + "In line {}, file {}:\n>>> {}\n {}\n", + lineno, + filename, + line_content, + caret_line + ); + } + + return k; } } diff --git a/compiler/lexer.hpp b/compiler/lexer.hpp index 995481e..1875d35 100644 --- a/compiler/lexer.hpp +++ b/compiler/lexer.hpp @@ -4,12 +4,15 @@ #pragma once #include +#include #include #include "../include/lmx_export.hpp" +#include + namespace lmx { -enum class /*LMC_API*/ TokenType { +enum class TokenType { END_OF_FILE, OPER_PLUS, OPER_MINUS, OPER_MUL, OPER_DIV, OPER_MOD, OPER_POW, @@ -33,6 +36,44 @@ enum class /*LMC_API*/ TokenType { COMMENT, }; +inline std::string to_string(const TokenType& type) { + switch (type) { + case TokenType::END_OF_FILE: return "END_OF_FILE"; + case TokenType::IDENTIFIER: return "IDENTIFIER"; + case TokenType::NUM_LITERAL: return "INT_LITERAL"; + case TokenType::STRING_LITERAL: return "STRING_LITERAL"; + case TokenType::COMMA: return "COMMA"; + case TokenType::TRUE_LITERAL: return "TRUE_LITERAL"; + case TokenType::FALSE_LITERAL: return "FALSE_LITERAL"; + case TokenType::OPER_PLUS: return "OPER_PLUS"; + case TokenType::OPER_MINUS: return "OPER_MINUS"; + case TokenType::OPER_MUL: return "OPER_MUL"; + case TokenType::OPER_DIV: return "OPER_DIV"; + case TokenType::OPER_MOD: return "OPER_MOD"; + case TokenType::EQ: return "EQ"; + case TokenType::GE: return "GE"; + case TokenType::GT: return "GT"; + case TokenType::LE: return "LE"; + case TokenType::LT: return "LT"; + case TokenType::COLON: return "COLON"; + case TokenType::COL_COLON: return "COL_COLON"; + case TokenType::OPER_POW: return "OPER_POW"; + case TokenType::ASSIGN: return "ASSIGN"; + case TokenType::NOT: return "NOT"; + case TokenType::NE: return "NE"; + case TokenType::LPAREN: return "LPAREN"; + case TokenType::RPAREN: return "RPAREN"; + case TokenType::LBRACK: return "LBRACK"; + case TokenType::RBRACK: return "RBRACK"; + case TokenType::LBRACE: return "LBRACE"; + case TokenType::RBRACE: return "RBRACE"; + case TokenType::UNKNOWN: return "UNKNOWN"; + case TokenType::KW_FUNC: return "KEYWORD_FUNC"; + case TokenType::KW_RETURN: return "KEYWORD_RETURN"; + default: return "_NOT_IMPLEMENTED"; + } +} + struct LMC_API Token { TokenType type; std::string text; @@ -43,14 +84,36 @@ struct LMC_API Token { class LMC_API Lexer { size_t pos{0}, line{1}, col{1}; + std::string& content, filename; + Token next(); void advance(); - std::string& src; + [[nodiscard]] bool valid_pos() const; - Token next(); public: - explicit Lexer(std::string& src): src(src) {}; - std::vector tokenize(const std::string &new_src); + explicit Lexer(std::string& code, std::string filename = ""): content(code), filename(std::move(filename)) {} + std::string error(const std::vector& tokens, size_t origin_lineno); + std::vector tokenize(const std::string &code); + + bool has_err{false}; }; -} \ No newline at end of file +} + +template<> +struct std::formatter { + static constexpr auto parse(const format_parse_context& ctx) { + return ctx.begin(); + } + + static auto format(const lmx::Token& token, format_context& ctx) { + return format_to( + ctx.out(), + "Token({}, '{}', {}, {})", + to_string(token.type), + token.text, + token.line, + token.col + ); + } +}; \ No newline at end of file diff --git a/tools/lm/common/file_run.cpp b/tools/lm/common/file_run.cpp index 10f7587..7e6fdf5 100644 --- a/tools/lm/common/file_run.cpp +++ b/tools/lm/common/file_run.cpp @@ -30,20 +30,21 @@ int file_run(const std::string& file_name) { if (magic == LMX_MAGIC_NUM) return binary_run(std::move(file)); file.seekg(0, std::ios::beg); auto src = std::string(std::istreambuf_iterator(file), std::istreambuf_iterator()); - lmx::Lexer lexer(src); + lmx::Lexer lexer(src, file_name); auto ts = lexer.tokenize(src); + if (lexer.has_err) return -1; lmx::Parser parser(ts, src, file_name); - lmx::Generator gener; + lmx::Generator generator; std::shared_ptr node; node = parser.parse_program(); if (!node || parser.has_error()) return -1; - gener.gen(node); - if (lmx::Generator::node_has_error)return -1; - gener.ops.emplace_back(lmx::runtime::Opcode::HALT); - gener.write_binary_file(file_name); + generator.gen(node); + if (lmx::Generator::node_has_error) return -1; + generator.ops.emplace_back(lmx::runtime::Opcode::HALT); + generator.write_binary_file(file_name); lmx::runtime::VirtualCore vm; - vm.set_program(&gener.ops); - vm.set_constant(gener.constant_pool.data()); + vm.set_program(&generator.ops); + vm.set_constant(generator.constant_pool.data()); return vm.run(); } diff --git a/tools/lm/common/repl.cpp b/tools/lm/common/repl.cpp index ad4cfaf..6942627 100644 --- a/tools/lm/common/repl.cpp +++ b/tools/lm/common/repl.cpp @@ -6,16 +6,18 @@ #include "../compiler/generator/generator.hpp" #include "../compiler/generator/emit.hpp" #include "../runtime/vm.hpp" +#include "../tools/lm/debug.hpp" int run_repl() { std::string input; - lmx::Lexer l(input); + lmx::Lexer l(input, ""); lmx::Generator generator; lmx::runtime::VirtualCore core; core.set_program(&generator.ops); const std::string prompt = "\033[35m>>> \033[0m"; while (true) { + LOG("Getting input"); std::cout << prompt << std::flush; if (!std::getline(std::cin, input)) break; if (input == ":lastret") std::cout << core.look_register(0) << std::endl; @@ -24,6 +26,7 @@ int run_repl() { else if (input == ":vars") generator.print_vars(); else { auto tks = l.tokenize(input); + if (l.has_err) continue; lmx::Parser parser(tks, input, ""); auto node = parser.parse();