diff --git a/CMakeLists.txt b/CMakeLists.txt
index 901ac73..718a11b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,6 +80,12 @@ add_library(lib${PROJECT_NAME} SHARED
         src/vm/expr.cc
         src/vm/expr.hh
         src/vm/location.hh
+        src/assembler/lexer.cc
+        src/assembler/lexer.hh
+        src/assembler/assembler.cc
+        src/assembler/assembler.hh
+        src/assembler/as_exceptions.hh
+        src/bytecode/bc_exceptions.hh
 )
 
 target_compile_options(lib${PROJECT_NAME} PRIVATE ${warnings})
@@ -88,6 +94,8 @@ target_compile_options(lib${PROJECT_NAME} PRIVATE ${warnings})
 # tests
 #
 
+enable_testing()
+
 add_executable(${PROJECT_NAME}-bytecode-test src/bytecode/tests.cc)
 target_link_libraries(${PROJECT_NAME}-bytecode-test lib${PROJECT_NAME} gtest_main)
 add_test(NAME tyche_bytecode_test COMMAND ${PROJECT_NAME}-bytecode-test)
@@ -96,6 +104,10 @@ add_executable(${PROJECT_NAME}-vm-test src/vm/tests.cc)
 target_link_libraries(${PROJECT_NAME}-vm-test lib${PROJECT_NAME} gtest_main)
 add_test(NAME tyche_vm_test COMMAND ${PROJECT_NAME}-vm-test)
 
+add_executable(${PROJECT_NAME}-as-test src/assembler/tests.cc)
+target_link_libraries(${PROJECT_NAME}-as-test lib${PROJECT_NAME} gtest_main)
+add_test(NAME tyche_as_test COMMAND ${PROJECT_NAME}-as-test)
+
 #
 # check for leaks
 #
diff --git a/src/assembler/as_exceptions.hh b/src/assembler/as_exceptions.hh
new file mode 100644
index 0000000..9338341
--- /dev/null
+++ b/src/assembler/as_exceptions.hh
@@ -0,0 +1,20 @@
+#ifndef TYCHE_AS_EXCEPTIONS_HH
+#define TYCHE_AS_EXCEPTIONS_HH
+
+#include <stdexcept>
+#include <string>
+
+namespace tyche::as {
+
+// Error raised by the lexer and the assembler. The reported message is
+// "<str> at: line <line>, column: <column>".
+class AssemblyError : public std::runtime_error
+{
+public:
+    explicit AssemblyError(std::string const& str, size_t line, size_t column)
+        : std::runtime_error(str + " at: line " + std::to_string(line) + ", column: " + std::to_string(column)) {}
+};
+
+}
+
+#endif //TYCHE_AS_EXCEPTIONS_HH
diff --git a/src/assembler/assembler.cc b/src/assembler/assembler.cc
new file mode 100644
index 0000000..d189a26
--- /dev/null
+++ 
b/src/assembler/assembler.cc
@@ -0,0 +1,114 @@
+#include "assembler.hh"
+
+#include <optional>
+
+#include "as_exceptions.hh"
+#include "../bytecode/bytecode.hh"
+#include "../vm/instruction.hh"
+
+using namespace std::string_literals;
+
+namespace tyche::as {
+
+// Assembles the whole source into bytecode.
+//
+// `.const` and `.func N` directives select the current section; inside `.const` each
+// line is `INDEX: VALUE` (float or string), inside `.func` each line is
+// `MNEMONIC [INTEGER]`. Throws AssemblyError on malformed input.
+ByteArray Assembler::assemble()
+{
+    bc::BytecodePrototype bp;
+
+    lexer_.reset();
+
+    // starts as None so that input before the first directive is rejected instead of
+    // being matched against an uninitialized section
+    enum class Section { None, Const, Function } section = Section::None;
+    uint32_t function_id = 0;
+
+    for (;;) {
+        Token t = lexer_.ingest();
+        if (t.type == TokenType::Enter)
+            continue;
+
+        if (t.type == TokenType::Directive) {
+            if (std::get<std::string>(t.token) == ".const") {
+                section = Section::Const;
+                expect_token(TokenType::Enter);
+            } else if (std::get<std::string>(t.token) == ".func") {
+                section = Section::Function;
+                Token id = lexer_.ingest();
+                // a negative id would wrap around to a huge unsigned index
+                if (id.type != TokenType::Integer || std::get<int>(id.token) < 0)
+                    throw AssemblyError("Expected a non-negative function id", id.line, id.column);
+                function_id = static_cast<uint32_t>(std::get<int>(id.token));
+                if (function_id >= bp.functions.size())
+                    bp.functions.resize(function_id + 1, { 0, 0 });
+                expect_token(TokenType::Enter);
+            } else {
+                throw AssemblyError("Invalid directive " + std::get<std::string>(t.token), t.line, t.column);
+            }
+
+        } else if (section == Section::Const && t.type == TokenType::Integer) {
+            if (std::get<int>(t.token) < 0)
+                throw AssemblyError("Constant index must not be negative", t.line, t.column);
+            auto const index = static_cast<size_t>(std::get<int>(t.token));
+            if (index >= bp.constants.size())
+                bp.constants.resize(index + 1);
+            expect_token(TokenType::Colon);
+            Token tt = lexer_.ingest();
+            if (tt.type == TokenType::Float)
+                bp.constants[index] = std::get<float>(tt.token);
+            else if (tt.type == TokenType::String)
+                bp.constants[index] = std::get<std::string>(tt.token);
+            else
+                throw AssemblyError("Expected float or string as constant", tt.line, tt.column);
+            expect_token(TokenType::Enter);
+
+        } else if (section == Section::Function && t.type == TokenType::Instruction) {
+            std::string instruction = std::get<std::string>(t.token);
+            std::optional<int> oper = {};
+            Token tt = lexer_.ingest();
+            if (tt.type == TokenType::Integer) {
+                oper = std::get<int>(tt.token);
+                tt = lexer_.ingest();
+            }
+
+            auto oinst = vm::translate_instruction(instruction, oper);
+            if (!oinst)
+                throw AssemblyError("Invalid or misused instruction '" + instruction + "'", t.line, t.column);
+
+            auto& code = bp.functions.at(function_id).code;
+            code.append_byte(static_cast<uint8_t>(*oinst));
+            switch (vm::instruction_operand_type(*oinst)) {
+                case vm::OperandType::Int8:  code.append_int8(static_cast<int8_t>(*oper)); break;
+                case vm::OperandType::Int16: code.append_int16(static_cast<int16_t>(*oper)); break;
+                case vm::OperandType::Int32: code.append_int32(*oper); break;
+                case vm::OperandType::NoOperand: default: break;
+            }
+
+            if (tt.type != TokenType::Enter)
+                throw AssemblyError("Expected enter", tt.line, tt.column);
+
+        } else if (t.type == TokenType::EOF_) {
+            break;
+
+        } else {
+            throw AssemblyError("Unexpected token of type " + token_type_name(t.type), t.line, t.column);
+        }
+    }
+
+    return bc::Bytecode::generate(bp);
+}
+
+// Consumes the next token and returns its value; throws AssemblyError (naming the
+// expected and the found type) if the token is not of type `type`.
+TokenValue Assembler::expect_token(TokenType type)
+{
+    Token t = lexer_.ingest();
+    if (t.type != type)
+        throw AssemblyError("Expected " + token_type_name(type) + ", found " + token_type_name(t.type), t.line, t.column);
+    return t.token;
+}
+
+} // tyche
diff --git a/src/assembler/assembler.hh b/src/assembler/assembler.hh
new file mode 100644
index 0000000..00f87e5
--- /dev/null
+++ b/src/assembler/assembler.hh
@@ -0,0 +1,27 @@
+#ifndef TYCHE_ASSEMBLER_HH
+#define TYCHE_ASSEMBLER_HH
+
+#include <string>
+#include <utility>
+
+#include "lexer.hh"
+#include "../common/bytearray.hh"
+#include "../bytecode/bytecodeprototype.hh"
+
+namespace tyche::as {
+
+class Assembler {
+public:
+    explicit Assembler(std::string source) : lexer_(std::move(source)) {}
+
+    [[nodiscard]] ByteArray assemble();
+
+private:
+    Lexer lexer_;
+
+    TokenValue expect_token(TokenType type);
+};
+
+} // tyche
+
+#endif //TYCHE_ASSEMBLER_HH
diff --git a/src/assembler/lexer.cc b/src/assembler/lexer.cc
new file mode 100644
index 0000000..03b3836
--- /dev/null
+++ b/src/assembler/lexer.cc
@@ -0,0 +1,156 @@
+#include "lexer.hh"
+
+#include <cctype>
+#include <stdexcept>
+
+#include "as_exceptions.hh"
+
+namespace tyche::as {
+
+namespace {
+
+// The <cctype> classifiers are undefined for negative char values, so go through unsigned char.
+bool is_digit(char c) { return std::isdigit(static_cast<unsigned char>(c)) != 0; }
+bool is_alpha(char c) { return std::isalpha(static_cast<unsigned char>(c)) != 0; }
+bool is_alnum(char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; }
+bool is_blank(char c) { return c == ' ' || c == '\t' || c == '\r'; }
+
+}
+
+// Name of a token type, as used in error messages.
+std::string token_type_name(TokenType type)
+{
+    switch (type) {
+        case TokenType::BOF:         return "BOF";
+        case TokenType::Directive:   return "directive";
+        case TokenType::Instruction: return "instruction";
+        case TokenType::Integer:     return "integer";
+        case TokenType::Float:       return "float";
+        case TokenType::String:      return "string";
+        case TokenType::Enter:       return "enter";
+        case TokenType::Colon:       return "colon";
+        case TokenType::EOF_:        return "EOF";
+        default:                     return "???";
+    }
+}
+
+// Rewinds to the beginning of the source and loads the first token.
+void Lexer::reset()
+{
+    pos_ = 0;
+    line_ = 1;
+    line_start_ = 0;
+    ingest_next_token();
+}
+
+// Returns the current token without consuming it.
+Token Lexer::peek() const
+{
+    return current_token_;
+}
+
+// Returns the current token and advances to the next one.
+Token Lexer::ingest()
+{
+    Token t = current_token_;
+    ingest_next_token();
+    return t;
+}
+
+// Scans the token starting at `pos_` into `current_token_`; past the end of the
+// source this keeps producing EOF_. Throws AssemblyError on malformed input.
+void Lexer::ingest_next_token()
+{
+    auto const at_end = [this] { return pos_ >= source_.size(); };
+
+    // skip ignored characters
+    while (!at_end() && is_blank(source_[pos_]))
+        ++pos_;
+
+    // position (1-based) where the token starts; the line bookkeeping lives in members
+    // because it has to survive from one call to the next
+    size_t const line = line_;
+    size_t const column = pos_ - line_start_ + 1;
+
+    if (at_end()) {
+        current_token_ = { .type = TokenType::EOF_, .token = std::monostate(), .line = line, .column = column };
+        return;
+    }
+
+    char const c = source_[pos_];
+
+    TokenType type {};
+    std::string stoken;
+    TokenValue value = std::monostate();
+
+    if (c == '.') {
+        type = TokenType::Directive;
+        stoken += '.';
+        for (++pos_; !at_end() && (is_alpha(source_[pos_]) || source_[pos_] == '_'); ++pos_)
+            stoken += source_[pos_];
+        value = stoken;
+    } else if (c == '"') {
+        type = TokenType::String;
+        ++pos_;
+        for (;;) {
+            if (at_end())
+                throw AssemblyError("Unterminated string", line, column);
+            char ch = source_[pos_++];
+            if (ch == '"')
+                break;
+            if (ch == '\\') {  // TODO - improve this for special characters
+                if (at_end())
+                    throw AssemblyError("Unterminated string", line, column);
+                ch = source_[pos_++];
+            }
+            if (ch == '\n') {
+                ++line_;
+                line_start_ = pos_;
+            }
+            stoken += ch;
+        }
+        value = stoken;
+    } else if (is_digit(c) || c == '-') {
+        type = TokenType::Integer;
+        stoken += c;
+        for (++pos_; !at_end() && (is_digit(source_[pos_]) || source_[pos_] == '.'); ++pos_) {
+            if (source_[pos_] == '.') {
+                if (type == TokenType::Float)
+                    throw AssemblyError("Double point in floating point number", line, pos_ - line_start_ + 1);
+                type = TokenType::Float;
+            }
+            stoken += source_[pos_];
+        }
+        try {
+            if (type == TokenType::Integer)
+                value = std::stoi(stoken);
+            else
+                value = std::stof(stoken);
+        } catch (std::logic_error const&) {
+            // std::invalid_argument (e.g. a lone '-') or std::out_of_range
+            throw AssemblyError("Invalid number '" + stoken + "'", line, column);
+        }
+    } else if (is_alpha(c)) {
+        type = TokenType::Instruction;
+        stoken += c;
+        // digits are accepted after the first letter so that mnemonics such as "call8" can be written
+        for (++pos_; !at_end() && is_alnum(source_[pos_]); ++pos_)
+            stoken += source_[pos_];
+        value = stoken;
+    } else if (c == ':') {
+        type = TokenType::Colon;
+        ++pos_;
+    } else if (c == '\n') {
+        type = TokenType::Enter;
+        value = "\n";
+        ++pos_;
+        ++line_;
+        line_start_ = pos_;
+    } else {
+        throw AssemblyError(std::string("Unexpected character '") + c + "' (ascii: " + std::to_string(static_cast<int>(c)) + ")", line, column);
+    }
+
+    current_token_ = { .type = type, .token = value, .line = line, .column = column };
+}
+
+} // tyche
diff --git a/src/assembler/lexer.hh b/src/assembler/lexer.hh
new file mode 100644
index 0000000..eab14aa
--- /dev/null
+++ b/src/assembler/lexer.hh
@@ -0,0 +1,52 @@
+#ifndef TYCHE_LEXER_HH
+#define TYCHE_LEXER_HH
+
+#include <string>
+#include <tuple>
+#include <utility>
+#include <variant>
+
+namespace tyche::as {
+
+enum class TokenType {
+    BOF, Directive, Instruction, Integer, Float, String, Enter, Colon, EOF_
+};
+
+// NOTE(review): alternative list reconstructed from how token values are produced and read - confirm
+using TokenValue = std::variant<std::monostate, int, float, std::string>;
+
+// A lexed token. `line` and `column` are the 1-based position where the token starts;
+// they are informational only and do not take part in comparisons.
+struct Token {
+    TokenType  type;
+    TokenValue token = std::monostate();
+    size_t     line = 0;
+    size_t     column = 0;
+
+    friend bool operator==(Token const& lhs, Token const& rhs) { return std::tie(lhs.type, lhs.token) == std::tie(rhs.type, rhs.token); }
+};
+
+std::string token_type_name(TokenType type);
+
+// Splits assembly source into tokens, with one token of lookahead.
+class Lexer {
+public:
+    explicit Lexer(std::string source) : source_(std::move(source)) { reset(); }
+
+    void reset();
+    [[nodiscard]] Token peek() const;
+    [[nodiscard]] Token ingest();
+
+private:
+    const std::string source_;
+    size_t            pos_ = 0;
+    size_t            line_ = 1;         // line of `pos_`, 1-based
+    size_t            line_start_ = 0;   // offset in `source_` of the first character of that line
+    Token             current_token_ { TokenType::BOF };
+
+    void ingest_next_token();
+};
+
+} // tyche
+
+#endif //TYCHE_LEXER_HH
diff --git a/src/assembler/tests.cc b/src/assembler/tests.cc
new file mode 100644
index 0000000..06b446a
--- /dev/null
+++ b/src/assembler/tests.cc
@@ -0,0 
+1,76 @@
+#include "assembler.hh"
+#include "gtest/gtest.h"
+
+#include "../bytecode/bytecodeprototype.hh"
+#include "../bytecode/bytecode.hh"
+#include "../vm/instruction.hh"
+
+using namespace tyche;
+using namespace tyche::as;
+using namespace tyche::bc;
+using namespace tyche::vm;
+
+TEST(Lexer, Lexer)
+{
+    Token t;
+    Lexer lexer(".dir push 382 -12 3.14 -12.8 \"Hello\" \"Hel\\\"lo\"\n");
+
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::Directive, ".dir" }));
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::Instruction, "push" }));
+    t = lexer.ingest(); ASSERT_EQ(t.type, TokenType::Integer); ASSERT_EQ(std::get<int>(t.token), 382);
+    t = lexer.ingest(); ASSERT_EQ(t.type, TokenType::Integer); ASSERT_EQ(std::get<int>(t.token), -12);
+    t = lexer.ingest(); ASSERT_EQ(t.type, TokenType::Float); ASSERT_FLOAT_EQ(std::get<float>(t.token), 3.14f);
+    t = lexer.ingest(); ASSERT_EQ(t.type, TokenType::Float); ASSERT_FLOAT_EQ(std::get<float>(t.token), -12.8f);
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::String, "Hello" }));
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::String, "Hel\"lo" }));
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::Enter, "\n" }));
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::EOF_ }));
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::EOF_ }));
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::EOF_ }));
+
+    lexer.reset();
+    ASSERT_EQ(lexer.ingest(), (Token { TokenType::Directive, ".dir" }));
+}
+
+TEST(Assembler, Assembler)
+{
+    BytecodePrototype bp;
+    bp.constants.emplace_back(3.14f);
+    bp.constants.emplace_back("Hello world");
+    bp.functions.emplace_back(0, 0);
+    bp.functions.at(0).code.append_byte((uint8_t) Instruction::PushInt8);
+    bp.functions.at(0).code.append_int8(2);
+    bp.functions.at(0).code.append_byte((uint8_t) Instruction::PushInt8);
+    bp.functions.at(0).code.append_int8(3);
+    bp.functions.at(0).code.append_byte((uint8_t) Instruction::Sum);
+    bp.functions.at(0).code.append_byte((uint8_t) Instruction::Return);
+    bp.functions.emplace_back(0, 0);
+    bp.functions.at(1).code.append_byte((uint8_t) Instruction::PushInt16);
+    bp.functions.at(1).code.append_int16(5000);
+    bp.functions.at(1).code.append_byte((uint8_t) Instruction::Return);
+    ByteArray expected = Bytecode::generate(bp);
+
+    std::string src = R"(
+.const
+    0: 3.14
+    1: "Hello world"
+
+.func 0
+    pushi 2
+    pushi 3
+    sum
+    ret
+.func 1
+    pushi 5000
+    ret
+)";
+
+    ByteArray actual = Assembler(src).assemble();
+    ASSERT_EQ(expected, actual);
+}
+
+int main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/src/bytecode/bc_exceptions.hh b/src/bytecode/bc_exceptions.hh
new file mode 100644
index 0000000..f5a83c3
--- /dev/null
+++ b/src/bytecode/bc_exceptions.hh
@@ -0,0 +1,17 @@
+#ifndef TYCHE_BC_EXCEPTIONS_HH
+#define TYCHE_BC_EXCEPTIONS_HH
+
+#include <stdexcept>
+#include <string>
+
+namespace tyche::bc {
+
+// Exception type for bytecode parsing errors (previously declared in common/bytearray.hh).
+class BytecodeParsingError : public std::runtime_error {
+public:
+    explicit BytecodeParsingError(std::string const& str) : std::runtime_error(str) {}
+};
+
+}
+
+#endif //TYCHE_BC_EXCEPTIONS_HH
diff --git a/src/bytecode/bytecode.cc b/src/bytecode/bytecode.cc
index 1dd8d02..996ba2e 100644
--- a/src/bytecode/bytecode.cc
+++ b/src/bytecode/bytecode.cc
@@ -1,4 +1,6 @@
 #include "bytecode.hh"
+
+#include "bc_exceptions.hh"
 #include "../common/overloaded.hh"
 
 namespace tyche::bc {
diff --git a/src/bytecode/bytecodeprototype.hh b/src/bytecode/bytecodeprototype.hh
index df92c81..5a1bb58 100644
--- a/src/bytecode/bytecodeprototype.hh
+++ b/src/bytecode/bytecodeprototype.hh
@@ -14,7 +14,7 @@ struct BytecodePrototype {
     struct Function {
         uint16_t n_pars;
         uint16_t n_locals;
-        ByteArray code;
+        ByteArray code {};
 
         Function(uint16_t n_pars_, uint16_t n_locals_) : n_pars(n_pars_), n_locals(n_locals_), code(ByteArray {}) {}
     };
diff --git a/src/common/bytearray.hh b/src/common/bytearray.hh
index 3cf6ef6..c20a011 100644
--- a/src/common/bytearray.hh
+++ b/src/common/bytearray.hh
@@ -47,15 +47,12 @@ public:
 
     [[nodiscard]] std::string hexdump() const;
 
+    friend bool operator==(ByteArray const& lhs, ByteArray const& rhs) { return lhs.data_ == rhs.data_; }
+
 private:
     std::vector data_ {};
 };
 
-class BytecodeParsingError : public std::runtime_error {
-public:
-    explicit BytecodeParsingError(std::string const& str) : std::runtime_error(str.c_str()) {}
-};
-
 }
 
 #endif //TYCHE_BYTEARRAY_HH
diff --git a/src/vm/instruction.cc b/src/vm/instruction.cc
index 5508962..f719fb8 100644
--- a/src/vm/instruction.cc
+++ b/src/vm/instruction.cc
@@ -1,7 +1,62 @@
 #include "instruction.hh"
 
+#include <limits>
+#include <unordered_map>
+
 namespace tyche::vm {
 
+// Assembly mnemonic -> instruction; translate_instruction() derives the wider-operand variants.
+const std::unordered_map<std::string, Instruction> instruction_names = {
+    { "pushi", vm::Instruction::PushInt8 },
+    { "pushc", vm::Instruction::PushConstant8 },
+    { "pushz", vm::Instruction::PushZero },
+    { "pusht", vm::Instruction::PushTrue },
+    { "newa",  vm::Instruction::NewArray },
+    { "newt",  vm::Instruction::NewTable },
+    { "pop",   vm::Instruction::Pop },
+    { "dup",   vm::Instruction::Duplicate },
+    { "setl",  vm::Instruction::SetLocal8 },
+    { "getl",  vm::Instruction::GetLocal8 },
+    { "setg",  vm::Instruction::SetGlobal8 },
+    { "getg",  vm::Instruction::GetGlobal8 },
+    { "call8", vm::Instruction::Call8 },
+    { "ret",   vm::Instruction::Return },
+    { "retn",  vm::Instruction::ReturnNil },
+    { "getkv", vm::Instruction::GetKeyValue },
+    { "setkv", vm::Instruction::SetKeyValue },
+    { "geta",  vm::Instruction::GetArrayItem },
+    { "seta",  vm::Instruction::SetArrayItem },
+    { "appnd", vm::Instruction::Append },
+    { "next",  vm::Instruction::Next },
+    { "smt",   vm::Instruction::SetMetatable },
+    { "mt",    vm::Instruction::GetMetatable },
+    { "sum",   vm::Instruction::Sum },
+    { "sub",   vm::Instruction::Subtract },
+    { "mul",   vm::Instruction::Multiply },
+    { "div",   vm::Instruction::Divide },
+    { "idiv",  vm::Instruction::DivideInt },
+    { "eq",    vm::Instruction::Equals },
+    { "neq",   vm::Instruction::NotEquals },
+    { "lt",    vm::Instruction::LessThan },
+    { "lte",   vm::Instruction::LessThanEq },
+    { "gt",    vm::Instruction::GreaterThan },
+    { "gte",   vm::Instruction::GreaterThanEq },
+    { "and",   vm::Instruction::And },
+    { "or",    vm::Instruction::Or },
+    { "xor",   vm::Instruction::Xor },
+    { "len",   vm::Instruction::Len },
+    { "type",  vm::Instruction::Type },
+    { "cast",  vm::Instruction::Cast },
+    { "ver",   vm::Instruction::Version },
+    { "bz",    vm::Instruction::BranchIfZero8 },
+    { "bnz",   vm::Instruction::BranchIfNotZero8 },
+    { "jmp",   vm::Instruction::Jump8 },
+    { "cmpl",  vm::Instruction::Compile },
+    { "asmbl", vm::Instruction::Assemble },
+    { "load",  vm::Instruction::Load },
+};
+
+
 std::pair debug_instruction(Instruction inst, int oper)
 {
     std::string out;
@@ -142,4 +197,33 @@ OperandType instruction_operand_type(Instruction inst)
     return OperandType::NoOperand;
 }
 
+// Translates a mnemonic and its optional operand into a concrete instruction.
+//
+// Returns an empty optional when the mnemonic is unknown, when an operand is given to
+// an instruction that takes none, or when a required operand is missing. Otherwise the
+// narrowest variant that can hold the operand is selected: the table entry for int8,
+// entry + OPCODE_NEXT_SIZE for int16 and entry + 2 * OPCODE_NEXT_SIZE for anything larger.
+std::optional<Instruction> translate_instruction(std::string const& txt, std::optional<int> op)
+{
+    auto it = instruction_names.find(txt);
+    if (it == instruction_names.end())
+        return {};
+    Instruction inst = it->second;
+    OperandType optype = instruction_operand_type(inst);
+
+    if (optype == OperandType::NoOperand && op)
+        return {};
+    if (optype != OperandType::NoOperand && !op)
+        return {};
+
+    if (optype == OperandType::NoOperand)
+        return inst;
+
+    if (*op >= std::numeric_limits<int8_t>::min() && *op <= std::numeric_limits<int8_t>::max())
+        return inst;
+    if (*op >= std::numeric_limits<int16_t>::min() && *op <= std::numeric_limits<int16_t>::max())
+        return static_cast<Instruction>(static_cast<uint8_t>(inst) + OPCODE_NEXT_SIZE);
+    return static_cast<Instruction>(static_cast<uint8_t>(inst) + OPCODE_NEXT_SIZE * 2);
+}
+
 }
\ No newline at end of file
diff --git a/src/vm/instruction.hh b/src/vm/instruction.hh
index 1710cdf..e87622a 100644
--- a/src/vm/instruction.hh
+++ b/src/vm/instruction.hh
@@ -2,6 +2,7 @@
 #define TYCHE_INSTRUCTION_HH
 
 #include
+#include <optional>
 #include
 #include
 
@@ -9,6 +10,8 @@
 
 namespace tyche::vm {
 
+constexpr uint8_t OPCODE_NEXT_SIZE = 0x20;
+
 enum class Instruction : uint8_t {
 
     // stack operations
@@ -101,6 +104,8 @@ std::pair debug_instruction(bc::Bytecode const& bt, uint32_
 enum class OperandType { NoOperand, Int8, Int16, Int32 };
 OperandType instruction_operand_type(Instruction instruction);
 
+std::optional<Instruction> translate_instruction(std::string const& txt, std::optional<int> op);
+
 }
 
 #endif //TYCHE_INSTRUCTION_HH