diff --git a/TODO.md b/TODO.md index 88c7594..d2f4882 100644 --- a/TODO.md +++ b/TODO.md @@ -7,6 +7,7 @@ - [ ] Bytecode - Add/retrive all types of data - Keeps no memory except for caching + - [ ] Refactor bytecode code - [ ] Bytecode debugging info - [ ] Bytecode loader - Combine multiple chunks diff --git a/doc/BYTECODE b/doc/BYTECODE index c696aeb..3305954 100644 --- a/doc/BYTECODE +++ b/doc/BYTECODE @@ -3,22 +3,23 @@ Bytecode format The bytecode file is composed of the following sections: - * 16-byte header - [0:3]: Magic - [4]: VM format - * Index: pointers to each one of the sections, up to 8 - Each pointer: 4 bytes - Each count: 4 bytes - * [0x0] Constants indexes: all constants (such as strings) used in the code + * HEADER: 16-byte header + [0:3]: Magic + [4]: VM format + [rest]: Reserved for future use + * TABLE_OF_CONTENTS: list of 8 records pointing to each one of the sections + Each record (6 bytes): + - Pointer to section: 4 bytes + - Number of records in section: 2 bytes + * [0x0] Constants indexes: pointers to each of the constant locations * Table of 4-byte constant indexes with pointer to constant + (counter starts at the beginning of raw constants) * [0x1] Functions indexes: Pointer to functions within the code - [0:3]: function pointer + [0:3]: function pointer (counter starts at the beginning of executable code) [4:5]: number of parameters [6:7]: number of local variables * [0x2] Constants raw data * [0x3] Code: executable code - [1-byte]: operation - [variable]: operand (see value encoding below) * [0x4] Debugging info ??? 
diff --git a/src/bytecode/bytearray.cc b/src/bytecode/bytearray.cc index c3c5a0c..fcd4858 100644 --- a/src/bytecode/bytearray.cc +++ b/src/bytecode/bytearray.cc @@ -55,6 +55,12 @@ void ByteArray::set_string(uint32_t addr, std::string const& str) set_byte(addr, 0); } +void ByteArray::set_bytearray(uint32_t addr, ByteArray const& bytearray) +{ + for (uint8_t byte: bytearray.data()) + set_byte(addr++, byte); +} + uint8_t ByteArray::get_byte(uint32_t addr) const { return data_.at(addr); @@ -90,7 +96,7 @@ std::pair ByteArray::get_int(uint32_t addr) const throw BytecodeParsingError("Error parsing int32 at position " + std::to_string(addr)); } -std::pair ByteArray::get_float(uint32_t addr) const +float ByteArray::get_float(uint32_t addr) const { uint32_t bits = (uint32_t) get_byte(addr) | (uint32_t) get_byte(addr+1) << 8 @@ -98,7 +104,7 @@ std::pair ByteArray::get_float(uint32_t addr) const | (uint32_t) get_byte(addr+3) << 24; float value; std::memcpy(&value, &bits, 4); - return { value, 4 }; + return value; } std::pair ByteArray::get_string(uint32_t addr) const diff --git a/src/bytecode/bytearray.hh b/src/bytecode/bytearray.hh index d1906ad..99984ce 100644 --- a/src/bytecode/bytearray.hh +++ b/src/bytecode/bytearray.hh @@ -19,6 +19,7 @@ public: void set_int(uint32_t addr, int32_t value); void set_float(uint32_t addr, float value); void set_string(uint32_t addr, std::string const& str); + void set_bytearray(uint32_t addr, ByteArray const& bytearray); void append_byte(uint8_t byte) { set_byte(data_.size(), byte); } void append_uint16(uint16_t value) { set_uint16(data_.size(), value); } @@ -32,7 +33,7 @@ public: [[nodiscard]] uint16_t get_uint16(uint32_t addr) const; [[nodiscard]] uint32_t get_uint32(uint32_t addr) const; [[nodiscard]] std::pair get_int(uint32_t addr) const; - [[nodiscard]] std::pair get_float(uint32_t addr) const; + [[nodiscard]] float get_float(uint32_t addr) const; [[nodiscard]] std::pair get_string(uint32_t addr) const; [[nodiscard]] std::vector const& 
data() const { return data_; } diff --git a/src/bytecode/bytecode.cc b/src/bytecode/bytecode.cc index 8cbde31..59c2336 100644 --- a/src/bytecode/bytecode.cc +++ b/src/bytecode/bytecode.cc @@ -7,20 +7,24 @@ Bytecode::Bytecode(ByteArray ba) : byte_array_(std::move(ba)) { // check file size - if (byte_array_.size() < (HEADER_SZ + INDEX_SZ)) + if (byte_array_.size() < (TOC_START + TOC_SZ)) throw BytecodeParsingError("Invalid bytecode format (file too short)"); // check magic number and version - if (byte_array_.get_uint32(0) != MAGIC) + if (byte_array_.get_uint32(0) != MAGIC_NUMBER) throw BytecodeParsingError("Invalid bytecode format (magic number not matching)"); - if (byte_array_.get_uint32(4) != VERSION) + if (byte_array_.get_uint32(4) != BYTECODE_VERSION) throw BytecodeParsingError("Unexpected bytecode format version"); // load cache - cache_.constants_idx_addr = byte_array_.get_uint32(HEADER_SZ); - cache_.n_constants = byte_array_.get_uint16(HEADER_SZ + 4); - cache_.functions_idx_addr = byte_array_.get_uint32(HEADER_SZ + 6); - cache_.n_functions = byte_array_.get_uint16(HEADER_SZ + 10); + cache_.constants_idx_addr = byte_array_.get_uint32(TOC_START); + cache_.n_constants = byte_array_.get_uint16(TOC_START + 4); + cache_.constants_start_addr = byte_array_.get_uint32(TOC_START + (6 * 2)); + cache_.functions_idx_addr = byte_array_.get_uint32(TOC_START + 6); + cache_.n_functions = byte_array_.get_uint16(TOC_START + 10); + uint32_t code_start = byte_array_.get_uint32(TOC_START + (6 * 3)); + for (uint32_t i = 0; i < cache_.n_functions; ++i) + cache_.function_addr.emplace_back(code_start + byte_array_.get_uint32(cache_.functions_idx_addr + (i * 8))); } uint32_t Bytecode::n_constants() const @@ -36,117 +40,111 @@ uint32_t Bytecode::n_functions() const int32_t Bytecode::get_constant_int(uint32_t idx) const { uint32_t constant_idx = byte_array_.get_uint32(cache_.constants_idx_addr + (idx * 4)); - return byte_array_.get_int(constant_idx).first; + return 
byte_array_.get_int(cache_.constants_start_addr + constant_idx).first; } float Bytecode::get_constant_float(uint32_t idx) const { - return 0; + uint32_t constant_idx = byte_array_.get_uint32(cache_.constants_idx_addr + (idx * 4)); + return byte_array_.get_float(cache_.constants_start_addr + constant_idx); } std::string Bytecode::get_constant_string(uint32_t idx) const { - return std::string(); + uint32_t constant_idx = byte_array_.get_uint32(cache_.constants_idx_addr + (idx * 4)); + return byte_array_.get_string(cache_.constants_start_addr + constant_idx).first; } Bytecode::FunctionDef Bytecode::get_function_def(uint32_t function_id) const { - return Bytecode::FunctionDef(); + uint32_t idx = cache_.functions_idx_addr + (function_id * 8); + return { + .n_params = byte_array_.get_uint16(idx + 4), + .locals = byte_array_.get_uint16(idx + 6), + }; } uint8_t Bytecode::get_code_byte(uint32_t function_id, uint32_t idx) const { - return 0; + return byte_array_.get_byte(cache_.function_addr.at(function_id) + idx); } -int32_t Bytecode::get_code_int(uint32_t function_id, uint32_t idx) const +std::pair Bytecode::get_code_int(uint32_t function_id, uint32_t idx) const { - return 0; + return byte_array_.get_int(cache_.function_addr.at(function_id) + idx); } float Bytecode::get_code_float(uint32_t function_id, uint32_t idx) const { - return 0; + return byte_array_.get_float(cache_.function_addr.at(function_id) + idx); } ByteArray Bytecode::generate(BytecodePrototype const& bp) { - ByteArray ba; + // header section + ByteArray header; + header.set_uint32(0, MAGIC_NUMBER); + header.set_byte(4, BYTECODE_VERSION); // constants - std::vector constant_table; - ByteArray constant_array; + ByteArray constant_indexes; + ByteArray raw_constants; + uint32_t idx = 0; for (auto const& constant: bp.constants) { - constant_table.emplace_back(idx); + constant_indexes.append_uint32(idx); std::visit(overloaded { - [&](int32_t i) { constant_array.append_int(i); }, - [&](float f) { 
constant_array.append_float(f); }, - [&](std::string const& s) { constant_array.append_string(s); }, + [&](int32_t i) { raw_constants.append_int(i); }, + [&](float f) { raw_constants.append_float(f); }, + [&](std::string const& s) { raw_constants.append_string(s); }, }, constant); - idx = constant_array.size(); - } - - // function table - std::vector> functions; - ByteArray code; - for (auto const& f: bp.functions) { - code.append_bytearray(f.code); - functions.emplace_back(std::make_pair(FunctionDef { f.n_pars, f.n_locals }, code.size())); - } - - // - // build binary - // - - // header - ba.set_uint32(0, MAGIC); - ba.set_byte(4, VERSION); - - // constants - idx = HEADER_SZ + INDEX_SZ; - for (auto const& const_idx: constant_table) { - ba.set_uint32(idx, const_idx); - idx += 4; - } - uint32_t constants_raw_data_addr = ba.size(); - ba.append_bytearray(constant_array); - - // constant index - if (!constant_table.empty()) { - ba.set_uint32(HEADER_SZ, HEADER_SZ + INDEX_SZ); - ba.set_uint16(HEADER_SZ + 4, constant_table.size()); + idx = raw_constants.size(); } // functions - size_t functions_start = idx + (constant_table.size() * 4); - idx = functions_start; - uint32_t code_idx = 0; - for (auto const& f: functions) { - ba.set_uint32(idx, code_idx); - ba.set_uint16(idx + 4, f.first.n_params); - ba.set_uint16(idx + 6, f.first.locals); - idx += 8; - code_idx += f.second; - } - uint32_t functions_raw_data_addr = ba.size(); - for (auto const& f: bp.functions) - ba.append_bytearray(f.code); + ByteArray functions_indexes; + ByteArray raw_code; - // function index - if (!functions.empty()) { - ba.set_uint32(HEADER_SZ + 6, functions_start); - ba.set_uint16(HEADER_SZ + 6 + 4, functions.size()); + uint32_t idx_idx = 0, code_idx = 0; + for (auto const& f: bp.functions) { + functions_indexes.set_uint32(idx_idx, code_idx); + functions_indexes.set_uint16(idx_idx + 4, f.n_pars); + functions_indexes.set_uint16(idx_idx + 6, f.n_locals); + raw_code.append_bytearray(f.code); + code_idx = 
raw_code.size(); + idx_idx += FUNCTION_RECORD_SZ; } - // constants raw data - if (!constant_table.empty()) - ba.set_uint32(HEADER_SZ + (2 * 6), constants_raw_data_addr); + // table of contents + uint32_t function_idx_start = CONST_IDX_START + constant_indexes.size(); + uint32_t raw_constant_start = function_idx_start + functions_indexes.size(); + uint32_t raw_code_start = raw_constant_start + raw_constants.size(); - // function raw data - if (!functions.empty()) - ba.set_uint32(HEADER_SZ + (3 * 6), functions_raw_data_addr); + ByteArray toc; + if (!bp.constants.empty()) { + toc.set_uint32(SEC_CONST_IDX * TOC_RECORD_SZ, CONST_IDX_START); + toc.set_uint32(SEC_CONST_IDX * TOC_RECORD_SZ + 4, constant_indexes.size() / CONST_RECORD_SZ); + toc.set_uint32(SEC_CONST_DATA * TOC_RECORD_SZ, raw_constant_start); + toc.set_uint32(SEC_CONST_DATA * TOC_RECORD_SZ + 4, raw_constants.size()); + } + if (!bp.functions.empty()) { + toc.set_uint32(SEC_FUNC_IDX * TOC_RECORD_SZ, function_idx_start); + toc.set_uint32(SEC_FUNC_IDX * TOC_RECORD_SZ + 4, functions_indexes.size() / FUNCTION_RECORD_SZ); + toc.set_uint32(SEC_CODE * TOC_RECORD_SZ, raw_code_start); + toc.set_uint32(SEC_CODE * TOC_RECORD_SZ + 4, raw_code.size()); + } + // + // assemble bytecode + // + + ByteArray ba; + ba.set_bytearray(0, header); + ba.set_bytearray(TOC_START, toc); + ba.set_bytearray(CONST_IDX_START, constant_indexes); + ba.set_bytearray(function_idx_start, functions_indexes); + ba.set_bytearray(raw_constant_start, raw_constants); + ba.set_bytearray(raw_code_start, raw_code); return ba; } diff --git a/src/bytecode/bytecode.hh b/src/bytecode/bytecode.hh index b20caf9..f9ba207 100644 --- a/src/bytecode/bytecode.hh +++ b/src/bytecode/bytecode.hh @@ -20,9 +20,9 @@ public: struct FunctionDef { uint16_t n_params, locals; }; [[nodiscard]] FunctionDef get_function_def(uint32_t function_id) const; - [[nodiscard]] uint8_t get_code_byte(uint32_t function_id, uint32_t idx) const; - [[nodiscard]] int32_t get_code_int(uint32_t 
function_id, uint32_t idx) const; - [[nodiscard]] float get_code_float(uint32_t function_id, uint32_t idx) const; + [[nodiscard]] uint8_t get_code_byte(uint32_t function_id, uint32_t idx) const; + [[nodiscard]] std::pair get_code_int(uint32_t function_id, uint32_t idx) const; + [[nodiscard]] float get_code_float(uint32_t function_id, uint32_t idx) const; // TODO - debugging info @@ -31,10 +31,17 @@ public: private: ByteArray byte_array_; // the actual data - static constexpr uint8_t VERSION = 1; - static constexpr uint32_t MAGIC = 0x74b3c138; - static constexpr uint32_t HEADER_SZ = 16, - INDEX_SZ = 8 * 6; + static constexpr uint8_t BYTECODE_VERSION = 1; + static constexpr uint32_t MAGIC_NUMBER = 0x74b3c138; + static constexpr uint32_t TOC_START = 16, + TOC_N_RECORDS = 8, + TOC_RECORD_SZ = 8, + TOC_SZ = TOC_N_RECORDS * TOC_RECORD_SZ; + static constexpr uint32_t CONST_IDX_START = TOC_START + TOC_SZ, + CONST_RECORD_SZ = 4; + static constexpr uint32_t FUNCTION_RECORD_SZ = 8; + + enum Sections { SEC_CONST_IDX = 0, SEC_FUNC_IDX = 1, SEC_CONST_DATA = 2, SEC_CODE = 3 }; // caching for faster reading of data struct Cache { @@ -43,7 +50,7 @@ private: uint32_t constants_start_addr; uint32_t functions_idx_addr; uint32_t n_functions; - uint32_t functions_start_addr; + std::vector function_addr; }; Cache cache_ {}; }; diff --git a/src/bytecode/tests.cc b/src/bytecode/tests.cc index b387f0f..cdedf1c 100644 --- a/src/bytecode/tests.cc +++ b/src/bytecode/tests.cc @@ -44,9 +44,9 @@ TEST(ByteArray, ByteArray) ba.set_int(1, 5000300); ASSERT_EQ(ba.get_int(1), std::make_pair(5000300, 4)); ba.set_int(1, -5000300); ASSERT_EQ(ba.get_int(1), std::make_pair(-5000300, 4)); - ba.set_float(1, 3.14); ASSERT_FLOAT_EQ(ba.get_float(1).first, 3.14); - ba.set_float(1, -3.14); ASSERT_FLOAT_EQ(ba.get_float(1).first, -3.14); - ba.set_float(1, -5000300.1324); ASSERT_FLOAT_EQ(ba.get_float(1).first, -5000300.1324); + ba.set_float(1, 3.14); ASSERT_FLOAT_EQ(ba.get_float(1), 3.14); + ba.set_float(1, -3.14); 
ASSERT_FLOAT_EQ(ba.get_float(1), -3.14); + ba.set_float(1, -5000300.1324); ASSERT_FLOAT_EQ(ba.get_float(1), -5000300.1324); ba.set_string(1, "Hello world!"); ASSERT_EQ(ba.get_string(1), std::make_pair("Hello world!", 13)); @@ -67,14 +67,14 @@ TEST(Bytecode, Constants) 0x00, 0x00, 0x00, 0x00, // index - 0x40, 0x00, 0x00, 0x00, 0x02, 0x00, // constants - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // functions - 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, // constant index + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // function undex + 0x58, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, // raw constants + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // raw code + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // constant indexes 0x00, 0x00, 0x00, 0x00, @@ -85,6 +85,7 @@ TEST(Bytecode, Constants) }; ByteArray ba = Bytecode::generate(bp); + print(ba.data()); print(expected); ASSERT_EQ(ba.data(), expected); } @@ -106,14 +107,14 @@ TEST(Bytecode, Code) 0x00, 0x00, 0x00, 0x00, // index - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // constants - 0x40, 0x00, 0x00, 0x00, 0x02, 0x00, // variables - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // constant index + 0x50, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, // variable index + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // raw constants + 0x60, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, // raw code + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // function definitions 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -142,8 +143,8 @@ TEST(Bytecode, Parsing) f.code.append_byte(0x68); f.code.append_int(42); - auto& f2 = bp.functions.emplace_back(2, 1); - f2.code.append_byte(0x42); + auto& ff = bp.functions.emplace_back(2, 1); + ff.code.append_byte(0x42); ByteArray ba = Bytecode::generate(bp); print(ba.data()); @@ -158,6 +159,18 @@ TEST(Bytecode, Parsing) ASSERT_EQ(bc.get_constant_int(0), 42); ASSERT_FLOAT_EQ(bc.get_constant_float(1), 3.14f); ASSERT_EQ(bc.get_constant_string(2), "HELLO"); + + Bytecode::FunctionDef f1 = bc.get_function_def(0); + ASSERT_EQ(f1.n_params, 0); + ASSERT_EQ(f1.locals, 0); + + Bytecode::FunctionDef f2 = bc.get_function_def(1); + ASSERT_EQ(f2.n_params, 2); + ASSERT_EQ(f2.locals, 1); + + ASSERT_EQ(bc.get_code_byte(0, 0), 0x68); + ASSERT_EQ(bc.get_code_int(0, 1), std::make_pair(42, 1)); + ASSERT_EQ(bc.get_code_byte(1, 0), 0x42); } int main(int argc, char** argv)