This commit is contained in:
2026-04-28 16:51:19 -05:00
parent c17788eeab
commit 8f5f470edd
7 changed files with 146 additions and 119 deletions

View File

@@ -7,6 +7,7 @@
- [ ] Bytecode
- Add/retrieve all types of data
- Keeps no memory except for caching
- [ ] Refactor bytecode code
- [ ] Bytecode debugging info
- [ ] Bytecode loader
- Combine multiple chunks

View File

@@ -3,22 +3,23 @@ Bytecode format
The bytecode file is composed of the following sections:
* 16-byte header
[0:3]: Magic
[4]: VM format
* Index: pointers to each one of the sections, up to 8
Each pointer: 4 bytes
Each count: 4 bytes
* [0x0] Constants indexes: all constants (such as strings) used in the code
* HEADER: 16-byte header
[0:3]: Magic
[4]: VM format
[rest]: Reserved for future use
* TABLE_OF_CONTENTS: list of 8 records pointing to each one of the sections
Each record (8 bytes):
- Pointer to section: 4 bytes
- Number of records in section (size in bytes for the raw constants and code sections): 4 bytes
* [0x0] Constants indexes: pointers to each of the constant locations
* Table of 4-byte constant indexes, each one a pointer to a constant
(offsets are counted from the beginning of the raw constants)
* [0x1] Functions indexes: Pointer to functions within the code
[0:3]: function pointer
[0:3]: function pointer (offset counted from the beginning of the executable code)
[4:5]: number of parameters
[6:7]: number of local variables
* [0x2] Constants raw data
* [0x3] Code: executable code
[1-byte]: operation
[variable]: operand (see value encoding below)
* [0x4] Debugging info
???

View File

@@ -55,6 +55,12 @@ void ByteArray::set_string(uint32_t addr, std::string const& str)
set_byte(addr, 0);
}
void ByteArray::set_bytearray(uint32_t addr, ByteArray const& bytearray)
{
for (uint8_t byte: bytearray.data())
set_byte(addr++, byte);
}
uint8_t ByteArray::get_byte(uint32_t addr) const
{
return data_.at(addr);
@@ -90,7 +96,7 @@ std::pair<int32_t, size_t> ByteArray::get_int(uint32_t addr) const
throw BytecodeParsingError("Error parsing int32 at position " + std::to_string(addr));
}
std::pair<float, size_t> ByteArray::get_float(uint32_t addr) const
float ByteArray::get_float(uint32_t addr) const
{
uint32_t bits = (uint32_t) get_byte(addr)
| (uint32_t) get_byte(addr+1) << 8
@@ -98,7 +104,7 @@ std::pair<float, size_t> ByteArray::get_float(uint32_t addr) const
| (uint32_t) get_byte(addr+3) << 24;
float value;
std::memcpy(&value, &bits, 4);
return { value, 4 };
return value;
}
std::pair<std::string, size_t> ByteArray::get_string(uint32_t addr) const

View File

@@ -19,6 +19,7 @@ public:
void set_int(uint32_t addr, int32_t value);
void set_float(uint32_t addr, float value);
void set_string(uint32_t addr, std::string const& str);
void set_bytearray(uint32_t addr, ByteArray const& bytearray);
// Writes `byte` at the address equal to the current size of the array,
// i.e. immediately after the last stored byte.
void append_byte(uint8_t byte)
{
    set_byte(data_.size(), byte);
}
// Writes a 16-bit `value` at the address equal to the current size of the
// array, i.e. immediately after the last stored byte.
void append_uint16(uint16_t value)
{
    set_uint16(data_.size(), value);
}
@@ -32,7 +33,7 @@ public:
[[nodiscard]] uint16_t get_uint16(uint32_t addr) const;
[[nodiscard]] uint32_t get_uint32(uint32_t addr) const;
[[nodiscard]] std::pair<int32_t, size_t> get_int(uint32_t addr) const;
[[nodiscard]] std::pair<float, size_t> get_float(uint32_t addr) const;
[[nodiscard]] float get_float(uint32_t addr) const;
[[nodiscard]] std::pair<std::string, size_t> get_string(uint32_t addr) const;
// Read-only access to the underlying byte storage.
[[nodiscard]] std::vector<uint8_t> const& data() const
{
    return data_;
}

View File

@@ -7,20 +7,24 @@ Bytecode::Bytecode(ByteArray ba)
: byte_array_(std::move(ba))
{
    // Takes ownership of a bytecode image, validates its header and caches the
    // section addresses found in the table of contents.
    //
    // Throws BytecodeParsingError when the image is shorter than header + table
    // of contents, when the magic number does not match, or when the format
    // version differs from BYTECODE_VERSION.

    // check file size
    if (byte_array_.size() < (TOC_START + TOC_SZ))
        throw BytecodeParsingError("Invalid bytecode format (file too short)");

    // check magic number and version
    if (byte_array_.get_uint32(0) != MAGIC_NUMBER)
        throw BytecodeParsingError("Invalid bytecode format (magic number not matching)");

    // the version is a single byte at offset 4 (generate() writes it with
    // set_byte); the bytes after it are reserved and must not be compared
    if (byte_array_.get_byte(4) != BYTECODE_VERSION)
        throw BytecodeParsingError("Unexpected bytecode format version");

    // load cache
    //
    // Every table-of-contents record is TOC_RECORD_SZ bytes: a 4-byte section
    // pointer followed by a 4-byte count, exactly as generate() writes them.
    // The record position must be derived from the section id and the record
    // size; a hard-coded stride would read the wrong fields.
    auto const toc_record = [](uint32_t section) -> uint32_t {
        return TOC_START + (section * TOC_RECORD_SZ);
    };

    cache_.constants_idx_addr   = byte_array_.get_uint32(toc_record(SEC_CONST_IDX));
    cache_.n_constants          = byte_array_.get_uint32(toc_record(SEC_CONST_IDX) + 4);
    cache_.functions_idx_addr   = byte_array_.get_uint32(toc_record(SEC_FUNC_IDX));
    cache_.n_functions          = byte_array_.get_uint32(toc_record(SEC_FUNC_IDX) + 4);
    cache_.constants_start_addr = byte_array_.get_uint32(toc_record(SEC_CONST_DATA));

    // function records store code offsets relative to the start of the code
    // section; resolve them once so code reads need a single lookup
    uint32_t const code_start = byte_array_.get_uint32(toc_record(SEC_CODE));
    for (uint32_t i = 0; i < cache_.n_functions; ++i) {
        uint32_t const record_addr = cache_.functions_idx_addr + (i * FUNCTION_RECORD_SZ);
        cache_.function_addr.push_back(code_start + byte_array_.get_uint32(record_addr));
    }
}
uint32_t Bytecode::n_constants() const
@@ -36,117 +40,111 @@ uint32_t Bytecode::n_functions() const
// Returns the integer constant number `idx`.
//
// The constant index table holds one 4-byte entry per constant. Each entry is
// an offset into the raw constants, so it has to be added to the start address
// of that section before decoding the value.
int32_t Bytecode::get_constant_int(uint32_t idx) const
{
    uint32_t const offset = byte_array_.get_uint32(cache_.constants_idx_addr + (idx * 4));
    return byte_array_.get_int(cache_.constants_start_addr + offset).first;
}
// Returns the float constant number `idx`.
//
// The constant index table holds one 4-byte entry per constant. Each entry is
// an offset into the raw constants, so it has to be added to the start address
// of that section before decoding the value.
float Bytecode::get_constant_float(uint32_t idx) const
{
    uint32_t const offset = byte_array_.get_uint32(cache_.constants_idx_addr + (idx * 4));
    return byte_array_.get_float(cache_.constants_start_addr + offset);
}
// Returns the string constant number `idx`.
//
// The constant index table holds one 4-byte entry per constant. Each entry is
// an offset into the raw constants, so it has to be added to the start address
// of that section before decoding the value.
std::string Bytecode::get_constant_string(uint32_t idx) const
{
    uint32_t const offset = byte_array_.get_uint32(cache_.constants_idx_addr + (idx * 4));
    return byte_array_.get_string(cache_.constants_start_addr + offset).first;
}
// Returns the parameter and local-variable counts stored for `function_id`.
//
// A function record is 8 bytes: [0:3] code offset, [4:5] number of parameters,
// [6:7] number of local variables. Only the two counts are read here.
Bytecode::FunctionDef Bytecode::get_function_def(uint32_t function_id) const
{
    uint32_t const record_addr = cache_.functions_idx_addr + (function_id * 8);
    return {
        .n_params = byte_array_.get_uint16(record_addr + 4),
        .locals   = byte_array_.get_uint16(record_addr + 6),
    };
}
// Returns the code byte located `idx` bytes after the start of function
// `function_id`. The function start address comes from the cache built by the
// constructor; an unknown `function_id` makes the lookup throw
// std::out_of_range.
uint8_t Bytecode::get_code_byte(uint32_t function_id, uint32_t idx) const
{
    uint32_t const function_start = cache_.function_addr.at(function_id);
    return byte_array_.get_byte(function_start + idx);
}
int32_t Bytecode::get_code_int(uint32_t function_id, uint32_t idx) const
std::pair<int32_t, size_t> Bytecode::get_code_int(uint32_t function_id, uint32_t idx) const
{
return 0;
return byte_array_.get_int(cache_.function_addr.at(function_id) + idx);
}
// Decodes the float operand located `idx` bytes after the start of function
// `function_id`. An unknown `function_id` makes the lookup throw
// std::out_of_range.
float Bytecode::get_code_float(uint32_t function_id, uint32_t idx) const
{
    uint32_t const function_start = cache_.function_addr.at(function_id);
    return byte_array_.get_float(function_start + idx);
}
// Serializes a bytecode prototype into a binary bytecode image.
//
// Sections are built separately and then laid out in this order: header
// (magic number + version byte), table of contents, constant index table,
// function index table, raw constants, raw code.
ByteArray Bytecode::generate(BytecodePrototype const& bp)
{
    // header section
    ByteArray header;
    header.set_uint32(0, MAGIC_NUMBER);
    header.set_byte(4, BYTECODE_VERSION);

    // constants: each index entry is the offset of the constant inside the
    // raw constants, i.e. the size of the raw data before it is appended
    ByteArray constant_indexes;
    ByteArray raw_constants;
    for (auto const& constant: bp.constants) {
        constant_indexes.append_uint32(raw_constants.size());
        std::visit(overloaded {
            [&](int32_t i) { raw_constants.append_int(i); },
            [&](float f) { raw_constants.append_float(f); },
            [&](std::string const& s) { raw_constants.append_string(s); },
        }, constant);
    }

    // functions: one FUNCTION_RECORD_SZ-byte record per function, made of
    // [0:3] code offset inside the raw code, [4:5] n_pars, [6:7] n_locals
    ByteArray functions_indexes;
    ByteArray raw_code;
    for (auto const& f: bp.functions) {
        functions_indexes.append_uint32(raw_code.size());
        functions_indexes.append_uint16(f.n_pars);
        functions_indexes.append_uint16(f.n_locals);
        raw_code.append_bytearray(f.code);
    }

    // table of contents
    uint32_t const function_idx_start = CONST_IDX_START + constant_indexes.size();
    uint32_t const raw_constant_start = function_idx_start + functions_indexes.size();
    uint32_t const raw_code_start = raw_constant_start + raw_constants.size();

    ByteArray toc;
    if (!bp.constants.empty()) {
        toc.set_uint32(SEC_CONST_IDX * TOC_RECORD_SZ, CONST_IDX_START);
        toc.set_uint32(SEC_CONST_IDX * TOC_RECORD_SZ + 4, constant_indexes.size() / CONST_RECORD_SZ);
        toc.set_uint32(SEC_CONST_DATA * TOC_RECORD_SZ, raw_constant_start);
        toc.set_uint32(SEC_CONST_DATA * TOC_RECORD_SZ + 4, raw_constants.size());
    }
    if (!bp.functions.empty()) {
        toc.set_uint32(SEC_FUNC_IDX * TOC_RECORD_SZ, function_idx_start);
        toc.set_uint32(SEC_FUNC_IDX * TOC_RECORD_SZ + 4, functions_indexes.size() / FUNCTION_RECORD_SZ);
        toc.set_uint32(SEC_CODE * TOC_RECORD_SZ, raw_code_start);
        toc.set_uint32(SEC_CODE * TOC_RECORD_SZ + 4, raw_code.size());
    }

    // Always emit the full table of contents. Without this, a prototype with
    // no constants and no functions would produce an image shorter than
    // TOC_START + TOC_SZ, which the Bytecode constructor rejects as too short.
    if (toc.size() < TOC_SZ)
        toc.set_byte(TOC_SZ - 1, 0);

    //
    // assemble bytecode
    //
    ByteArray ba;
    ba.set_bytearray(0, header);
    ba.set_bytearray(TOC_START, toc);
    ba.set_bytearray(CONST_IDX_START, constant_indexes);
    ba.set_bytearray(function_idx_start, functions_indexes);
    ba.set_bytearray(raw_constant_start, raw_constants);
    ba.set_bytearray(raw_code_start, raw_code);

    return ba;
}

View File

@@ -20,9 +20,9 @@ public:
struct FunctionDef { uint16_t n_params, locals; };
[[nodiscard]] FunctionDef get_function_def(uint32_t function_id) const;
[[nodiscard]] uint8_t get_code_byte(uint32_t function_id, uint32_t idx) const;
[[nodiscard]] int32_t get_code_int(uint32_t function_id, uint32_t idx) const;
[[nodiscard]] float get_code_float(uint32_t function_id, uint32_t idx) const;
[[nodiscard]] uint8_t get_code_byte(uint32_t function_id, uint32_t idx) const;
[[nodiscard]] std::pair<int32_t, size_t> get_code_int(uint32_t function_id, uint32_t idx) const;
[[nodiscard]] float get_code_float(uint32_t function_id, uint32_t idx) const;
// TODO - debugging info
@@ -31,10 +31,17 @@ public:
private:
ByteArray byte_array_; // the actual data
static constexpr uint8_t VERSION = 1;
static constexpr uint32_t MAGIC = 0x74b3c138;
static constexpr uint32_t HEADER_SZ = 16,
INDEX_SZ = 8 * 6;
static constexpr uint8_t BYTECODE_VERSION = 1;
static constexpr uint32_t MAGIC_NUMBER = 0x74b3c138;
static constexpr uint32_t TOC_START = 16,
TOC_N_RECORDS = 8,
TOC_RECORD_SZ = 8,
TOC_SZ = TOC_N_RECORDS * TOC_RECORD_SZ;
static constexpr uint32_t CONST_IDX_START = TOC_START + TOC_SZ,
CONST_RECORD_SZ = 4;
static constexpr uint32_t FUNCTION_RECORD_SZ = 8;
enum Sections { SEC_CONST_IDX = 0, SEC_FUNC_IDX = 1, SEC_CONST_DATA = 2, SEC_CODE = 3 };
// caching for faster reading of data
struct Cache {
@@ -43,7 +50,7 @@ private:
uint32_t constants_start_addr;
uint32_t functions_idx_addr;
uint32_t n_functions;
uint32_t functions_start_addr;
std::vector<uint32_t> function_addr;
};
Cache cache_ {};
};

View File

@@ -44,9 +44,9 @@ TEST(ByteArray, ByteArray)
ba.set_int(1, 5000300); ASSERT_EQ(ba.get_int(1), std::make_pair(5000300, 4));
ba.set_int(1, -5000300); ASSERT_EQ(ba.get_int(1), std::make_pair(-5000300, 4));
ba.set_float(1, 3.14); ASSERT_FLOAT_EQ(ba.get_float(1).first, 3.14);
ba.set_float(1, -3.14); ASSERT_FLOAT_EQ(ba.get_float(1).first, -3.14);
ba.set_float(1, -5000300.1324); ASSERT_FLOAT_EQ(ba.get_float(1).first, -5000300.1324);
ba.set_float(1, 3.14); ASSERT_FLOAT_EQ(ba.get_float(1), 3.14);
ba.set_float(1, -3.14); ASSERT_FLOAT_EQ(ba.get_float(1), -3.14);
ba.set_float(1, -5000300.1324); ASSERT_FLOAT_EQ(ba.get_float(1), -5000300.1324);
ba.set_string(1, "Hello world!"); ASSERT_EQ(ba.get_string(1), std::make_pair("Hello world!", 13));
@@ -67,14 +67,14 @@ TEST(Bytecode, Constants)
0x00, 0x00, 0x00, 0x00,
// index
0x40, 0x00, 0x00, 0x00, 0x02, 0x00, // constants
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // functions
0x48, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x50, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, // constant index
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // function undex
0x58, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, // raw constants
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // raw code
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// constant indexes
0x00, 0x00, 0x00, 0x00,
@@ -85,6 +85,7 @@ TEST(Bytecode, Constants)
};
ByteArray ba = Bytecode::generate(bp);
print(ba.data()); print(expected);
ASSERT_EQ(ba.data(), expected);
}
@@ -106,14 +107,14 @@ TEST(Bytecode, Code)
0x00, 0x00, 0x00, 0x00,
// index
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // constants
0x40, 0x00, 0x00, 0x00, 0x02, 0x00, // variables
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x50, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // constant index
0x50, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, // variable index
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // raw constants
0x60, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, // raw code
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// function definitions
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -142,8 +143,8 @@ TEST(Bytecode, Parsing)
f.code.append_byte(0x68);
f.code.append_int(42);
auto& f2 = bp.functions.emplace_back(2, 1);
f2.code.append_byte(0x42);
auto& ff = bp.functions.emplace_back(2, 1);
ff.code.append_byte(0x42);
ByteArray ba = Bytecode::generate(bp);
print(ba.data());
@@ -158,6 +159,18 @@ TEST(Bytecode, Parsing)
ASSERT_EQ(bc.get_constant_int(0), 42);
ASSERT_FLOAT_EQ(bc.get_constant_float(1), 3.14f);
ASSERT_EQ(bc.get_constant_string(2), "HELLO");
Bytecode::FunctionDef f1 = bc.get_function_def(0);
ASSERT_EQ(f1.n_params, 0);
ASSERT_EQ(f1.locals, 0);
Bytecode::FunctionDef f2 = bc.get_function_def(1);
ASSERT_EQ(f2.n_params, 2);
ASSERT_EQ(f2.locals, 1);
ASSERT_EQ(bc.get_code_byte(0, 0), 0x68);
ASSERT_EQ(bc.get_code_int(0, 1), std::make_pair(42, 1));
ASSERT_EQ(bc.get_code_byte(1, 0), 0x42);
}
int main(int argc, char** argv)