cjit update

This commit is contained in:
2026-01-22 22:30:58 +01:00
parent eb3c3814ec
commit 868550703f
7 changed files with 530 additions and 149 deletions

View File

@@ -1,149 +1,6 @@
/// Reserved words recognized as keyword tokens. Each variant is spelled
/// exactly like the keyword text; the listed names correspond to the C11
/// keyword set. Variants whose spelling is also a Zig keyword are written
/// with the `@"..."` quoted-identifier syntax.
pub const Keyword = enum {
// Underscore-prefixed keywords.
_Alignas,
_Alignof,
_Atomic,
_Bool,
_Complex,
_Generic,
_Imaginary,
_Noreturn,
_Static_assert,
_Thread_local,
// Remaining keywords, in alphabetical order.
auto,
@"break",
case,
char,
@"const",
@"continue",
default,
do,
double,
@"else",
@"enum",
@"extern",
float,
@"for",
goto,
@"if",
@"inline",
int,
long,
register,
restrict,
@"return",
short,
signed,
sizeof,
static,
@"struct",
@"switch",
typedef,
@"union",
unsigned,
void,
@"volatile",
@"while",
};
/// An identifier, represented by its spelling.
pub const Identifier = struct {
/// Bytes of the identifier.
/// NOTE(review): who owns this slice is not visible here — confirm with the producer.
name: []const u8,
};
/// Value of a numeric or character constant. The active tag names the
/// constant's kind and the payload carries its value.
pub const Constant = union(enum) {
// Signed integer kinds.
int: i32,
long: i64,
long_long: i64,
// Unsigned integer kinds.
unsigned_int: u32,
unsigned_long: u64,
unsigned_long_long: u64,
// Floating-point kinds.
float: f32,
double: f64,
// Character kinds: a narrow character is one byte, a wide one is 32 bits.
character: u8,
wide_character: u32,
};
/// A string literal, represented by its bytes.
pub const StringLiteral = struct {
/// Bytes of the literal.
/// NOTE(review): whether escapes are already decoded is not visible here — confirm.
value: []const u8,
};
/// Punctuator tokens. Each variant is named after its exact spelling (using
/// Zig's `@"..."` quoted identifiers) and the variants are grouped by length,
/// longest first.
pub const Punctuator = enum {
// three characters
@"...",
@"<<=",
@">>=",
// two characters
@"--",
@"-=",
@"->",
@"!=",
@"*=",
@"/=",
@"&&",
@"&=",
@"##",
@"%=",
@"^=",
@"++",
@"+=",
@"<<",
@"<=",
@"==",
@">=",
@">>",
@"|=",
@"||",
// single character
@"-",
@",",
@";",
@":",
@"!",
@"?",
@".",
@"(",
@")",
@"[",
@"]",
@"{",
@"}",
@"*",
@"/",
@"&",
@"#",
@"%",
@"^",
@"+",
@"<",
@"=",
@">",
@"|",
@"~",
};
/// A single lexical token. The active tag selects the token category and the
/// payload carries that category's data.
pub const Token = union(enum) {
keyword: Keyword,
// Spelling of the identifier.
identifier: []const u8,
constant: Constant,
// Narrow string literal as bytes.
string_literal: []const u8,
// Wide string literal as 32-bit elements.
wide_string_literal: []const u32,
punctuator: Punctuator,
};
/// Reports whether `code_point` may begin an identifier: an ASCII letter, an
/// underscore, or any code point with value 128 or above.
pub fn isIdentifierStart(code_point: u21) bool {
    return switch (code_point) {
        'A'...'Z', 'a'...'z', '_' => true,
        else => code_point >= 128,
    };
}
/// Reports whether `code_point` may appear after the first character of an
/// identifier: an ASCII digit, an ASCII letter, an underscore, or any code
/// point with value 128 or above.
pub fn isIdentifierMiddle(code_point: u21) bool {
    return switch (code_point) {
        '0'...'9', 'A'...'Z', 'a'...'z', '_' => true,
        else => code_point >= 128,
    };
}
// Re-export the token types themselves, not the files that declare them:
// each of these four files wraps its type in a same-named `pub const`, so
// importing the file alone would yield a namespace instead of the
// enum/union.
pub const Constant = @import("tokens/Constant.zig").Constant;
pub const Keyword = @import("tokens/Keyword.zig").Keyword;
pub const Punctuator = @import("tokens/Punctuator.zig").Punctuator;
pub const Token = @import("tokens/Token.zig").Token;
// These two files declare their fields at file scope, so the file itself is
// the type.
pub const Tokenizer = @import("tokens/Tokenizer.zig");
pub const Utf8Iterator = @import("tokens/Utf8Iterator.zig");

View File

@@ -0,0 +1,14 @@
const std = @import("std");
/// Value of a numeric or character constant. The active tag names the
/// constant's kind and the payload carries its value.
pub const Constant = union(enum) {
// Signed integer kinds.
int: i32,
long: i64,
long_long: i64,
// Unsigned integer kinds.
unsigned_int: u32,
unsigned_long: u64,
unsigned_long_long: u64,
// Floating-point kinds.
float: f32,
double: f64,
// Character kinds: a narrow character is one byte, a wide one is a signed
// 32-bit value.
character: u8,
wide_character: i32,
};

View File

@@ -0,0 +1,63 @@
const std = @import("std");
/// Reserved words recognized as keyword tokens. Each variant is spelled
/// exactly like the keyword text; variants whose spelling is also a Zig
/// keyword use the `@"..."` quoted-identifier syntax.
pub const Keyword = enum {
    _Alignas,
    _Alignof,
    _Atomic,
    _Bool,
    _Complex,
    _Generic,
    _Imaginary,
    _Noreturn,
    _Static_assert,
    _Thread_local,
    auto,
    @"break",
    case,
    char,
    @"const",
    @"continue",
    default,
    do,
    double,
    @"else",
    @"enum",
    @"extern",
    float,
    @"for",
    goto,
    @"if",
    @"inline",
    int,
    long,
    register,
    restrict,
    @"return",
    short,
    signed,
    sizeof,
    static,
    @"struct",
    @"switch",
    typedef,
    @"union",
    unsigned,
    void,
    @"volatile",
    @"while",

    /// Lookup table from keyword spelling to variant, generated at compile
    /// time from the enum's own field list so the two cannot drift apart.
    pub const map: std.StaticStringMap(Keyword) = build: {
        const variants = @typeInfo(Keyword).@"enum".fields;
        const Entry = struct { []const u8, Keyword };
        var entries: [variants.len]Entry = undefined;
        for (&entries, variants) |*entry, variant| {
            entry.* = .{ variant.name, @field(Keyword, variant.name) };
        }
        break :build .initComptime(entries);
    };

    /// Returns the keyword spelled exactly like `identifier`, or `null` when
    /// `identifier` is not a keyword.
    pub fn isKeyword(identifier: []const u8) ?Keyword {
        return map.get(identifier);
    }
};

View File

@@ -0,0 +1,70 @@
const std = @import("std");
/// Punctuator tokens. Each variant is named after its exact spelling and its
/// integer value is that spelling's bytes reinterpreted as an integer by the
/// matching `strToIntN` helper, so an integer built the same way from source
/// bytes can be compared against (or converted to) a variant directly.
pub const Punctuator = enum(u32) {
// three characters
@"..." = strToInt3("..."),
@"<<=" = strToInt3("<<="),
@">>=" = strToInt3(">>="),
// two characters
@"--" = strToInt2("--"),
@"-=" = strToInt2("-="),
@"->" = strToInt2("->"),
@"!=" = strToInt2("!="),
@"*=" = strToInt2("*="),
@"/=" = strToInt2("/="),
@"&&" = strToInt2("&&"),
@"&=" = strToInt2("&="),
@"##" = strToInt2("##"),
@"%=" = strToInt2("%="),
@"^=" = strToInt2("^="),
@"++" = strToInt2("++"),
@"+=" = strToInt2("+="),
@"<<" = strToInt2("<<"),
@"<=" = strToInt2("<="),
@"==" = strToInt2("=="),
@">=" = strToInt2(">="),
@">>" = strToInt2(">>"),
@"|=" = strToInt2("|="),
@"||" = strToInt2("||"),
// single character
@"-" = strToInt1("-"),
@"," = strToInt1(","),
@";" = strToInt1(";"),
@":" = strToInt1(":"),
@"!" = strToInt1("!"),
@"?" = strToInt1("?"),
@"." = strToInt1("."),
@"(" = strToInt1("("),
@")" = strToInt1(")"),
@"[" = strToInt1("["),
@"]" = strToInt1("]"),
@"{" = strToInt1("{"),
@"}" = strToInt1("}"),
@"*" = strToInt1("*"),
@"/" = strToInt1("/"),
@"&" = strToInt1("&"),
@"#" = strToInt1("#"),
@"%" = strToInt1("%"),
@"^" = strToInt1("^"),
@"+" = strToInt1("+"),
@"<" = strToInt1("<"),
@"=" = strToInt1("="),
@">" = strToInt1(">"),
@"|" = strToInt1("|"),
@"~" = strToInt1("~"),
// Not punctuators: the same byte-packing applied to a backslash followed by
// LF, and to a backslash followed by CR LF.
pub const line_continuation_lf = strToInt2("\\\n");
pub const line_continuation_crlf = strToInt3("\\\r\n");
};
/// Reinterprets a one-byte string as an integer, widened to `u32`.
///
/// The parameter is a pointer to a one-element byte array (`*const [1]u8`),
/// which one-character string literals coerce to.
pub fn strToInt1(str: *const [1]u8) u32 {
    return @as(u8, @bitCast(str.*));
}
/// Reinterprets a two-byte string as a `u16` and widens it to `u32`.
///
/// The parameter is a pointer to a two-element byte array (`*const [2]u8`),
/// which two-character string literals coerce to. The bytes are reinterpreted
/// with `@bitCast`, so which byte ends up least significant follows the
/// target's byte order.
pub fn strToInt2(str: *const [2]u8) u32 {
    return @as(u16, @bitCast(str.*));
}
/// Reinterprets a three-byte string as a `u24` and widens it to `u32`.
///
/// The parameter is a pointer to a three-element byte array (`*const [3]u8`),
/// which three-character string literals coerce to. The bytes are
/// reinterpreted with `@bitCast`, so which byte ends up least significant
/// follows the target's byte order.
pub fn strToInt3(str: *const [3]u8) u32 {
    return @as(u24, @bitCast(str.*));
}

View File

@@ -0,0 +1,14 @@
const std = @import("std");
pub const Constant = @import("Constant.zig");
pub const Keyword = @import("Keyword.zig");
pub const Punctuator = @import("Punctuator.zig");
/// A single lexical token. The active tag selects the token category and the
/// payload carries that category's data.
///
/// `Keyword`, `Constant` and `Punctuator` in this file are the imported file
/// namespaces, so the payload types are reached through them
/// (`Keyword.Keyword`, ...). Using the namespaces themselves as field types
/// would make the payloads empty structs rather than the enum/union values.
pub const Token = union(enum) {
    keyword: Keyword.Keyword,
    /// Spelling of the identifier.
    identifier: []const u8,
    constant: Constant.Constant,
    /// Narrow string literal; the slice is zero-terminated.
    string_literal: [:0]const u8,
    /// Wide string literal as 32-bit elements; the slice is zero-terminated.
    wide_string_literal: [:0]const u32,
    punctuator: Punctuator.Punctuator,
};

View File

@@ -0,0 +1,260 @@
const std = @import("std");
const Self = @This();
const Keyword = @import("Keyword.zig").Keyword;
const Punctuator = @import("Punctuator.zig").Punctuator;
const Token = @import("Token.zig").Token;
const Utf8Iterator = @import("Utf8Iterator.zig");
/// Capacity, in elements, of the buffer backing `string`.
pub const max_string_length = 4096;
/// Capacity, in elements, of the buffer backing `wide_string`.
pub const max_wide_string_length = 4096;
/// Name of the source currently being tokenized.
filename: []const u8,
/// Cursor over the source text; tracks byte offset, line and column.
it: Utf8Iterator,
/// Map from a name to a token sequence.
/// NOTE(review): presumably preprocessor `#define` bodies — nothing in this file populates it yet.
defines: std.StringHashMapUnmanaged([]Token) = .{},
/// Bounded, preallocated with the capacity of `max_string_length`.
string: std.ArrayList(u8),
/// Bounded, preallocated with the capacity of `max_wide_string_length`.
wide_string: std.ArrayList(u32),
/// Creates a tokenizer positioned at the start of `code`.
///
/// Allocates the fixed-capacity scratch buffers for narrow and wide string
/// literals from `arena_allocator`; nothing here frees them, so they live as
/// long as the allocator's memory does. Returns the allocator's error if
/// either allocation fails.
pub fn init(filename: []const u8, code: []const u8, arena_allocator: std.mem.Allocator) !Self {
    const string_buffer = try arena_allocator.alloc(u8, max_string_length);
    // The wide-string list stores `u32` elements, so its backing buffer must
    // be allocated with that element type.
    const wide_string_buffer = try arena_allocator.alloc(u32, max_wide_string_length);
    return .{
        .filename = filename,
        .it = .init(code),
        .string = .initBuffer(string_buffer),
        .wide_string = .initBuffer(wide_string_buffer),
    };
}
/// Points the tokenizer at a new source: restarts the cursor at the beginning
/// of `code` and records `filename` as the current file name. Other state
/// (`defines` and the string buffers) is left untouched.
pub fn setSource(self: *Self, filename: []const u8, code: []const u8) void {
    self.it = Utf8Iterator.init(code);
    self.filename = filename;
}
/// Scans and returns the next token, or `null` once the input is exhausted.
///
/// Leading whitespace and line continuations are skipped first. An identifier
/// is looked up in `Keyword`; a non-keyword identifier is copied with
/// `arena_allocator`, so the returned slice does not alias the source buffer.
/// Punctuators are matched longest first (three bytes, then two, then one).
///
/// Errors: whatever peeking the next code point reports for malformed input,
/// allocation failure from `arena_allocator`, and `error.InvalidToken` when
/// nothing matches.
///
/// String, character, wide-string, wide-character and numeric literals are
/// not implemented yet (see the TODOs below).
pub fn nextToken(self: *Self, arena_allocator: std.mem.Allocator) !?Token {
    try self.skipWhitespace();
    // TODO Skip C and C++ style comments
    // TODO Preprocessor directives
    const cp = try self.peekCodepointSkipLineContinuation() orelse return null;
    switch (cp) {
        // Identifier start
        'A'...'Z', '_', 'a'...'z', 128...std.math.maxInt(u21) => {
            // This is an identifier, with the possible exception of:
            // - wide string: L"
            // - wide char: L'
            // - any keyword
            if (cp == 'L') {
                const state = self.it.save();
                self.it.advanceCodepoint(cp);
                // End of input after the `L` is treated like any other
                // non-quote character: plain identifier.
                const cp2 = try self.peekCodepointSkipLineContinuation() orelse 0;
                switch (cp2) {
                    // Wide string
                    '\"' => {
                        self.it.advanceCodepoint(cp2);
                        self.wide_string.clearRetainingCapacity();
                        // TODO Parse wide string
                        // NOTE(review): until this is implemented, control
                        // falls through to the identifier scan below with the
                        // cursor already past `L"`.
                    },
                    // Wide char
                    '\'' => {
                        self.it.advanceCodepoint(cp2);
                        // TODO Parse wide char
                        // NOTE(review): same fall-through as the wide-string
                        // case above.
                    },
                    // Identifier or keyword
                    else => {
                        self.it.restore(state);
                    },
                }
            }
            const identifier_start = self.it.ptr;
            self.it.advanceCodepoint(cp);
            var next_cp = try self.peekCodepointSkipLineContinuation();
            while (next_cp != null and isIdentifierMiddle(next_cp.?)) {
                self.it.advanceCodepoint(next_cp.?);
                next_cp = try self.peekCodepointSkipLineContinuation();
            }
            // The source bytes are owned by the iterator, not the tokenizer.
            // NOTE(review): this is a raw slice of the source, so a line
            // continuation skipped inside the identifier is still included.
            const identifier = self.it.str[identifier_start..self.it.ptr];
            if (Keyword.isKeyword(identifier)) |keyword| {
                return .{ .keyword = keyword };
            } else {
                // TODO Preprocessor
                return .{ .identifier = try arena_allocator.dupe(u8, identifier) };
            }
        },
        // String
        '\"' => {
            self.it.advanceCodepoint(cp);
            self.string.clearRetainingCapacity();
            // TODO Parse string
        },
        // Char
        '\'' => {
            self.it.advanceCodepoint(cp);
            // TODO Parse char
        },
        // Everything else (digits, punctuation, control characters) is
        // handled by the code after this switch.
        else => {},
    }
    // Higher code points should've been already handled. The code below may
    // assume that `cp` is an ASCII character.
    std.debug.assert(cp < 128);
    // TODO Numeric constants
    const cp3 = self.it.peekThreeBytes().?;
    switch (cp3 & 0x00_FF_FF_FF) {
        inline @intFromEnum(Punctuator.@"..."),
        @intFromEnum(Punctuator.@"<<="),
        @intFromEnum(Punctuator.@">>="),
        => |p| {
            self.it.ptr += 3;
            self.it.col += 3;
            return .{
                .punctuator = @enumFromInt(p),
            };
        },
        else => {},
    }
    switch (cp3 & 0x00_00_FF_FF) {
        inline @intFromEnum(Punctuator.@"--"),
        @intFromEnum(Punctuator.@"-="),
        @intFromEnum(Punctuator.@"->"),
        @intFromEnum(Punctuator.@"!="),
        @intFromEnum(Punctuator.@"*="),
        @intFromEnum(Punctuator.@"/="),
        @intFromEnum(Punctuator.@"&&"),
        @intFromEnum(Punctuator.@"&="),
        @intFromEnum(Punctuator.@"##"),
        @intFromEnum(Punctuator.@"%="),
        @intFromEnum(Punctuator.@"^="),
        @intFromEnum(Punctuator.@"++"),
        @intFromEnum(Punctuator.@"+="),
        @intFromEnum(Punctuator.@"<<"),
        @intFromEnum(Punctuator.@"<="),
        @intFromEnum(Punctuator.@"=="),
        @intFromEnum(Punctuator.@">="),
        @intFromEnum(Punctuator.@">>"),
        @intFromEnum(Punctuator.@"|="),
        @intFromEnum(Punctuator.@"||"),
        => |p| {
            self.it.ptr += 2;
            self.it.col += 2;
            return .{
                .punctuator = @enumFromInt(p),
            };
        },
        else => {},
    }
    switch (cp3 & 0x00_00_00_FF) {
        inline @intFromEnum(Punctuator.@"-"),
        @intFromEnum(Punctuator.@","),
        @intFromEnum(Punctuator.@";"),
        @intFromEnum(Punctuator.@":"),
        @intFromEnum(Punctuator.@"!"),
        @intFromEnum(Punctuator.@"?"),
        @intFromEnum(Punctuator.@"."),
        @intFromEnum(Punctuator.@"("),
        @intFromEnum(Punctuator.@")"),
        @intFromEnum(Punctuator.@"["),
        @intFromEnum(Punctuator.@"]"),
        @intFromEnum(Punctuator.@"{"),
        @intFromEnum(Punctuator.@"}"),
        @intFromEnum(Punctuator.@"*"),
        @intFromEnum(Punctuator.@"/"),
        @intFromEnum(Punctuator.@"&"),
        @intFromEnum(Punctuator.@"#"),
        @intFromEnum(Punctuator.@"%"),
        @intFromEnum(Punctuator.@"^"),
        @intFromEnum(Punctuator.@"+"),
        @intFromEnum(Punctuator.@"<"),
        @intFromEnum(Punctuator.@"="),
        @intFromEnum(Punctuator.@">"),
        @intFromEnum(Punctuator.@"|"),
        @intFromEnum(Punctuator.@"~"),
        => |p| {
            self.it.ptr += 1;
            self.it.col += 1;
            return .{
                .punctuator = @enumFromInt(p),
            };
        },
        else => {},
    }
    return error.InvalidToken;
}
/// Steps over any run of line continuations at the cursor, then peeks (without
/// consuming) the code point that follows. Returns `null` at end of input and
/// forwards the iterator's error when the bytes there cannot be decoded.
fn peekCodepointSkipLineContinuation(self: *Self) !?u21 {
    var skipped = self.skipLineContinuation();
    while (skipped) {
        skipped = self.skipLineContinuation();
    }
    return self.it.peekCodepoint();
}
/// A line continuation is a backslash immediately followed by LF or CRLF.
/// When the cursor sits on one, steps over it (landing on column 1 of the
/// next line) and returns `true`; otherwise leaves the cursor untouched and
/// returns `false`.
fn skipLineContinuation(self: *Self) bool {
    if (self.it.peekThreeBytes()) |window| {
        @branchHint(.likely);
        // Number of bytes the continuation occupies; zero means "none here".
        var continuation_len: usize = 0;
        if (window & 0x00_00_FF_FF == Punctuator.line_continuation_lf) {
            @branchHint(.unlikely);
            continuation_len = 2;
        } else if (window & 0x00_FF_FF_FF == Punctuator.line_continuation_crlf) {
            @branchHint(.unlikely);
            continuation_len = 3;
        }
        if (continuation_len != 0) {
            self.it.ptr += continuation_len;
            self.it.line += 1;
            self.it.col = 1;
            return true;
        }
    }
    return false;
}
/// Advances the cursor past whitespace (and any line continuations between
/// whitespace characters), stopping at the first other code point or at end
/// of input. Forwards the error from peeking when the input cannot be
/// decoded.
fn skipWhitespace(self: *Self) !void {
    while (try self.peekCodepointSkipLineContinuation()) |cp| {
        switch (cp) {
            // <Character Tabulation> (HT, TAB)
            0x0009,
            // <End of Line> (EOL, LF, NL)
            0x000A,
            // <Line Tabulation> (VT)
            0x000B,
            // <Form Feed> (FF)
            0x000C,
            // <Carriage Return> (CR)
            0x000D,
            // Space (SP)
            0x0020,
            // `advanceCodepoint` returns `void`, so there is nothing to `try`.
            => self.it.advanceCodepoint(cp),
            else => return,
        }
    }
}
/// Reports whether `code_point` may appear after the first character of an
/// identifier: an ASCII digit, an ASCII letter, an underscore, or any code
/// point with value 128 or above.
fn isIdentifierMiddle(code_point: u21) bool {
    return switch (code_point) {
        '0'...'9', 'A'...'Z', 'a'...'z', '_' => true,
        else => code_point >= 128,
    };
}

View File

@@ -0,0 +1,103 @@
const std = @import("std");
const Self = @This();
/// The text being iterated.
str: []const u8,
/// Byte offset of the cursor into `str`.
ptr: usize,
/// Current line number; starts at 1.
line: usize,
/// Current column number; starts at 1.
col: usize,
/// Snapshot of the cursor position (everything except the text itself), as
/// produced by `save` and consumed by `restore`.
pub const State = struct {
ptr: usize,
line: usize,
col: usize,
};
/// Creates an iterator over `str` with the cursor at byte 0, line 1,
/// column 1. The slice is borrowed, not copied.
pub fn init(str: []const u8) Self {
    return Self{
        .ptr = 0,
        .line = 1,
        .col = 1,
        .str = str,
    };
}
/// Captures the current cursor position so it can later be handed back to
/// `restore`.
pub fn save(self: Self) State {
    return State{
        .col = self.col,
        .line = self.line,
        .ptr = self.ptr,
    };
}
/// Moves the cursor back to a position previously captured with `save`. The
/// text being iterated is not changed.
pub fn restore(self: *Self, state: State) void {
    self.col = state.col;
    self.line = state.line;
    self.ptr = state.ptr;
}
/// Returns the byte at the cursor without consuming it, or `null` when the
/// cursor is at or past the end of the text.
pub fn peekByte(self: *Self) ?u8 {
    return if (self.ptr < self.str.len) self.str[self.ptr] else null;
}
/// Decodes the UTF-8 code point at the cursor without consuming it.
///
/// Returns `null` when the cursor is at or past the end of the text, and
/// `error.InvalidUtf8` when the lead byte is invalid, the sequence is cut off
/// by the end of the text, or the sequence fails to decode.
pub fn peekCodepoint(self: Self) !?u21 {
    if (self.ptr >= self.str.len) return null;

    const remaining = self.str[self.ptr..];
    const seq_len = std.unicode.utf8ByteSequenceLength(remaining[0]) catch return error.InvalidUtf8;
    if (seq_len > remaining.len) return error.InvalidUtf8;

    return std.unicode.utf8Decode(remaining[0..seq_len]) catch return error.InvalidUtf8;
}
/// Returns up to three bytes starting at the cursor, reinterpreted as one
/// integer, without consuming them. When fewer than three bytes remain the
/// missing ones read as zero. Returns `null` when no bytes remain.
///
/// The bytes are reinterpreted with `@bitCast`, so which byte ends up least
/// significant follows the target's byte order.
pub fn peekThreeBytes(self: Self) ?u32 {
    const available = self.str.len - self.ptr;
    if (available == 0) return null;

    var window = [3]u8{ 0, 0, 0 };
    const take = @min(available, window.len);
    @memcpy(window[0..take], self.str[self.ptr..][0..take]);
    return @as(u24, @bitCast(window));
}
/// Advances the cursor past `bytes` single-byte (ASCII) characters, moving the
/// column forward by the same amount.
///
/// The line number is not touched, so the skipped bytes must not contain a
/// newline. Asserts that the skipped range lies within the text.
pub fn advanceAsciiBytes(self: *Self, bytes: usize) void {
    // Compare byte offsets (`self.ptr`), not the slice's data pointer.
    std.debug.assert(self.ptr + bytes <= self.str.len);
    self.ptr += bytes;
    self.col += bytes;
}
/// Consumes the code point at the cursor. `cp` must be the value just
/// returned by `peekCodepoint`; this is asserted.
///
/// Moves the byte offset by the UTF-8 length of `cp`. A newline starts a new
/// line at column 1; any other code point moves one column to the right.
pub fn advanceCodepoint(self: *Self, cp: u21) void {
    const matches_peek = if (self.peekCodepoint()) |peeked|
        (if (peeked) |actual| actual == cp else false)
    else |_|
        false;
    std.debug.assert(matches_peek);

    // Cannot fail for a value that `peekCodepoint` just decoded.
    self.ptr += std.unicode.utf8CodepointSequenceLength(cp) catch unreachable;

    if (cp == '\n') {
        self.line += 1;
        self.col = 1;
    } else {
        self.col += 1;
    }
}