cjit update

2026-01-22 22:30:58 +01:00
parent eb3c3814ec
commit 868550703f
7 changed files with 530 additions and 149 deletions
--- a/packages/cjit/src/tokens.zig
+++ b/packages/cjit/src/tokens.zig
@@ -1,149 +1,6 @@
-pub const Keyword = enum {
-    _Alignas,
-    _Alignof,
-    _Atomic,
-    _Bool,
-    _Complex,
-    _Generic,
-    _Imaginary,
-    _Noreturn,
-    _Static_assert,
-    _Thread_local,
-    auto,
-    @"break",
-    case,
-    char,
-    @"const",
-    @"continue",
-    default,
-    do,
-    double,
-    @"else",
-    @"enum",
-    @"extern",
-    float,
-    @"for",
-    goto,
-    @"if",
-    @"inline",
-    int,
-    long,
-    register,
-    restrict,
-    @"return",
-    short,
-    signed,
-    sizeof,
-    static,
-    @"struct",
-    @"switch",
-    typedef,
-    @"union",
-    unsigned,
-    void,
-    @"volatile",
-    @"while",
-};
-
-pub const Identifier = struct {
-    name: []const u8,
-};
-
-pub const Constant = union(enum) {
-    int: i32,
-    long: i64,
-    long_long: i64,
-    unsigned_int: u32,
-    unsigned_long: u64,
-    unsigned_long_long: u64,
-    float: f32,
-    double: f64,
-    character: u8,
-    wide_character: u32,
-};
-
-pub const StringLiteral = struct {
-    value: []const u8,
-};
-
-pub const Punctuator = enum {
-    // three characters
-    @"...",
-    @"<<=",
-    @">>=",
-    // two characters
-    @"--",
-    @"-=",
-    @"->",
-    @"!=",
-    @"*=",
-    @"/=",
-    @"&&",
-    @"&=",
-    @"##",
-    @"%=",
-    @"^=",
-    @"++",
-    @"+=",
-    @"<<",
-    @"<=",
-    @"==",
-    @">=",
-    @">>",
-    @"|=",
-    @"||",
-    // single character
-    @"-",
-    @",",
-    @";",
-    @":",
-    @"!",
-    @"?",
-    @".",
-    @"(",
-    @")",
-    @"[",
-    @"]",
-    @"{",
-    @"}",
-    @"*",
-    @"/",
-    @"&",
-    @"#",
-    @"%",
-    @"^",
-    @"+",
-    @"<",
-    @"=",
-    @">",
-    @"|",
-    @"~",
-};
-
-pub const Token = union(enum) {
-    keyword: Keyword,
-    identifier: []const u8,
-    constant: Constant,
-    string_literal: []const u8,
-    wide_string_literal: []const u32,
-    punctuator: Punctuator,
-};
-
-pub fn isIdentifierStart(code_point: u21) bool {
-    // zig fmt: off
-    return code_point >= 'A' and code_point <= 'Z'
-        or code_point == '_'
-        or code_point >= 'a' and code_point <= 'z'
-        or code_point >= 128;
-    // zig fmt: on
-}
-
-pub fn isIdentifierMiddle(code_point: u21) bool {
-    // zig fmt: off
-    return code_point >= '0' and code_point <= '9'
-        or code_point >= 'A' and code_point <= 'Z'
-        or code_point == '_'
-        or code_point >= 'a' and code_point <= 'z'
-        or code_point >= 128;
-    // zig fmt: on
-}
+pub const Constant = @import("tokens/Constant.zig");
+pub const Keyword = @import("tokens/Keyword.zig");
+pub const Punctuator = @import("tokens/Punctuator.zig");
+pub const Token = @import("tokens/Token.zig");
+pub const Tokenizer = @import("tokens/Tokenizer.zig");
+pub const Utf8Iterator = @import("tokens/Utf8Iterator.zig");
--- a/packages/cjit/src/tokens/Constant.zig
+++ b/packages/cjit/src/tokens/Constant.zig
@@ -0,0 +1,14 @@
+const std = @import("std");
+
+/// Value carried by a `constant` token, tagged by the type it was lexed as.
+/// NOTE(review): the variant names appear to mirror C's arithmetic types;
+/// confirm once numeric constants are actually produced by the tokenizer.
+pub const Constant = union(enum) {
+    int: i32,
+    long: i64,
+    // Same payload width as `long`; only the tag differs.
+    long_long: i64,
+    unsigned_int: u32,
+    unsigned_long: u64,
+    // Same payload width as `unsigned_long`; only the tag differs.
+    unsigned_long_long: u64,
+    float: f32,
+    double: f64,
+    character: u8,
+    // Signed 32-bit payload. NOTE(review): presumably chosen to match a
+    // signed `wchar_t` on the target - confirm.
+    wide_character: i32,
+};
--- a/packages/cjit/src/tokens/Keyword.zig
+++ b/packages/cjit/src/tokens/Keyword.zig
@@ -0,0 +1,63 @@
+const std = @import("std");
+
+/// C keywords (C11 spelling, including the `_`-prefixed ones). Every field is
+/// named exactly like the keyword it represents, so the field name doubles as
+/// the lookup key in `map`.
+pub const Keyword = enum {
+    _Alignas,
+    _Alignof,
+    _Atomic,
+    _Bool,
+    _Complex,
+    _Generic,
+    _Imaginary,
+    _Noreturn,
+    _Static_assert,
+    _Thread_local,
+    auto,
+    @"break",
+    case,
+    char,
+    @"const",
+    @"continue",
+    default,
+    do,
+    double,
+    @"else",
+    @"enum",
+    @"extern",
+    float,
+    @"for",
+    goto,
+    @"if",
+    @"inline",
+    int,
+    long,
+    register,
+    restrict,
+    @"return",
+    short,
+    signed,
+    sizeof,
+    static,
+    @"struct",
+    @"switch",
+    typedef,
+    @"union",
+    unsigned,
+    void,
+    @"volatile",
+    @"while",
+
+    /// Comptime-built table from keyword spelling to enum value. It is derived
+    /// from the field list above, so the two cannot drift apart.
+    pub const map: std.StaticStringMap(Keyword) = blk: {
+        const fields = @typeInfo(Keyword).@"enum".fields;
+
+        var entries: [fields.len]struct { []const u8, Keyword } = undefined;
+        for (&entries, fields) |*entry, field| {
+            entry.* = .{ field.name, @as(Keyword, @enumFromInt(field.value)) };
+        }
+
+        break :blk .initComptime(entries);
+    };
+
+    /// Returns the keyword spelled exactly `identifier` (case-sensitive), or
+    /// `null` when the text is not a keyword.
+    pub fn isKeyword(identifier: []const u8) ?Keyword {
+        return map.get(identifier);
+    }
+};
--- a/packages/cjit/src/tokens/Punctuator.zig
+++ b/packages/cjit/src/tokens/Punctuator.zig
@@ -0,0 +1,70 @@
+const std = @import("std");
+
+/// Punctuator tokens. Each tag value is the punctuator's own bytes packed into
+/// a `u32` by `strToInt1`/`strToInt2`/`strToInt3`, so an integer built from the
+/// same number of input bytes can be turned into a tag with `@enumFromInt`.
+pub const Punctuator = enum(u32) {
+    // three characters
+    @"..." = strToInt3("..."),
+    @"<<=" = strToInt3("<<="),
+    @">>=" = strToInt3(">>="),
+    // two characters
+    @"--" = strToInt2("--"),
+    @"-=" = strToInt2("-="),
+    @"->" = strToInt2("->"),
+    @"!=" = strToInt2("!="),
+    @"*=" = strToInt2("*="),
+    @"/=" = strToInt2("/="),
+    @"&&" = strToInt2("&&"),
+    @"&=" = strToInt2("&="),
+    @"##" = strToInt2("##"),
+    @"%=" = strToInt2("%="),
+    @"^=" = strToInt2("^="),
+    @"++" = strToInt2("++"),
+    @"+=" = strToInt2("+="),
+    @"<<" = strToInt2("<<"),
+    @"<=" = strToInt2("<="),
+    @"==" = strToInt2("=="),
+    @">=" = strToInt2(">="),
+    @">>" = strToInt2(">>"),
+    @"|=" = strToInt2("|="),
+    @"||" = strToInt2("||"),
+    // single character
+    @"-" = strToInt1("-"),
+    @"," = strToInt1(","),
+    @";" = strToInt1(";"),
+    @":" = strToInt1(":"),
+    @"!" = strToInt1("!"),
+    @"?" = strToInt1("?"),
+    @"." = strToInt1("."),
+    @"(" = strToInt1("("),
+    @")" = strToInt1(")"),
+    @"[" = strToInt1("["),
+    @"]" = strToInt1("]"),
+    @"{" = strToInt1("{"),
+    @"}" = strToInt1("}"),
+    @"*" = strToInt1("*"),
+    @"/" = strToInt1("/"),
+    @"&" = strToInt1("&"),
+    @"#" = strToInt1("#"),
+    @"%" = strToInt1("%"),
+    @"^" = strToInt1("^"),
+    @"+" = strToInt1("+"),
+    @"<" = strToInt1("<"),
+    @"=" = strToInt1("="),
+    @">" = strToInt1(">"),
+    @"|" = strToInt1("|"),
+    @"~" = strToInt1("~"),
+
+    // Not punctuators: the same packed encoding for backslash-LF and
+    // backslash-CR-LF. They are plain `u32` constants, not enum members.
+    pub const line_continuation_lf = strToInt2("\\\n");
+    pub const line_continuation_crlf = strToInt3("\\\r\n");
+};
+
+/// Packs a one-byte string into the `u32` encoding used for `Punctuator` tag
+/// values: the result is simply that byte.
+///
+/// The parameter is a pointer to a one-element array (`*const [1]u8`), which
+/// one-character string literals coerce to.
+pub fn strToInt1(str: *const [1]u8) u32 {
+    return str[0];
+}
+
+/// Packs a two-byte string into the `u32` encoding used for `Punctuator` tag
+/// values. The upper 16 bits of the result are zero.
+///
+/// The bytes are reinterpreted with `@bitCast`, i.e. in the host's native byte
+/// order. NOTE(review): callers that mask the low 16 bits of a wider value
+/// rely on the first character landing in the low byte (little-endian).
+///
+/// The parameter is a pointer to a two-element array (`*const [2]u8`), which
+/// two-character string literals coerce to.
+pub fn strToInt2(str: *const [2]u8) u32 {
+    return @as(u16, @bitCast(str.*));
+}
+
+/// Packs a three-byte string into the `u32` encoding used for `Punctuator` tag
+/// values. The upper 8 bits of the result are zero.
+///
+/// The bytes are reinterpreted with `@bitCast`, i.e. in the host's native byte
+/// order. NOTE(review): callers that mask the low 24 bits of a wider value
+/// rely on the first character landing in the low byte (little-endian).
+///
+/// The parameter is a pointer to a three-element array (`*const [3]u8`), which
+/// three-character string literals coerce to.
+pub fn strToInt3(str: *const [3]u8) u32 {
+    return @as(u24, @bitCast(str.*));
+}
--- a/packages/cjit/src/tokens/Token.zig
+++ b/packages/cjit/src/tokens/Token.zig
@@ -0,0 +1,14 @@
+const std = @import("std");
+
+pub const Constant = @import("Constant.zig");
+pub const Keyword = @import("Keyword.zig");
+pub const Punctuator = @import("Punctuator.zig");
+
+/// A single lexical token.
+///
+/// `Constant`, `Keyword` and `Punctuator` declared above are file namespaces
+/// (the bare result of `@import`), not the types themselves, so each payload
+/// names the declaration of the same name inside its namespace. This is what
+/// lets the tokenizer store a `Keyword` enum value in `keyword` and build
+/// `punctuator` with `@enumFromInt`.
+pub const Token = union(enum) {
+    keyword: Keyword.Keyword,
+    /// Identifier text.
+    identifier: []const u8,
+    constant: Constant.Constant,
+    /// NUL-terminated string contents.
+    string_literal: [:0]const u8,
+    /// Zero-terminated sequence of 32-bit units.
+    wide_string_literal: [:0]const u32,
+    punctuator: Punctuator.Punctuator,
+};
--- a/packages/cjit/src/tokens/Tokenizer.zig
+++ b/packages/cjit/src/tokens/Tokenizer.zig
@@ -0,0 +1,260 @@
+const std = @import("std");
+const Self = @This();
+
+const Keyword = @import("Keyword.zig").Keyword;
+const Punctuator = @import("Punctuator.zig").Punctuator;
+const Token = @import("Token.zig").Token;
+const Utf8Iterator = @import("Utf8Iterator.zig");
+
+/// Number of elements preallocated for the `string` scratch buffer.
+pub const max_string_length = 4096;
+/// Number of elements preallocated for the `wide_string` scratch buffer.
+pub const max_wide_string_length = 4096;
+
+/// Name of the source being tokenized, as passed to `init` / `setSource`.
+filename: []const u8,
+/// Cursor over the source text; also tracks the current line and column.
+it: Utf8Iterator,
+/// Map from a name to a token slice; starts out empty.
+/// NOTE(review): presumably preprocessor `#define` bodies - nothing in this
+/// file reads or writes it yet.
+defines: std.StringHashMapUnmanaged([]Token) = .{},
+/// Bounded, preallocated with the capacity of `max_string_length`.
+string: std.ArrayList(u8),
+/// Bounded, preallocated with the capacity of `max_wide_string_length`.
+wide_string: std.ArrayList(u32),
+
+/// Creates a tokenizer positioned at the start of `code`.
+///
+/// Allocates the two bounded scratch buffers behind `string` and `wide_string`
+/// from `arena_allocator`; nothing in this function frees them. `filename` and
+/// `code` are stored as given, not copied.
+///
+/// Returns an error only if one of the allocations fails.
+pub fn init(filename: []const u8, code: []const u8, arena_allocator: std.mem.Allocator) !Self {
+    const string_buffer = try arena_allocator.alloc(u8, max_string_length);
+    // The element type must be `u32` so the buffer can back `std.ArrayList(u32)`.
+    const wide_string_buffer = try arena_allocator.alloc(u32, max_wide_string_length);
+
+    return .{
+        .filename = filename,
+        .it = .init(code),
+        .string = .initBuffer(string_buffer),
+        .wide_string = .initBuffer(wide_string_buffer),
+    };
+}
+
+/// Points the tokenizer at a different source and rewinds to its start.
+/// Only `filename` and `it` are replaced; `defines` and the scratch buffers
+/// are left exactly as they were.
+pub fn setSource(self: *Self, filename: []const u8, code: []const u8) void {
+    self.it = Utf8Iterator.init(code);
+    self.filename = filename;
+}
+
+/// Returns the next token, or `null` once nothing but whitespace and line
+/// continuations remains.
+///
+/// Identifier text is copied into `arena_allocator`; keywords and punctuators
+/// carry no allocation.
+///
+/// Errors:
+/// - `error.InvalidUtf8` when the source is not valid UTF-8 at the cursor,
+/// - `error.OutOfMemory` when copying an identifier fails,
+/// - `error.InvalidToken` for anything not recognised. String, character and
+///   wide literals are still TODO and are rejected with this error rather than
+///   being half-consumed; numeric constants are also TODO and end up here
+///   because they match no punctuator.
+pub fn nextToken(self: *Self, arena_allocator: std.mem.Allocator) !?Token {
+    try self.skipWhitespace();
+
+    // TODO Skip C and C++ style comments
+    // TODO Preprocessor directives
+
+    const cp = try self.peekCodepointSkipLineContinuation() orelse return null;
+
+    switch (cp) {
+        // Identifier start
+        'A'...'Z', '_', 'a'...'z', 128...std.math.maxInt(u21) => {
+            // This is an identifier, with the possible exception of:
+            // - wide string: L"
+            // - wide char:   L'
+            // - any keyword
+
+            if (cp == 'L') {
+                // Look one code point past the `L`; rewind if it turns out to
+                // be an ordinary identifier.
+                const state = self.it.save();
+
+                self.it.advanceCodepoint(cp);
+                const cp2 = try self.peekCodepointSkipLineContinuation() orelse 0;
+
+                switch (cp2) {
+                    // Wide string
+                    '\"' => {
+                        self.it.advanceCodepoint(cp2);
+                        self.wide_string.clearRetainingCapacity();
+                        // TODO Parse wide string
+                        return error.InvalidToken;
+                    },
+                    // Wide char
+                    '\'' => {
+                        self.it.advanceCodepoint(cp2);
+                        // TODO Parse wide char
+                        return error.InvalidToken;
+                    },
+                    // Identifier or keyword
+                    else => self.it.restore(state),
+                }
+            }
+
+            const identifier_start = self.it.ptr;
+            self.it.advanceCodepoint(cp);
+
+            while (try self.peekCodepointSkipLineContinuation()) |next_cp| {
+                if (!isIdentifierMiddle(next_cp)) break;
+                self.it.advanceCodepoint(next_cp);
+            }
+
+            // NOTE: the text is sliced by position, so any line continuation
+            // skipped while scanning (inside or directly after the identifier)
+            // is included verbatim.
+            const identifier = self.it.str[identifier_start..self.it.ptr];
+
+            if (Keyword.isKeyword(identifier)) |keyword| {
+                return .{ .keyword = keyword };
+            }
+
+            // TODO Preprocessor
+            return .{ .identifier = try arena_allocator.dupe(u8, identifier) };
+        },
+        // String
+        '\"' => {
+            self.it.advanceCodepoint(cp);
+            self.string.clearRetainingCapacity();
+            // TODO Parse string
+            return error.InvalidToken;
+        },
+        // Char
+        '\'' => {
+            self.it.advanceCodepoint(cp);
+            // TODO Parse char
+            return error.InvalidToken;
+        },
+        // Everything else is handled below.
+        else => {},
+    }
+
+    // Higher code points should've been already handled. The code below may
+    // assume that `cp` is an ASCII character.
+    std.debug.assert(cp < 128);
+
+    // TODO Numeric constants
+
+    // `cp` was peeked and not consumed, so at least one byte is left and the
+    // peek cannot be null. Missing trailing bytes are read as zero.
+    const cp3 = self.it.peekThreeBytes().?;
+
+    // Longest match first: three bytes, then two, then one. Each switch lists
+    // only punctuators of that exact length, and the captured value is the
+    // enum's own tag value.
+    switch (cp3 & 0x00_FF_FF_FF) {
+        inline @intFromEnum(Punctuator.@"..."),
+        @intFromEnum(Punctuator.@"<<="),
+        @intFromEnum(Punctuator.@">>="),
+        => |p| {
+            self.it.ptr += 3;
+            self.it.col += 3;
+            return .{
+                .punctuator = @enumFromInt(p),
+            };
+        },
+        else => {},
+    }
+
+    switch (cp3 & 0x00_00_FF_FF) {
+        inline @intFromEnum(Punctuator.@"--"),
+        @intFromEnum(Punctuator.@"-="),
+        @intFromEnum(Punctuator.@"->"),
+        @intFromEnum(Punctuator.@"!="),
+        @intFromEnum(Punctuator.@"*="),
+        @intFromEnum(Punctuator.@"/="),
+        @intFromEnum(Punctuator.@"&&"),
+        @intFromEnum(Punctuator.@"&="),
+        @intFromEnum(Punctuator.@"##"),
+        @intFromEnum(Punctuator.@"%="),
+        @intFromEnum(Punctuator.@"^="),
+        @intFromEnum(Punctuator.@"++"),
+        @intFromEnum(Punctuator.@"+="),
+        @intFromEnum(Punctuator.@"<<"),
+        @intFromEnum(Punctuator.@"<="),
+        @intFromEnum(Punctuator.@"=="),
+        @intFromEnum(Punctuator.@">="),
+        @intFromEnum(Punctuator.@">>"),
+        @intFromEnum(Punctuator.@"|="),
+        @intFromEnum(Punctuator.@"||"),
+        => |p| {
+            self.it.ptr += 2;
+            self.it.col += 2;
+            return .{
+                .punctuator = @enumFromInt(p),
+            };
+        },
+        else => {},
+    }
+
+    switch (cp3 & 0x00_00_00_FF) {
+        inline @intFromEnum(Punctuator.@"-"),
+        @intFromEnum(Punctuator.@","),
+        @intFromEnum(Punctuator.@";"),
+        @intFromEnum(Punctuator.@":"),
+        @intFromEnum(Punctuator.@"!"),
+        @intFromEnum(Punctuator.@"?"),
+        @intFromEnum(Punctuator.@"."),
+        @intFromEnum(Punctuator.@"("),
+        @intFromEnum(Punctuator.@")"),
+        @intFromEnum(Punctuator.@"["),
+        @intFromEnum(Punctuator.@"]"),
+        @intFromEnum(Punctuator.@"{"),
+        @intFromEnum(Punctuator.@"}"),
+        @intFromEnum(Punctuator.@"*"),
+        @intFromEnum(Punctuator.@"/"),
+        @intFromEnum(Punctuator.@"&"),
+        @intFromEnum(Punctuator.@"#"),
+        @intFromEnum(Punctuator.@"%"),
+        @intFromEnum(Punctuator.@"^"),
+        @intFromEnum(Punctuator.@"+"),
+        @intFromEnum(Punctuator.@"<"),
+        @intFromEnum(Punctuator.@"="),
+        @intFromEnum(Punctuator.@">"),
+        @intFromEnum(Punctuator.@"|"),
+        @intFromEnum(Punctuator.@"~"),
+        => |p| {
+            self.it.ptr += 1;
+            self.it.col += 1;
+            return .{
+                .punctuator = @enumFromInt(p),
+            };
+        },
+        else => {},
+    }
+
+    return error.InvalidToken;
+}
+
+/// Skips every line continuation at the cursor, then peeks (without consuming)
+/// the code point that follows. Yields `null` at end of input and forwards the
+/// iterator's decoding error.
+fn peekCodepointSkipLineContinuation(self: *Self) !?u21 {
+    while (true) {
+        if (!self.skipLineContinuation()) break;
+    }
+    return self.it.peekCodepoint();
+}
+
+/// A line continuation is a backslash followed immediately by LF or CRLF.
+/// If one starts at the cursor it is skipped - the cursor moves past it to
+/// column 1 of the next line - and `true` is returned. Otherwise nothing
+/// changes and the result is `false`.
+fn skipLineContinuation(self: *Self) bool {
+    const window = self.it.peekThreeBytes() orelse return false;
+
+    const is_lf = window & 0x00_00_FF_FF == Punctuator.line_continuation_lf;
+    const is_crlf = window & 0x00_FF_FF_FF == Punctuator.line_continuation_crlf;
+
+    if (!is_lf and !is_crlf) {
+        @branchHint(.likely);
+        return false;
+    }
+
+    // Backslash + LF is two bytes, backslash + CR + LF is three.
+    const skipped: usize = if (is_lf) 2 else 3;
+
+    self.it.ptr += skipped;
+    self.it.line += 1;
+    self.it.col = 1;
+    return true;
+}
+
+/// Advances past whitespace (HT, LF, VT, FF, CR, space) and any line
+/// continuations in between, stopping at the first other code point or at end
+/// of input.
+///
+/// Fails only when peeking hits malformed UTF-8. Consuming a code point cannot
+/// fail: `advanceCodepoint` returns `void`, so it is called without `try`.
+fn skipWhitespace(self: *Self) !void {
+    while (try self.peekCodepointSkipLineContinuation()) |cp| {
+        switch (cp) {
+            // <Character Tabulation> (HT, TAB)
+            0x0009,
+            // <End of Line> (EOL, LF, NL)
+            0x000A,
+            // <Line Tabulation> (VT)
+            0x000B,
+            // <Form Feed> (FF)
+            0x000C,
+            // <Carriage Return> (CR)
+            0x000D,
+            // Space (SP)
+            0x0020,
+            => self.it.advanceCodepoint(cp),
+            else => return,
+        }
+    }
+}
+
+/// Reports whether `code_point` may appear after the first character of an
+/// identifier: an ASCII digit, an ASCII letter, `_`, or any non-ASCII code
+/// point (128 and above).
+fn isIdentifierMiddle(code_point: u21) bool {
+    return switch (code_point) {
+        '0'...'9', 'A'...'Z', '_', 'a'...'z' => true,
+        else => code_point >= 128,
+    };
+}
--- a/packages/cjit/src/tokens/Utf8Iterator.zig
+++ b/packages/cjit/src/tokens/Utf8Iterator.zig
@@ -0,0 +1,103 @@
+const std = @import("std");
+const Self = @This();
+
+/// Text being iterated. Stored as given to `init`; never copied here.
+str: []const u8,
+/// Byte offset into `str` of the next unread byte.
+ptr: usize,
+/// Line number of the cursor; starts at 1.
+line: usize,
+/// Column of the cursor; starts at 1 and is reset to 1 after each `'\n'`.
+col: usize,
+
+/// Snapshot of the cursor position (every field except `str`), produced by
+/// `save` and consumed by `restore`.
+pub const State = struct {
+    ptr: usize,
+    line: usize,
+    col: usize,
+};
+
+/// Creates an iterator over `str`, positioned at its first byte on line 1,
+/// column 1.
+pub fn init(str: []const u8) Self {
+    return Self{
+        .col = 1,
+        .line = 1,
+        .ptr = 0,
+        .str = str,
+    };
+}
+
+/// Captures the current cursor position so it can be handed to `restore`
+/// later.
+pub fn save(self: Self) State {
+    const snapshot: State = .{
+        .col = self.col,
+        .line = self.line,
+        .ptr = self.ptr,
+    };
+    return snapshot;
+}
+
+/// Moves the cursor back to a position previously captured with `save`.
+/// `str` is not touched.
+pub fn restore(self: *Self, state: State) void {
+    self.col = state.col;
+    self.line = state.line;
+    self.ptr = state.ptr;
+}
+
+/// Returns the byte at the cursor without consuming it, or `null` at end of
+/// input.
+pub fn peekByte(self: *Self) ?u8 {
+    return if (self.ptr < self.str.len) self.str[self.ptr] else null;
+}
+
+/// Decodes the UTF-8 code point at the cursor without consuming it.
+///
+/// Returns `null` at end of input. Returns `error.InvalidUtf8` when the lead
+/// byte is invalid, when the sequence is cut off by the end of `str`, or when
+/// decoding the sequence fails.
+pub fn peekCodepoint(self: Self) !?u21 {
+    const rest = if (self.ptr < self.str.len) self.str[self.ptr..] else return null;
+
+    const seq_len = std.unicode.utf8ByteSequenceLength(rest[0]) catch return error.InvalidUtf8;
+    if (seq_len > rest.len) return error.InvalidUtf8;
+
+    return std.unicode.utf8Decode(rest[0..seq_len]) catch return error.InvalidUtf8;
+}
+
+/// Returns up to three bytes at the cursor packed into the low 24 bits of a
+/// `u32`, without consuming them, or `null` when no bytes are left.
+///
+/// If fewer than three bytes remain, the missing ones are read as zero. The
+/// bytes are reinterpreted with `@bitCast`, i.e. in the host's native byte
+/// order.
+///
+/// A cursor at or past the end yields `null`, the same guard `peekByte` and
+/// `peekCodepoint` use, instead of underflowing the length subtraction.
+pub fn peekThreeBytes(self: Self) ?u32 {
+    if (self.ptr >= self.str.len) return null;
+
+    const rest = self.str[self.ptr..];
+    const available = @min(rest.len, 3);
+
+    var bytes: [3]u8 = .{ 0, 0, 0 };
+    @memcpy(bytes[0..available], rest[0..available]);
+
+    return @as(u24, @bitCast(bytes));
+}
+
+/// Advances the cursor by `bytes` single-byte (ASCII) characters on the
+/// current line: `ptr` and `col` both move forward by `bytes`.
+///
+/// The caller must ensure the skipped bytes are ASCII and contain no `'\n'`;
+/// `line` is not updated. Asserts that the move stays within `str`.
+pub fn advanceAsciiBytes(self: *Self, bytes: usize) void {
+    // Bounds are checked against the cursor offset `self.ptr`, not the slice
+    // pointer `self.str.ptr`.
+    std.debug.assert(self.ptr + bytes <= self.str.len);
+
+    self.ptr += bytes;
+    self.col += bytes;
+}
+
+/// Consumes the code point at the cursor. `cp` must be the value just returned
+/// by `peekCodepoint`; this is asserted.
+///
+/// `ptr` moves forward by the UTF-8 length of `cp`. A `'\n'` starts a new line
+/// (`line` goes up by one, `col` becomes 1); any other code point moves `col`
+/// forward by one.
+pub fn advanceCodepoint(self: *Self, cp: u21) void {
+    std.debug.assert(blk: {
+        const actual_cp = self.peekCodepoint() catch break :blk false;
+        break :blk cp == actual_cp;
+    });
+
+    self.ptr += std.unicode.utf8CodepointSequenceLength(cp) catch unreachable;
+
+    if (cp == '\n') {
+        self.line += 1;
+        self.col = 1;
+    } else {
+        self.col += 1;
+    }
+}