diff --git a/packages/cjit/src/tokens.zig b/packages/cjit/src/tokens.zig
index a0eb9d2..3dc9fbe 100644
--- a/packages/cjit/src/tokens.zig
+++ b/packages/cjit/src/tokens.zig
@@ -1,149 +1,8 @@
-pub const Keyword = enum {
-    _Alignas,
-    _Alignof,
-    _Atomic,
-    _Bool,
-    _Complex,
-    _Generic,
-    _Imaginary,
-    _Noreturn,
-    _Static_assert,
-    _Thread_local,
-    auto,
-    @"break",
-    case,
-    char,
-    @"const",
-    @"continue",
-    default,
-    do,
-    double,
-    @"else",
-    @"enum",
-    @"extern",
-    float,
-    @"for",
-    goto,
-    @"if",
-    @"inline",
-    int,
-    long,
-    register,
-    restrict,
-    @"return",
-    short,
-    signed,
-    sizeof,
-    static,
-    @"struct",
-    @"switch",
-    typedef,
-    @"union",
-    unsigned,
-    void,
-    @"volatile",
-    @"while",
-};
-
-pub const Identifier = struct {
-    name: []const u8,
-};
-
-pub const Constant = union(enum) {
-    int: i32,
-    long: i64,
-    long_long: i64,
-    unsigned_int: u32,
-    unsigned_long: u64,
-    unsigned_long_long: u64,
-    float: f32,
-    double: f64,
-    character: u8,
-    wide_character: u32,
-};
-
-pub const StringLiteral = struct {
-    value: []const u8,
-};
-
-pub const Punctuator = enum {
-    // three characters
-    @"...",
-    @"<<=",
-    @">>=",
-    // two characters
-    @"--",
-    @"-=",
-    @"->",
-    @"!=",
-    @"*=",
-    @"/=",
-    @"&&",
-    @"&=",
-    @"##",
-    @"%=",
-    @"^=",
-    @"++",
-    @"+=",
-    @"<<",
-    @"<=",
-    @"==",
-    @">=",
-    @">>",
-    @"|=",
-    @"||",
-    // single character
-    @"-",
-    @",",
-    @";",
-    @":",
-    @"!",
-    @"?",
-    @".",
-    @"(",
-    @")",
-    @"[",
-    @"]",
-    @"{",
-    @"}",
-    @"*",
-    @"/",
-    @"&",
-    @"#",
-    @"%",
-    @"^",
-    @"+",
-    @"<",
-    @"=",
-    @">",
-    @"|",
-    @"~",
-};
-
-pub const Token = union(enum) {
-    keyword: Keyword,
-    identifier: []const u8,
-    constant: Constant,
-    string_literal: []const u8,
-    wide_string_literal: []const u32,
-    punctuator: Punctuator,
-};
-
-pub fn isIdentifierStart(code_point: u21) bool {
-    // zig fmt: off
-    return code_point >= 'A' and code_point <= 'Z'
-        or code_point == '_'
-        or code_point >= 'a' and code_point <= 'z'
-        or code_point >= 128;
-    // zig fmt: on
-}
-
-pub fn isIdentifierMiddle(code_point: u21) bool {
-    // zig fmt: off
-    return code_point >= '0' and code_point <= '9'
-        or code_point >= 'A' and code_point <= 'Z'
-        or code_point == '_'
-        or code_point >= 'a' and code_point <= 'z'
-        or code_point >= 128;
-    // zig fmt: on
-}
+//! Re-exports the token types, the tokenizer and the UTF-8 iterator it uses.
+
+pub const Constant = @import("tokens/Constant.zig").Constant;
+pub const Keyword = @import("tokens/Keyword.zig").Keyword;
+pub const Punctuator = @import("tokens/Punctuator.zig").Punctuator;
+pub const Token = @import("tokens/Token.zig").Token;
+pub const Tokenizer = @import("tokens/Tokenizer.zig");
+pub const Utf8Iterator = @import("tokens/Utf8Iterator.zig");
diff --git a/packages/cjit/src/tokens/Constant.zig b/packages/cjit/src/tokens/Constant.zig
new file mode 100644
index 0000000..15e396b
--- /dev/null
+++ b/packages/cjit/src/tokens/Constant.zig
@@ -0,0 +1,15 @@
+const std = @import("std");
+
+/// Payload carried by `Token.constant`, tagged by the kind of constant.
+pub const Constant = union(enum) {
+    int: i32,
+    long: i64,
+    long_long: i64,
+    unsigned_int: u32,
+    unsigned_long: u64,
+    unsigned_long_long: u64,
+    float: f32,
+    double: f64,
+    character: u8,
+    wide_character: i32,
+};
diff --git a/packages/cjit/src/tokens/Keyword.zig b/packages/cjit/src/tokens/Keyword.zig
new file mode 100644
index 0000000..20c50c6
--- /dev/null
+++ b/packages/cjit/src/tokens/Keyword.zig
@@ -0,0 +1,67 @@
+const std = @import("std");
+
+/// Reserved words. Every tag is spelled exactly like the keyword it names, so
+/// the tag names double as the lookup keys of `map`.
+pub const Keyword = enum {
+    _Alignas,
+    _Alignof,
+    _Atomic,
+    _Bool,
+    _Complex,
+    _Generic,
+    _Imaginary,
+    _Noreturn,
+    _Static_assert,
+    _Thread_local,
+    auto,
+    @"break",
+    case,
+    char,
+    @"const",
+    @"continue",
+    default,
+    do,
+    double,
+    @"else",
+    @"enum",
+    @"extern",
+    float,
+    @"for",
+    goto,
+    @"if",
+    @"inline",
+    int,
+    long,
+    register,
+    restrict,
+    @"return",
+    short,
+    signed,
+    sizeof,
+    static,
+    @"struct",
+    @"switch",
+    typedef,
+    @"union",
+    unsigned,
+    void,
+    @"volatile",
+    @"while",
+
+    /// Maps keyword spelling to its tag; built at compile time from the tags.
+    pub const map: std.StaticStringMap(Keyword) = blk: {
+        const fields = @typeInfo(Keyword).@"enum".fields;
+
+        var kvs_list: [fields.len]struct { []const u8, Keyword } = undefined;
+        for (fields, 0..) |field, i| {
+            kvs_list[i] = .{ field.name, @field(Keyword, field.name) };
+        }
+
+        break :blk .initComptime(kvs_list);
+    };
+
+    /// Return the keyword spelled `identifier`, or `null` if it is not one.
+    pub fn isKeyword(identifier: []const u8) ?Keyword {
+        return map.get(identifier);
+    }
+};
diff --git a/packages/cjit/src/tokens/Punctuator.zig b/packages/cjit/src/tokens/Punctuator.zig
new file mode 100644
index 0000000..db38634
--- /dev/null
+++ b/packages/cjit/src/tokens/Punctuator.zig
@@ -0,0 +1,78 @@
+const std = @import("std");
+
+/// Punctuators. The tag value is the punctuator's bytes packed little-endian
+/// into a `u32` (see `strToInt1`, `strToInt2`, `strToInt3`), so a tag can be
+/// compared directly against bytes read from the source.
+pub const Punctuator = enum(u32) {
+    // three characters
+    @"..." = strToInt3("..."),
+    @"<<=" = strToInt3("<<="),
+    @">>=" = strToInt3(">>="),
+    // two characters
+    @"--" = strToInt2("--"),
+    @"-=" = strToInt2("-="),
+    @"->" = strToInt2("->"),
+    @"!=" = strToInt2("!="),
+    @"*=" = strToInt2("*="),
+    @"/=" = strToInt2("/="),
+    @"&&" = strToInt2("&&"),
+    @"&=" = strToInt2("&="),
+    @"##" = strToInt2("##"),
+    @"%=" = strToInt2("%="),
+    @"^=" = strToInt2("^="),
+    @"++" = strToInt2("++"),
+    @"+=" = strToInt2("+="),
+    @"<<" = strToInt2("<<"),
+    @"<=" = strToInt2("<="),
+    @"==" = strToInt2("=="),
+    @">=" = strToInt2(">="),
+    @">>" = strToInt2(">>"),
+    @"|=" = strToInt2("|="),
+    @"||" = strToInt2("||"),
+    // single character
+    @"-" = strToInt1("-"),
+    @"," = strToInt1(","),
+    @";" = strToInt1(";"),
+    @":" = strToInt1(":"),
+    @"!" = strToInt1("!"),
+    @"?" = strToInt1("?"),
+    @"." = strToInt1("."),
+    @"(" = strToInt1("("),
+    @")" = strToInt1(")"),
+    @"[" = strToInt1("["),
+    @"]" = strToInt1("]"),
+    @"{" = strToInt1("{"),
+    @"}" = strToInt1("}"),
+    @"*" = strToInt1("*"),
+    @"/" = strToInt1("/"),
+    @"&" = strToInt1("&"),
+    @"#" = strToInt1("#"),
+    @"%" = strToInt1("%"),
+    @"^" = strToInt1("^"),
+    @"+" = strToInt1("+"),
+    @"<" = strToInt1("<"),
+    @"=" = strToInt1("="),
+    @">" = strToInt1(">"),
+    @"|" = strToInt1("|"),
+    @"~" = strToInt1("~"),
+
+    /// Backslash followed by LF, packed like a two character punctuator.
+    pub const line_continuation_lf = strToInt2("\\\n");
+    /// Backslash followed by CRLF, packed like a three character punctuator.
+    pub const line_continuation_crlf = strToInt3("\\\r\n");
+};
+
+/// Pack one byte into the low 8 bits.
+pub fn strToInt1(str: *const [1]u8) u32 {
+    return str[0];
+}
+
+/// Pack two bytes little-endian: `str[0]` ends up in the low 8 bits.
+pub fn strToInt2(str: *const [2]u8) u32 {
+    return std.mem.readInt(u16, str, .little);
+}
+
+/// Pack three bytes little-endian: `str[0]` ends up in the low 8 bits.
+pub fn strToInt3(str: *const [3]u8) u32 {
+    return std.mem.readInt(u24, str, .little);
+}
diff --git a/packages/cjit/src/tokens/Token.zig b/packages/cjit/src/tokens/Token.zig
new file mode 100644
index 0000000..9c865a2
--- /dev/null
+++ b/packages/cjit/src/tokens/Token.zig
@@ -0,0 +1,15 @@
+const std = @import("std");
+
+pub const Constant = @import("Constant.zig").Constant;
+pub const Keyword = @import("Keyword.zig").Keyword;
+pub const Punctuator = @import("Punctuator.zig").Punctuator;
+
+/// One token as produced by the tokenizer.
+pub const Token = union(enum) {
+    keyword: Keyword,
+    identifier: []const u8,
+    constant: Constant,
+    string_literal: [:0]const u8,
+    wide_string_literal: [:0]const u32,
+    punctuator: Punctuator,
+};
diff --git a/packages/cjit/src/tokens/Tokenizer.zig b/packages/cjit/src/tokens/Tokenizer.zig
new file mode 100644
index 0000000..de528dd
--- /dev/null
+++ b/packages/cjit/src/tokens/Tokenizer.zig
@@ -0,0 +1,284 @@
+//! Tokenizer turning UTF-8 source text into `Token`s.
+
+const std = @import("std");
+const Self = @This();
+
+const Keyword = @import("Keyword.zig").Keyword;
+const Punctuator = @import("Punctuator.zig").Punctuator;
+const Token = @import("Token.zig").Token;
+const Utf8Iterator = @import("Utf8Iterator.zig");
+
+pub const max_string_length = 4096;
+pub const max_wide_string_length = 4096;
+
+filename: []const u8,
+it: Utf8Iterator,
+defines: std.StringHashMapUnmanaged([]Token) = .{},
+/// Bounded, preallocated with the capacity of `max_string_length`.
+string: std.ArrayList(u8),
+/// Bounded, preallocated with the capacity of `max_wide_string_length`.
+wide_string: std.ArrayList(u32),
+
+/// Create a tokenizer over `code`. The two literal buffers are allocated from
+/// `arena_allocator`.
+pub fn init(filename: []const u8, code: []const u8, arena_allocator: std.mem.Allocator) !Self {
+    const string_buffer = try arena_allocator.alloc(u8, max_string_length);
+    // The element type has to match `wide_string`, which holds `u32` items.
+    const wide_string_buffer = try arena_allocator.alloc(u32, max_wide_string_length);
+
+    return .{
+        .filename = filename,
+        .it = .init(code),
+        .string = .initBuffer(string_buffer),
+        .wide_string = .initBuffer(wide_string_buffer),
+    };
+}
+
+/// Start over at the beginning of `code`; the buffers and `defines` are kept.
+pub fn setSource(self: *Self, filename: []const u8, code: []const u8) void {
+    self.filename = filename;
+    self.it = .init(code);
+}
+
+/// Return the next token, or `null` once the input is exhausted.
+///
+/// Identifier text is duplicated into `arena_allocator`. String and character
+/// literals are recognised but not parsed yet and yield `error.Unimplemented`;
+/// input that starts no known token yields `error.InvalidToken`.
+pub fn nextToken(self: *Self, arena_allocator: std.mem.Allocator) !?Token {
+    try self.skipWhitespace();
+
+    // TODO Skip C and C++ style comments
+    // TODO Preprocessor directives
+
+    const cp = try self.peekCodepointSkipLineContinuation() orelse return null;
+
+    switch (cp) {
+        // Identifier start
+        'A'...'Z', '_', 'a'...'z', 128...std.math.maxInt(u21) => {
+            // This is an identifier, with the possible exception of:
+            // - wide string: L"
+            // - wide char: L'
+            // - any keyword
+
+            if (cp == 'L') {
+                const state = self.it.save();
+
+                self.it.advanceCodepoint(cp);
+                const cp2 = try self.peekCodepointSkipLineContinuation() orelse 0;
+
+                switch (cp2) {
+                    // Wide string
+                    '\"' => {
+                        self.it.advanceCodepoint(cp2);
+                        self.wide_string.clearRetainingCapacity();
+                        // TODO Parse wide string
+                        return error.Unimplemented;
+                    },
+                    // Wide char
+                    '\'' => {
+                        self.it.advanceCodepoint(cp2);
+                        // TODO Parse wide char
+                        return error.Unimplemented;
+                    },
+                    // Identifier or keyword
+                    else => {
+                        self.it.restore(state);
+                    },
+                }
+            }
+
+            const identifier_start = self.it.ptr;
+            self.it.advanceCodepoint(cp);
+            // Tracked separately from `self.it.ptr`: peeking may skip a line
+            // continuation that follows the identifier.
+            var identifier_end = self.it.ptr;
+
+            while (try self.peekCodepointSkipLineContinuation()) |next_cp| {
+                if (!isIdentifierMiddle(next_cp)) break;
+                self.it.advanceCodepoint(next_cp);
+                identifier_end = self.it.ptr;
+            }
+
+            // NOTE A line continuation inside the identifier is still part of
+            // this slice.
+            const identifier = self.it.str[identifier_start..identifier_end];
+
+            if (Keyword.isKeyword(identifier)) |keyword| {
+                return .{ .keyword = keyword };
+            } else {
+                // TODO Preprocessor
+                return .{ .identifier = try arena_allocator.dupe(u8, identifier) };
+            }
+        },
+        // String
+        '\"' => {
+            self.it.advanceCodepoint(cp);
+            self.string.clearRetainingCapacity();
+            // TODO Parse string
+            return error.Unimplemented;
+        },
+        // Char
+        '\'' => {
+            self.it.advanceCodepoint(cp);
+            // TODO Parse char
+            return error.Unimplemented;
+        },
+        // Anything else is handled below.
+        else => {},
+    }
+
+    // Higher code points should've been already handled. The code below may
+    // assume that `cp` is an ASCII character.
+    std.debug.assert(cp < 128);
+
+    // TODO Numeric constants
+
+    const cp3 = self.it.peekThreeBytes().?;
+
+    switch (cp3 & 0x00_FF_FF_FF) {
+        inline @intFromEnum(Punctuator.@"..."),
+        @intFromEnum(Punctuator.@"<<="),
+        @intFromEnum(Punctuator.@">>="),
+        => |p| {
+            self.it.ptr += 3;
+            self.it.col += 3;
+            return .{
+                .punctuator = @enumFromInt(p),
+            };
+        },
+        else => {},
+    }
+
+    switch (cp3 & 0x00_00_FF_FF) {
+        inline @intFromEnum(Punctuator.@"--"),
+        @intFromEnum(Punctuator.@"-="),
+        @intFromEnum(Punctuator.@"->"),
+        @intFromEnum(Punctuator.@"!="),
+        @intFromEnum(Punctuator.@"*="),
+        @intFromEnum(Punctuator.@"/="),
+        @intFromEnum(Punctuator.@"&&"),
+        @intFromEnum(Punctuator.@"&="),
+        @intFromEnum(Punctuator.@"##"),
+        @intFromEnum(Punctuator.@"%="),
+        @intFromEnum(Punctuator.@"^="),
+        @intFromEnum(Punctuator.@"++"),
+        @intFromEnum(Punctuator.@"+="),
+        @intFromEnum(Punctuator.@"<<"),
+        @intFromEnum(Punctuator.@"<="),
+        @intFromEnum(Punctuator.@"=="),
+        @intFromEnum(Punctuator.@">="),
+        @intFromEnum(Punctuator.@">>"),
+        @intFromEnum(Punctuator.@"|="),
+        @intFromEnum(Punctuator.@"||"),
+        => |p| {
+            self.it.ptr += 2;
+            self.it.col += 2;
+            return .{
+                .punctuator = @enumFromInt(p),
+            };
+        },
+        else => {},
+    }
+
+    switch (cp3 & 0x00_00_00_FF) {
+        inline @intFromEnum(Punctuator.@"-"),
+        @intFromEnum(Punctuator.@","),
+        @intFromEnum(Punctuator.@";"),
+        @intFromEnum(Punctuator.@":"),
+        @intFromEnum(Punctuator.@"!"),
+        @intFromEnum(Punctuator.@"?"),
+        @intFromEnum(Punctuator.@"."),
+        @intFromEnum(Punctuator.@"("),
+        @intFromEnum(Punctuator.@")"),
+        @intFromEnum(Punctuator.@"["),
+        @intFromEnum(Punctuator.@"]"),
+        @intFromEnum(Punctuator.@"{"),
+        @intFromEnum(Punctuator.@"}"),
+        @intFromEnum(Punctuator.@"*"),
+        @intFromEnum(Punctuator.@"/"),
+        @intFromEnum(Punctuator.@"&"),
+        @intFromEnum(Punctuator.@"#"),
+        @intFromEnum(Punctuator.@"%"),
+        @intFromEnum(Punctuator.@"^"),
+        @intFromEnum(Punctuator.@"+"),
+        @intFromEnum(Punctuator.@"<"),
+        @intFromEnum(Punctuator.@"="),
+        @intFromEnum(Punctuator.@">"),
+        @intFromEnum(Punctuator.@"|"),
+        @intFromEnum(Punctuator.@"~"),
+        => |p| {
+            self.it.ptr += 1;
+            self.it.col += 1;
+            return .{
+                .punctuator = @enumFromInt(p),
+            };
+        },
+        else => {},
+    }
+
+    return error.InvalidToken;
+}
+
+/// Peek at the next code point after skipping any line continuations.
+fn peekCodepointSkipLineContinuation(self: *Self) !?u21 {
+    while (self.skipLineContinuation()) {}
+    return self.it.peekCodepoint();
+}
+
+/// Line continuation is defined as a backslash followed immediately by LF or
+/// CRLF. Return whether a line continuation was encountered and therefore
+/// skipped past.
+fn skipLineContinuation(self: *Self) bool {
+    if (self.it.peekThreeBytes()) |b| {
+        @branchHint(.likely);
+        if (b & 0x00_00_FF_FF == Punctuator.line_continuation_lf) {
+            @branchHint(.unlikely);
+            self.it.ptr += 2;
+            self.it.line += 1;
+            self.it.col = 1;
+            return true;
+        } else if (b & 0x00_FF_FF_FF == Punctuator.line_continuation_crlf) {
+            @branchHint(.unlikely);
+            self.it.ptr += 3;
+            self.it.line += 1;
+            self.it.col = 1;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/// Skip whitespace characters, and line continuations around them.
+fn skipWhitespace(self: *Self) !void {
+    while (try self.peekCodepointSkipLineContinuation()) |cp| {
+        switch (cp) {
+            // (HT, TAB)
+            0x0009,
+            // (EOL, LF, NL)
+            0x000A,
+            // (VT)
+            0x000B,
+            // (FF)
+            0x000C,
+            // (CR)
+            0x000D,
+            // Space (SP)
+            0x0020,
+            => self.it.advanceCodepoint(cp),
+            else => return,
+        }
+    }
+}
+
+/// Whether `code_point` may appear after the first character of an identifier.
+fn isIdentifierMiddle(code_point: u21) bool {
+    // zig fmt: off
+    return code_point >= '0' and code_point <= '9'
+        or code_point >= 'A' and code_point <= 'Z'
+        or code_point == '_'
+        or code_point >= 'a' and code_point <= 'z'
+        or code_point >= 128;
+    // zig fmt: on
+}
diff --git a/packages/cjit/src/tokens/Utf8Iterator.zig b/packages/cjit/src/tokens/Utf8Iterator.zig
new file mode 100644
index 0000000..acf4283
--- /dev/null
+++ b/packages/cjit/src/tokens/Utf8Iterator.zig
@@ -0,0 +1,115 @@
+//! Cursor over UTF-8 text that tracks the byte offset together with the
+//! 1-based line and column.
+
+const std = @import("std");
+const Self = @This();
+
+str: []const u8,
+ptr: usize,
+line: usize,
+col: usize,
+
+/// Snapshot of the cursor position, see `save` and `restore`.
+pub const State = struct {
+    ptr: usize,
+    line: usize,
+    col: usize,
+};
+
+/// Create an iterator positioned at the start of `str` (line 1, column 1).
+pub fn init(str: []const u8) Self {
+    return .{
+        .str = str,
+        .ptr = 0,
+        .line = 1,
+        .col = 1,
+    };
+}
+
+/// Capture the current position so it can be handed to `restore` later.
+pub fn save(self: Self) State {
+    return .{
+        .ptr = self.ptr,
+        .line = self.line,
+        .col = self.col,
+    };
+}
+
+/// Move back (or forward) to a position captured by `save`.
+pub fn restore(self: *Self, state: State) void {
+    self.ptr = state.ptr;
+    self.line = state.line;
+    self.col = state.col;
+}
+
+/// Return the byte at the cursor without consuming it, `null` at the end.
+pub fn peekByte(self: *Self) ?u8 {
+    if (self.ptr >= self.str.len) {
+        return null;
+    }
+
+    return self.str[self.ptr];
+}
+
+/// Decode the code point at the cursor without consuming it, `null` at the
+/// end. Malformed or truncated sequences yield `error.InvalidUtf8`.
+pub fn peekCodepoint(self: Self) !?u21 {
+    if (self.ptr >= self.str.len) {
+        return null;
+    }
+
+    const cp_len = std.unicode.utf8ByteSequenceLength(self.str[self.ptr]) catch return error.InvalidUtf8;
+    if (self.ptr + cp_len > self.str.len) return error.InvalidUtf8;
+
+    const cp_slice = self.str[self.ptr .. self.ptr + cp_len];
+    const cp = std.unicode.utf8Decode(cp_slice) catch return error.InvalidUtf8;
+
+    return cp;
+}
+
+/// Return up to three bytes at the cursor packed little-endian into the low
+/// 24 bits (the first byte is the lowest), zero padded when fewer than three
+/// bytes are left. Returns `null` at the end of the input.
+pub fn peekThreeBytes(self: Self) ?u32 {
+    if (self.ptr >= self.str.len) {
+        return null;
+    }
+
+    const rest = self.str[self.ptr..];
+    const count = @min(rest.len, 3);
+
+    var bytes: [3]u8 = .{ 0, 0, 0 };
+    @memcpy(bytes[0..count], rest[0..count]);
+
+    // Explicit byte order keeps the packing independent of the target.
+    return std.mem.readInt(u24, &bytes, .little);
+}
+
+/// Advance past `bytes` ASCII bytes, one column each. The bytes must not
+/// contain a line feed because `line` is left untouched.
+pub fn advanceAsciiBytes(self: *Self, bytes: usize) void {
+    std.debug.assert(self.ptr + bytes <= self.str.len);
+
+    self.ptr += bytes;
+    self.col += bytes;
+}
+
+/// Call with value returned by `peekCodepoint`.
+pub fn advanceCodepoint(self: *Self, cp: u21) void {
+    std.debug.assert(blk: {
+        const actual_cp = self.peekCodepoint() catch break :blk false;
+        break :blk cp == actual_cp;
+    });
+
+    const cp_len = std.unicode.utf8CodepointSequenceLength(cp) catch unreachable;
+
+    self.ptr += cp_len;
+
+    if (cp == '\n') {
+        self.line += 1;
+        // NOTE Columns start as 1, it will be incremented below.
+        self.col = 0;
+    }
+
+    self.col += 1;
+}