1use std::{iter::Peekable, ops::Range, str::CharIndices};
3use unicode_ident::{is_xid_continue, is_xid_start};
4
5use cl_structures::span::Span;
6use cl_token::*;
7
8pub use cl_structures::intern::interned::Symbol;
9
10#[derive(Clone, Copy, Debug, PartialEq, Eq)]
11pub struct LexError {
12 pub pos: Span,
13 pub res: LexFailure,
14}
15
16impl std::error::Error for LexError {}
17impl std::fmt::Display for LexError {
18 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
19 let Self { pos, res } = self;
20 write!(f, "{pos}: {res}")
21 }
22}
23
24#[derive(Clone, Copy, Debug, PartialEq, Eq)]
25pub enum LexFailure {
26 EOF,
28 UnexpectedEOF,
29 Unexpected(char),
30 UnterminatedBlockComment,
31 UnterminatedCharacter,
32 UnterminatedString,
33 UnterminatedUnicodeEscape,
34 InvalidUnicodeEscape(u32),
35 InvalidDigitForBase(char, u32),
36 IntegerOverflow,
37}
38use LexFailure::*;
39pub use LexFailure::{EOF, UnexpectedEOF};
40
41impl std::fmt::Display for LexFailure {
42 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
43 match self {
44 Self::EOF => "EOF".fmt(f),
45 Self::UnexpectedEOF => "Unexpected EOF".fmt(f),
46 Self::Unexpected(c) => write!(f, "Character {c:?}"),
47 Self::UnterminatedBlockComment => "Unterminated Block Comment".fmt(f),
48 Self::UnterminatedCharacter => "Unterminated Character".fmt(f),
49 Self::UnterminatedString => "Unterminated String".fmt(f),
50 Self::UnterminatedUnicodeEscape => "Unterminated Unicode Escape".fmt(f),
51 Self::InvalidUnicodeEscape(hex) => {
52 write!(f, "'\\u{{{hex:x}}}' is not a valid UTF-8 codepoint")
53 }
54 Self::InvalidDigitForBase(digit, base) => {
55 write!(f, "Invalid digit {digit} for base {base}")
56 }
57 Self::IntegerOverflow => "Integer literal does not fit in 128 bits".fmt(f),
58 }
59 }
60}
61
62#[derive(Clone, Debug)]
63pub struct Lexer<'t> {
64 path: Symbol,
65 text: &'t str,
67 iter: Peekable<CharIndices<'t>>,
69 head: u32,
71 tail: u32,
73}
74
75impl<'t> Lexer<'t> {
76 pub fn new(path: Symbol, text: &'t str) -> Self {
78 let iter = text.char_indices().peekable();
79 Self { path, text, iter, head: 0, tail: 0 }
80 }
81
82 pub const fn span(&self) -> Span {
87 Span(self.path, self.head, self.tail)
88 }
89
90 pub fn peek(&mut self) -> Option<char> {
92 self.iter.peek().map(|&(_, c)| c)
93 }
94
95 fn advance_tail(&mut self) {
97 match self.iter.peek() {
98 Some(&(idx, _)) => self.tail = idx as u32,
99 None => self.tail = self.text.len() as _,
100 }
101 }
102
103 fn take(&mut self) -> Option<char> {
105 let (_, c) = self.iter.next()?;
106 self.advance_tail();
107 Some(c)
108 }
109
110 fn next_if(&mut self, expected: char) -> Option<char> {
111 let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
112 self.advance_tail();
113 Some(c)
114 }
115
116 fn consume(&mut self) -> &mut Self {
118 self.iter.next();
119 self.advance_tail();
120 self
121 }
122
123 const fn error(&self, res: LexFailure) -> LexError {
125 LexError { pos: self.span(), res }
126 }
127
128 fn as_str(&self) -> (&'t str, Span) {
130 let span = self.span();
131 (&self.text[Range::from(span)], span)
132 }
133
134 fn produce(&mut self, kind: TKind) -> Token {
136 self.advance_tail();
137 let (lexeme, span) = self.as_str();
138 self.head = self.tail;
139 Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
140 }
141
142 fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
143 self.advance_tail();
144 let span = self.span();
145 self.head = self.tail;
146 Token { lexeme, kind, span }
147 }
148
149 fn skip_whitespace(&mut self) -> &mut Self {
151 while self.peek().is_some_and(char::is_whitespace) {
152 let _ = self.consume();
153 }
154 self
155 }
156
157 const fn start_token(&mut self) -> &mut Self {
158 self.head = self.tail;
159 self
160 }
161
162 pub fn scan(&mut self) -> Result<Token, LexError> {
164 use TKind::*;
165 let tok = match self
167 .skip_whitespace()
168 .start_token()
169 .peek()
170 .ok_or_else(|| self.error(EOF))?
171 {
172 '!' => Bang,
173 '"' => return self.string(),
174 '#' => Hash,
175 '$' => Dollar,
176 '%' => Rem,
177 '&' => Amp,
178 '\'' => return self.character(),
179 '(' => LParen,
180 ')' => RParen,
181 '*' => Star,
182 '+' => Plus,
183 ',' => return self.consume().trailing(Comma),
184 '-' => Minus,
185 '.' => Dot,
186 '/' => Slash,
187 '0' => Integer,
188 '1'..='9' => return self.digits::<10>(),
189 ':' => Colon,
190 ';' => Semi,
191 '<' => Lt,
192 '=' => Eq,
193 '>' => Gt,
194 '?' => Question,
195 '@' => At,
196 '[' => LBrack,
197 '\\' => Backslash,
198 ']' => RBrack,
199 '^' => Xor,
200 '`' => Grave,
201 '{' => LCurly,
202 '|' => Bar,
203 '}' => RCurly,
204 '~' => Tilde,
205 '_' => return self.identifier(),
206 c if is_xid_start(c) => return self.identifier(),
207 c => Err(self.error(Unexpected(c)))?,
208 };
209
210 let tok = match (tok, self.consume().peek()) {
212 (Integer, Some('b')) => return self.consume().digits::<2>(),
213 (Integer, Some('d')) => return self.consume().digits::<10>(),
214 (Integer, Some('o')) => return self.consume().digits::<8>(),
215 (Integer, Some('x')) => return self.consume().digits::<16>(),
216 (Integer, Some('~')) => return self.consume().digits::<36>(),
217 (Integer, _) => return self.digits::<10>(),
218 (Amp, Some('&')) => AmpAmp,
219 (Amp, Some('=')) => AmpEq,
220 (Bang, Some('!')) => BangBang,
221 (Bang, Some('=')) => BangEq,
222 (Bar, Some('|')) => BarBar,
223 (Bar, Some('=')) => BarEq,
224 (Colon, Some(':')) => ColonColon,
225 (Dot, Some('.')) => DotDot,
226 (Eq, Some('=')) => EqEq,
227 (Eq, Some('>')) => FatArrow,
228 (Gt, Some('=')) => GtEq,
229 (Gt, Some('>')) => GtGt,
230 (Hash, Some('!')) => HashBang,
231 (Lt, Some('=')) => LtEq,
232 (Lt, Some('<')) => LtLt,
233 (Minus, Some('=')) => MinusEq,
234 (Minus, Some('>')) => Arrow,
235 (Plus, Some('=')) => PlusEq,
236 (Rem, Some('=')) => RemEq,
237 (Slash, Some('*')) => return Ok(self.block_comment()?.produce(Comment)),
238 (Slash, Some('=')) => SlashEq,
239 (Slash, Some('/')) => return self.line_comment(),
240 (Star, Some('=')) => StarEq,
241 (Xor, Some('=')) => XorEq,
242 (Xor, Some('^')) => XorXor,
243 _ => return Ok(self.produce(tok)),
244 };
245
246 let tok = match (tok, self.consume().peek()) {
248 (HashBang, Some('/')) => return self.line_comment(),
249 (DotDot, Some('.')) => DotDotDot,
250 (DotDot, Some('=')) => DotDotEq,
251 (GtGt, Some('=')) => GtGtEq,
252 (LtLt, Some('=')) => LtLtEq,
253 _ => return Ok(self.produce(tok)),
254 };
255
256 Ok(self.consume().produce(tok))
257 }
258
259 pub fn trailing(&mut self, kind: TKind) -> Result<Token, LexError> {
261 Ok(match self.skip_whitespace().peek() {
262 Some('}') => self.consume().produce(TKind::RCurly),
265 _ => self.produce(kind),
266 })
267 }
268
269 pub fn line_comment(&mut self) -> Result<Token, LexError> {
271 let kind = match self.consume().peek() {
272 Some('/') => TKind::OutDoc,
273 Some('!') => TKind::InDoc,
274 _ => TKind::Comment,
275 };
276 while self.consume().peek().is_some_and(|c| c != '\n') {}
277 let (lexeme, _) = self.as_str();
278 let lexeme = lexeme
279 .strip_prefix("///")
280 .or_else(|| lexeme.strip_prefix("//!"))
281 .map(|lexeme| lexeme.strip_prefix(" ").unwrap_or(lexeme))
282 .unwrap_or(lexeme);
283
284 Ok(self.produce_with_lexeme(kind, Lexeme::String(lexeme.into())))
285 }
286
287 pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
290 self.consume();
291 while let Some(c) = self.take() {
292 match (c, self.peek()) {
293 ('/', Some('*')) => self.block_comment()?,
294 ('*', Some('/')) => return Ok(self.consume()),
295 _ => continue,
296 };
297 }
298 Err(self.error(UnterminatedBlockComment))
299 }
300
301 pub fn identifier(&mut self) -> Result<Token, LexError> {
307 while self.consume().peek().is_some_and(is_xid_continue) {}
308 let (lexeme, _span) = self.as_str();
309 let token = self.produce(TKind::Identifier);
310 Ok(Token {
311 kind: match lexeme {
312 "as" => TKind::As,
313 "break" => TKind::Break,
314 "catch" => TKind::Catch,
315 "const" => TKind::Const,
316 "continue" => TKind::Continue,
317 "defer" => TKind::Defer,
318 "do" => TKind::Do,
319 "else" => TKind::Else,
320 "enum" => TKind::Enum,
321 "false" => TKind::False,
322 "fn" => TKind::Fn,
323 "for" => TKind::For,
324 "if" => TKind::If,
325 "impl" => TKind::Impl,
326 "in" => TKind::In,
327 "let" => TKind::Let,
328 "loop" => TKind::Loop,
329 "macro" => TKind::Macro,
330 "match" => TKind::Match,
331 "mod" => TKind::Mod,
332 "mut" => TKind::Mut,
333 "pub" => TKind::Pub,
334 "return" => TKind::Return,
335 "static" => TKind::Static,
336 "struct" => TKind::Struct,
337 "true" => TKind::True,
338 "try" => TKind::Try,
339 "type" => TKind::Type,
340 "use" => TKind::Use,
341 "while" => TKind::While,
342 _ => token.kind,
343 },
344 ..token
345 })
346 }
347
348 pub fn character(&mut self) -> Result<Token, LexError> {
350 let c = match self.consume().take() {
351 Some('\\') => self.escape()?,
352 Some(c) => c,
353 None => '\0',
354 };
355 if self.take().is_some_and(|c| c == '\'') {
356 Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
357 } else {
358 Err(self.error(UnterminatedCharacter))
359 }
360 }
361
362 pub fn string(&mut self) -> Result<Token, LexError> {
364 let mut lexeme = String::new();
365 self.consume();
366 loop {
367 lexeme.push(match self.take() {
368 None => Err(self.error(UnterminatedString))?,
369 Some('\\') => self.escape()?,
370 Some('"') => break,
371 Some(c) => c,
372 });
373 }
374 lexeme.shrink_to_fit();
375 Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
376 }
377
378 pub fn escape(&mut self) -> Result<char, LexError> {
380 Ok(
381 match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
382 ' ' => '\u{a0}', '0' => '\0', 'a' => '\x07', 'b' => '\x08', 'e' => '\x1b', 'f' => '\x0c', 'n' => '\n', 'r' => '\r', 't' => '\t', 'u' => self.unicode_escape()?,
392 'x' => self.hex_escape()?,
393 c => c,
394 },
395 )
396 }
397
398 pub fn hex_escape(&mut self) -> Result<char, LexError> {
400 let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
401 char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)))
402 }
403
404 pub fn unicode_escape(&mut self) -> Result<char, LexError> {
406 self.next_if('{')
407 .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
408 let mut out = 0;
409 while let Some(c) = self.take() {
410 if c == '}' {
411 return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
412 }
413 out = out.saturating_mul(16).saturating_add(
414 c.to_digit(16)
415 .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?,
416 );
417 }
418 Err(self.error(UnterminatedUnicodeEscape))
419 }
420
421 pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
426 let mut int: u128 = 0;
427 while let Some(c) = self.peek() {
428 int = match c.to_digit(BASE).ok_or(c) {
429 Err('_') => int,
430 Ok(c) => int
431 .checked_mul(BASE as _)
432 .and_then(|int| int.checked_add(c as _))
433 .ok_or_else(|| self.error(IntegerOverflow))?,
434 _ => break,
435 };
436 self.consume();
437 }
438
439 Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
440 }
441
442 pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
444 let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
445 if let Some(digit) = digit.to_digit(BASE) {
446 Ok(digit)
447 } else {
448 Err(self.error(InvalidDigitForBase(digit, BASE)))
449 }
450 }
451}