1use std::{iter::Peekable, ops::Range, str::CharIndices};
3use unicode_ident::{is_xid_continue, is_xid_start};
4
5use cl_structures::span::Span;
6use cl_token::*;
7
8pub use cl_structures::intern::interned::Symbol;
9
10#[derive(Clone, Copy, Debug, PartialEq, Eq)]
11pub struct LexError {
12 pub pos: Span,
13 pub res: LexFailure,
14}
15
16impl std::error::Error for LexError {}
17impl std::fmt::Display for LexError {
18 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
19 let Self { pos, res } = self;
20 write!(f, "{pos}: {res}")
21 }
22}
23
24#[derive(Clone, Copy, Debug, PartialEq, Eq)]
25pub enum LexFailure {
26 EOF,
28 UnexpectedEOF,
29 Unexpected(char),
30 UnterminatedBlockComment,
31 UnterminatedCharacter,
32 UnterminatedString,
33 UnterminatedUnicodeEscape,
34 InvalidUnicodeEscape(u32),
35 InvalidDigitForBase(char, u32),
36 IntegerOverflow,
37}
38use LexFailure::*;
39pub use LexFailure::{EOF, UnexpectedEOF};
40
41impl std::fmt::Display for LexFailure {
42 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
43 match self {
44 Self::EOF => "EOF".fmt(f),
45 Self::UnexpectedEOF => "Unexpected EOF".fmt(f),
46 Self::Unexpected(c) => write!(f, "Character {c:?}"),
47 Self::UnterminatedBlockComment => "Unterminated Block Comment".fmt(f),
48 Self::UnterminatedCharacter => "Unterminated Character".fmt(f),
49 Self::UnterminatedString => "Unterminated String".fmt(f),
50 Self::UnterminatedUnicodeEscape => "Unterminated Unicode Escape".fmt(f),
51 Self::InvalidUnicodeEscape(hex) => {
52 write!(f, "'\\u{{{hex:x}}}' is not a valid UTF-8 codepoint")
53 }
54 Self::InvalidDigitForBase(digit, base) => {
55 write!(f, "Invalid digit {digit} for base {base}")
56 }
57 Self::IntegerOverflow => "Integer literal does not fit in 128 bits".fmt(f),
58 }
59 }
60}
61
62#[derive(Clone, Debug)]
63pub struct Lexer<'t> {
64 path: Symbol,
65 text: &'t str,
67 iter: Peekable<CharIndices<'t>>,
69 head: u32,
71 tail: u32,
73}
74
75impl<'t> Lexer<'t> {
76 pub fn new(path: Symbol, text: &'t str) -> Self {
78 let iter = text.char_indices().peekable();
79 Self { path, text, iter, head: 0, tail: 0 }
80 }
81
82 pub const fn span(&self) -> Span {
87 Span(self.path, self.head, self.tail)
88 }
89
90 pub fn peek(&mut self) -> Option<char> {
92 self.iter.peek().map(|&(_, c)| c)
93 }
94
95 fn advance_tail(&mut self) {
97 match self.iter.peek() {
98 Some(&(idx, _)) => self.tail = idx as u32,
99 None => self.tail = self.text.len() as _,
100 }
101 }
102
103 fn take(&mut self) -> Option<char> {
105 let (_, c) = self.iter.next()?;
106 self.advance_tail();
107 Some(c)
108 }
109
110 fn next_if(&mut self, expected: char) -> Option<char> {
111 let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
112 self.advance_tail();
113 Some(c)
114 }
115
116 fn consume(&mut self) -> &mut Self {
118 self.iter.next();
119 self.advance_tail();
120 self
121 }
122
123 const fn error(&self, res: LexFailure) -> LexError {
125 LexError { pos: self.span(), res }
126 }
127
128 fn as_str(&self) -> (&'t str, Span) {
130 let span = self.span();
131 (&self.text[Range::from(span)], span)
132 }
133
134 fn produce(&mut self, kind: TKind) -> Token {
136 self.advance_tail();
137 let (lexeme, span) = self.as_str();
138 self.head = self.tail;
139 Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
140 }
141
142 fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
143 self.advance_tail();
144 let span = self.span();
145 self.head = self.tail;
146 Token { lexeme, kind, span }
147 }
148
149 fn skip_whitespace(&mut self) -> &mut Self {
151 while self.peek().is_some_and(char::is_whitespace) {
152 let _ = self.consume();
153 }
154 self
155 }
156
157 const fn start_token(&mut self) -> &mut Self {
158 self.head = self.tail;
159 self
160 }
161
162 pub fn scan(&mut self) -> Result<Token, LexError> {
164 use TKind::*;
165 let tok = match self
167 .skip_whitespace()
168 .start_token()
169 .peek()
170 .ok_or_else(|| self.error(EOF))?
171 {
172 '!' => Bang,
173 '"' => return self.string(),
174 '#' => Hash,
175 '$' => Dollar,
176 '%' => Rem,
177 '&' => Amp,
178 '\'' => return self.character(),
179 '(' => LParen,
180 ')' => RParen,
181 '*' => Star,
182 '+' => Plus,
183 ',' => return self.consume().trailing(Comma),
184 '-' => Minus,
185 '.' => Dot,
186 '/' => Slash,
187 '0' => Integer,
188 '1'..='9' => return self.digits::<10>(),
189 ':' => Colon,
190 ';' => Semi,
191 '<' => Lt,
192 '=' => Eq,
193 '>' => Gt,
194 '?' => Question,
195 '@' => At,
196 '[' => LBrack,
197 '\\' => Backslash,
198 ']' => RBrack,
199 '^' => Xor,
200 '`' => Grave,
201 '{' => LCurly,
202 '|' => Bar,
203 '}' => RCurly,
204 '~' => Tilde,
205 '_' => return self.identifier(),
206 c if is_xid_start(c) => return self.identifier(),
207 c => Err(self.error(Unexpected(c)))?,
208 };
209
210 let tok = match (tok, self.consume().peek()) {
212 (Integer, Some('b')) => return self.consume().digits::<2>(),
213 (Integer, Some('d')) => return self.consume().digits::<10>(),
214 (Integer, Some('o')) => return self.consume().digits::<8>(),
215 (Integer, Some('x')) => return self.consume().digits::<16>(),
216 (Integer, Some('~')) => return self.consume().digits::<36>(),
217 (Integer, _) => return self.digits::<10>(),
218 (Amp, Some('&')) => AmpAmp,
219 (Amp, Some('=')) => AmpEq,
220 (Bang, Some('!')) => BangBang,
221 (Bang, Some('=')) => BangEq,
222 (Bar, Some('|')) => BarBar,
223 (Bar, Some('=')) => BarEq,
224 (Colon, Some(':')) => ColonColon,
225 (Dot, Some('.')) => DotDot,
226 (Eq, Some('=')) => EqEq,
227 (Eq, Some('>')) => FatArrow,
228 (Gt, Some('=')) => GtEq,
229 (Gt, Some('>')) => GtGt,
230 (Hash, Some('!')) => HashBang,
231 (Lt, Some('=')) => LtEq,
232 (Lt, Some('<')) => LtLt,
233 (Minus, Some('=')) => MinusEq,
234 (Minus, Some('>')) => Arrow,
235 (Plus, Some('=')) => PlusEq,
236 (Rem, Some('=')) => RemEq,
237 (Slash, Some('*')) => return Ok(self.block_comment()?.produce(Comment)),
238 (Slash, Some('=')) => SlashEq,
239 (Slash, Some('/')) => return self.line_comment(),
240 (Star, Some('=')) => StarEq,
241 (Xor, Some('=')) => XorEq,
242 (Xor, Some('^')) => XorXor,
243 _ => return Ok(self.produce(tok)),
244 };
245
246 let tok = match (tok, self.consume().peek()) {
248 (HashBang, Some('/')) => return self.line_comment(),
249 (DotDot, Some('.')) => DotDotDot,
250 (DotDot, Some('=')) => DotDotEq,
251 (GtGt, Some('=')) => GtGtEq,
252 (LtLt, Some('=')) => LtLtEq,
253 _ => return Ok(self.produce(tok)),
254 };
255
256 Ok(self.consume().produce(tok))
257 }
258
259 pub fn trailing(&mut self, kind: TKind) -> Result<Token, LexError> {
261 Ok(match self.skip_whitespace().peek() {
262 Some('}') => self.consume().produce(TKind::RCurly),
265 _ => self.produce(kind),
266 })
267 }
268
269 pub fn line_comment(&mut self) -> Result<Token, LexError> {
271 let kind = match self.consume().peek() {
272 Some('/') => TKind::OutDoc,
273 Some('!') => TKind::InDoc,
274 _ => TKind::Comment,
275 };
276 while self.consume().peek().is_some_and(|c| c != '\n') {}
277 let (lexeme, _) = self.as_str();
278 let lexeme = lexeme
279 .strip_prefix("///")
280 .or_else(|| lexeme.strip_prefix("//!"))
281 .map(|lexeme| lexeme.strip_prefix(" ").unwrap_or(lexeme))
282 .unwrap_or(lexeme);
283
284 Ok(self.produce_with_lexeme(kind, Lexeme::String(lexeme.into())))
285 }
286
287 pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
290 self.consume();
291 while let Some(c) = self.take() {
292 match (c, self.peek()) {
293 ('/', Some('*')) => self.block_comment()?,
294 ('*', Some('/')) => return Ok(self.consume()),
295 _ => continue,
296 };
297 }
298 Err(self.error(UnterminatedBlockComment))
299 }
300
301 pub fn identifier(&mut self) -> Result<Token, LexError> {
307 while self.consume().peek().is_some_and(is_xid_continue) {}
308 let (lexeme, _span) = self.as_str();
309 let token = self.produce(TKind::Identifier);
310 Ok(Token {
311 kind: match lexeme {
312 "as" => TKind::As,
313 "break" => TKind::Break,
314 "const" => TKind::Const,
315 "continue" => TKind::Continue,
316 "do" => TKind::Do,
317 "else" => TKind::Else,
318 "enum" => TKind::Enum,
319 "false" => TKind::False,
320 "fn" => TKind::Fn,
321 "for" => TKind::For,
322 "if" => TKind::If,
323 "impl" => TKind::Impl,
324 "in" => TKind::In,
325 "let" => TKind::Let,
326 "loop" => TKind::Loop,
327 "macro" => TKind::Macro,
328 "match" => TKind::Match,
329 "mod" => TKind::Mod,
330 "mut" => TKind::Mut,
331 "pub" => TKind::Pub,
332 "return" => TKind::Return,
333 "static" => TKind::Static,
334 "struct" => TKind::Struct,
335 "then" => TKind::Do,
336 "true" => TKind::True,
337 "type" => TKind::Type,
338 "use" => TKind::Use,
339 "while" => TKind::While,
340 _ => token.kind,
341 },
342 ..token
343 })
344 }
345
346 pub fn character(&mut self) -> Result<Token, LexError> {
348 let c = match self.consume().take() {
349 Some('\\') => self.escape()?,
350 Some(c) => c,
351 None => '\0',
352 };
353 if self.take().is_some_and(|c| c == '\'') {
354 Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
355 } else {
356 Err(self.error(UnterminatedCharacter))
357 }
358 }
359
360 pub fn string(&mut self) -> Result<Token, LexError> {
362 let mut lexeme = String::new();
363 self.consume();
364 loop {
365 lexeme.push(match self.take() {
366 None => Err(self.error(UnterminatedString))?,
367 Some('\\') => self.escape()?,
368 Some('"') => break,
369 Some(c) => c,
370 });
371 }
372 lexeme.shrink_to_fit();
373 Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
374 }
375
376 pub fn escape(&mut self) -> Result<char, LexError> {
378 Ok(
379 match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
380 ' ' => '\u{a0}', '0' => '\0', 'a' => '\x07', 'b' => '\x08', 'e' => '\x1b', 'f' => '\x0c', 'n' => '\n', 'r' => '\r', 't' => '\t', 'u' => self.unicode_escape()?,
390 'x' => self.hex_escape()?,
391 c => c,
392 },
393 )
394 }
395
396 pub fn hex_escape(&mut self) -> Result<char, LexError> {
398 let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
399 char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)))
400 }
401
402 pub fn unicode_escape(&mut self) -> Result<char, LexError> {
404 self.next_if('{')
405 .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
406 let mut out = 0;
407 while let Some(c) = self.take() {
408 if c == '}' {
409 return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
410 }
411 out = out.saturating_mul(16).saturating_add(
412 c.to_digit(16)
413 .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?,
414 );
415 }
416 Err(self.error(UnterminatedUnicodeEscape))
417 }
418
419 pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
424 let mut int: u128 = 0;
425 while let Some(c) = self.peek() {
426 int = match c.to_digit(BASE).ok_or(c) {
427 Err('_') => int,
428 Ok(c) => int
429 .checked_mul(BASE as _)
430 .and_then(|int| int.checked_add(c as _))
431 .ok_or_else(|| self.error(IntegerOverflow))?,
432 _ => break,
433 };
434 self.consume();
435 }
436
437 Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
438 }
439
440 pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
442 let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
443 if let Some(digit) = digit.to_digit(BASE) {
444 Ok(digit)
445 } else {
446 Err(self.error(InvalidDigitForBase(digit, BASE)))
447 }
448 }
449}