1pub use cl_structures::intern::interned::Symbol;
2
3use crate::error::{LexFailure::*, *};
4use cl_structures::span::Span;
5use cl_token::*;
6use std::{iter::Peekable, ops::Range, str::CharIndices};
7use unicode_ident::{is_xid_continue, is_xid_start};
8
9#[derive(Clone, Debug)]
10pub struct Lexer<'t> {
11 path: Symbol,
12 text: &'t str,
14 iter: Peekable<CharIndices<'t>>,
16 head: u32,
18 tail: u32,
20}
21
22impl<'t> Lexer<'t> {
23 pub fn new(path: Symbol, text: &'t str) -> Self {
25 let iter = text.char_indices().peekable();
26 Self { path, text, iter, head: 0, tail: 0 }
27 }
28
29 pub const fn span(&self) -> Span {
34 Span(self.path, self.head, self.tail)
35 }
36
37 pub fn peek(&mut self) -> Option<char> {
39 self.iter.peek().map(|&(_, c)| c)
40 }
41
42 fn advance_tail(&mut self) {
44 match self.iter.peek() {
45 Some(&(idx, _)) => self.tail = idx as u32,
46 None => self.tail = self.text.len() as _,
47 }
48 }
49
50 fn take(&mut self) -> Option<char> {
52 let (_, c) = self.iter.next()?;
53 self.advance_tail();
54 Some(c)
55 }
56
57 fn next_if(&mut self, expected: char) -> Option<char> {
59 let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
60 self.advance_tail();
61 Some(c)
62 }
63
64 fn consume(&mut self) -> &mut Self {
66 self.iter.next();
67 self.advance_tail();
68 self
69 }
70
71 const fn error(&self, res: LexFailure) -> LexError {
73 LexError { pos: self.span(), res }
74 }
75
76 fn as_str(&self) -> (&'t str, Span) {
78 let span = self.span();
79 (&self.text[Range::from(span)], span)
80 }
81
82 fn produce(&mut self, kind: TKind) -> Token {
84 self.advance_tail();
85 let (lexeme, span) = self.as_str();
86 self.head = self.tail;
87 Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
88 }
89
90 fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
91 self.advance_tail();
92 let span = self.span();
93 self.head = self.tail;
94 Token { lexeme, kind, span }
95 }
96
97 fn skip_whitespace(&mut self) -> &mut Self {
99 while self.peek().is_some_and(char::is_whitespace) {
100 let _ = self.consume();
101 }
102 self
103 }
104
105 const fn start_token(&mut self) -> &mut Self {
107 self.head = self.tail;
108 self
109 }
110
111 pub fn scan(&mut self) -> Result<Token, LexError> {
113 use TKind::*;
114 let tok = match self
116 .skip_whitespace()
117 .start_token()
118 .peek()
119 .ok_or_else(|| self.error(EOF))?
120 {
121 '!' => Bang,
122 '"' => return self.string(),
123 '#' => Hash,
124 '$' => Dollar,
125 '%' => Rem,
126 '&' => Amp,
127 '\'' => return self.character(false),
128 '(' => LParen,
129 ')' => RParen,
130 '*' => Star,
131 '+' => Plus,
132 ',' => Comma,
133 '-' => Minus,
134 '.' => Dot,
135 '/' => Slash,
136 '0' => Integer,
137 '1'..='9' => return self.digits::<10>(),
138 ':' => Colon,
139 ';' => Semi,
140 '<' => Lt,
141 '=' => Eq,
142 '>' => Gt,
143 '?' => Question,
144 '@' => At,
145 '[' => LBrack,
146 '\\' => Backslash,
147 ']' => RBrack,
148 '^' => Xor,
149 '`' => Grave,
150 '{' => LCurly,
151 '|' => Bar,
152 '}' => RCurly,
153 '~' => Tilde,
154 '_' => return self.identifier(),
155 'r' => Identifier, c if is_xid_start(c) => return self.identifier(),
157 c => Err(self.error(Unexpected(c)))?,
158 };
159
160 let tok = match (tok, self.consume().peek()) {
162 (Integer, Some('b')) => return self.consume().digits::<2>(),
163 (Integer, Some('d')) => return self.consume().digits::<10>(),
164 (Integer, Some('o')) => return self.consume().digits::<8>(),
165 (Integer, Some('x')) => return self.consume().digits::<16>(),
166 (Integer, Some('~')) => return self.consume().digits::<36>(),
167 (Integer, _) => return self.digits::<10>(),
168 (Identifier, Some('\'')) => return self.character(true),
169 (Identifier, Some('#' | '"')) => todo!("Raw strings!"),
170 (Identifier, Some(_)) => return self.identifier(),
171 (Amp, Some('&')) => AmpAmp,
172 (Amp, Some('=')) => AmpEq,
173 (Bang, Some('!')) => BangBang,
174 (Bang, Some('=')) => BangEq,
175 (Bar, Some('|')) => BarBar,
176 (Bar, Some('=')) => BarEq,
177 (Colon, Some(':')) => ColonColon,
178 (Dot, Some('.')) => DotDot,
179 (Eq, Some('=')) => EqEq,
180 (Eq, Some('>')) => FatArrow,
181 (Gt, Some('=')) => GtEq,
182 (Gt, Some('>')) => GtGt,
183 (Hash, Some('!')) => HashBang,
184 (Lt, Some('=')) => LtEq,
185 (Lt, Some('<')) => LtLt,
186 (Minus, Some('=')) => MinusEq,
187 (Minus, Some('>')) => Arrow,
188 (Plus, Some('=')) => PlusEq,
189 (Rem, Some('=')) => RemEq,
190 (Slash, Some('*')) => return Ok(self.block_comment()?.produce(Comment)),
191 (Slash, Some('=')) => SlashEq,
192 (Slash, Some('/')) => return self.line_comment(),
193 (Star, Some('=')) => StarEq,
194 (Xor, Some('=')) => XorEq,
195 (Xor, Some('^')) => XorXor,
196 _ => return Ok(self.produce(tok)),
197 };
198
199 let tok = match (tok, self.consume().peek()) {
201 (HashBang, Some('/')) => return self.line_comment(),
202 (DotDot, Some('.')) => DotDotDot,
203 (DotDot, Some('=')) => DotDotEq,
204 (GtGt, Some('=')) => GtGtEq,
205 (LtLt, Some('=')) => LtLtEq,
206 _ => return Ok(self.produce(tok)),
207 };
208
209 Ok(self.consume().produce(tok))
210 }
211
212 pub fn line_comment(&mut self) -> Result<Token, LexError> {
214 let kind = match self.consume().peek() {
215 Some('/') => TKind::OutDoc,
216 Some('!') => TKind::InDoc,
217 _ => TKind::Comment,
218 };
219 while self.consume().peek().is_some_and(|c| c != '\n') {}
220 let (lexeme, _) = self.as_str();
221 let lexeme = lexeme
222 .strip_prefix("///")
223 .or_else(|| lexeme.strip_prefix("//!"))
224 .map(|lexeme| lexeme.strip_prefix(" ").unwrap_or(lexeme))
225 .unwrap_or(lexeme);
226
227 Ok(self.produce_with_lexeme(kind, Lexeme::String(lexeme.into())))
228 }
229
230 pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
233 self.consume();
234 while let Some(c) = self.take() {
235 match (c, self.peek()) {
236 ('/', Some('*')) => self.block_comment()?,
237 ('*', Some('/')) => return Ok(self.consume()),
238 _ => continue,
239 };
240 }
241 Err(self.error(UnterminatedBlockComment))
242 }
243
244 pub fn identifier(&mut self) -> Result<Token, LexError> {
250 while self.consume().peek().is_some_and(is_xid_continue) {}
251 let (lexeme, _span) = self.as_str();
252 let token = self.produce(TKind::Identifier);
253 Ok(Token {
254 kind: match lexeme {
255 "_" => TKind::Underscore,
256 "as" => TKind::As,
257 "break" => TKind::Break,
258 "catch" => TKind::Catch,
259 "const" => TKind::Const,
260 "continue" => TKind::Continue,
261 "defer" => TKind::Defer,
262 "do" => TKind::Do,
263 "else" => TKind::Else,
264 "enum" => TKind::Enum,
265 "false" => TKind::False,
266 "fn" => TKind::Fn,
267 "for" => TKind::For,
268 "if" => TKind::If,
269 "impl" => TKind::Impl,
270 "in" => TKind::In,
271 "let" => TKind::Let,
272 "loop" => TKind::Loop,
273 "macro" => TKind::Macro,
274 "match" => TKind::Match,
275 "mod" => TKind::Mod,
276 "mut" => TKind::Mut,
277 "pub" => TKind::Pub,
278 "return" => TKind::Return,
279 "static" => TKind::Static,
280 "struct" => TKind::Struct,
281 "true" => TKind::True,
282 "try" => TKind::Try,
283 "type" => TKind::Type,
284 "use" => TKind::Use,
285 "while" => TKind::While,
286 _ => token.kind,
287 },
288 ..token
289 })
290 }
291
292 pub fn character(&mut self, as_int: bool) -> Result<Token, LexError> {
294 let c = match self.consume().take() {
295 Some('\\') => self.escape()?,
296 Some(c) => c,
297 None => '\0',
298 };
299 if !self.take().is_some_and(|c| c == '\'') {
300 return Err(self.error(UnterminatedCharacter));
301 }
302 let (kind, lexeme) = match as_int {
303 true => (TKind::Integer, Lexeme::Integer(c as _, 16)),
304 false => (TKind::Character, Lexeme::Char(c)),
305 };
306 Ok(self.produce_with_lexeme(kind, lexeme))
307 }
308
309 pub fn string(&mut self) -> Result<Token, LexError> {
311 let mut lexeme = String::new();
312 self.consume();
313 loop {
314 lexeme.push(match self.take() {
315 None => Err(self.error(UnterminatedString))?,
316 Some('\\') => self.escape()?,
317 Some('"') => break,
318 Some(c) => c,
319 });
320 }
321 lexeme.shrink_to_fit();
322 Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
323 }
324
325 pub fn escape(&mut self) -> Result<char, LexError> {
327 Ok(
328 match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
329 ' ' => '\u{a0}', '0' => '\0', 'a' => '\x07', 'b' => '\x08', 'e' => '\x1b', 'f' => '\x0c', 'n' => '\n', 'r' => '\r', 't' => '\t', 'u' => self.unicode_escape()?,
339 'x' => self.hex_escape()?,
340 c => c,
341 },
342 )
343 }
344
345 pub fn hex_escape(&mut self) -> Result<char, LexError> {
347 let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
348 char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)))
349 }
350
351 pub fn unicode_escape(&mut self) -> Result<char, LexError> {
353 self.next_if('{')
354 .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
355 let mut out = 0;
356 while let Some(c) = self.take() {
357 if c == '}' {
358 return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
359 }
360 out = out.saturating_mul(16).saturating_add(
361 c.to_digit(16)
362 .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?,
363 );
364 }
365 Err(self.error(UnterminatedUnicodeEscape))
366 }
367
368 pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
373 let mut int: u128 = 0;
374 while let Some(c) = self.peek() {
375 int = match c.to_digit(BASE).ok_or(c) {
376 Err('_') => int,
377 Ok(c) => int
378 .checked_mul(BASE as _)
379 .and_then(|int| int.checked_add(c as _))
380 .ok_or_else(|| self.error(IntegerOverflow))?,
381 _ => break,
382 };
383 self.consume();
384 }
385
386 Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
387 }
388
389 pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
391 let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
392 if let Some(digit) = digit.to_digit(BASE) {
393 Ok(digit)
394 } else {
395 Err(self.error(InvalidDigitForBase(digit, BASE)))
396 }
397 }
398}