Skip to main content

cl_lexer/
lexer.rs

1pub use cl_structures::intern::interned::Symbol;
2
3use crate::error::{LexFailure::*, *};
4use cl_structures::span::Span;
5use cl_token::*;
6use std::{iter::Peekable, ops::Range, str::CharIndices};
7use unicode_ident::{is_xid_continue, is_xid_start};
8
9#[derive(Clone, Debug)]
10pub struct Lexer<'t> {
11    path: Symbol,
12    /// The source text
13    text: &'t str,
14    /// A peekable iterator over the source text
15    iter: Peekable<CharIndices<'t>>,
16    /// The start of the current token
17    head: u32,
18    /// The end of the current token
19    tail: u32,
20}
21
22impl<'t> Lexer<'t> {
23    /// Constructs a new Lexer from some text
24    pub fn new(path: Symbol, text: &'t str) -> Self {
25        let iter = text.char_indices().peekable();
26        Self { path, text, iter, head: 0, tail: 0 }
27    }
28
29    /// Gets the [struct@Span] of the current token.
30    ///
31    /// When called from outside [Lexer::scan], this will return
32    /// a zero-length [struct@Span] marking the current lexer location.
33    pub const fn span(&self) -> Span {
34        Span(self.path, self.head, self.tail)
35    }
36
37    /// Peeks the next character without advancing the lexer
38    pub fn peek(&mut self) -> Option<char> {
39        self.iter.peek().map(|&(_, c)| c)
40    }
41
42    /// Advances the tail to the current character index
43    fn advance_tail(&mut self) {
44        match self.iter.peek() {
45            Some(&(idx, _)) => self.tail = idx as u32,
46            None => self.tail = self.text.len() as _,
47        }
48    }
49
50    /// Takes the last character
51    fn take(&mut self) -> Option<char> {
52        let (_, c) = self.iter.next()?;
53        self.advance_tail();
54        Some(c)
55    }
56
57    /// Takes the next character if it matches `expected`, else returns [`None`].
58    fn next_if(&mut self, expected: char) -> Option<char> {
59        let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
60        self.advance_tail();
61        Some(c)
62    }
63
64    /// Consumes the last-peeked character, advancing the tail
65    fn consume(&mut self) -> &mut Self {
66        self.iter.next();
67        self.advance_tail();
68        self
69    }
70
71    /// Produces a [`LexError`] at the start of the current token
72    const fn error(&self, res: LexFailure) -> LexError {
73        LexError { pos: self.span(), res }
74    }
75
76    /// Gets the Lexer's current &[str] lexeme and [struct@Span]
77    fn as_str(&self) -> (&'t str, Span) {
78        let span = self.span();
79        (&self.text[Range::from(span)], span)
80    }
81
82    /// Produces a Token
83    fn produce(&mut self, kind: TKind) -> Token {
84        self.advance_tail();
85        let (lexeme, span) = self.as_str();
86        self.head = self.tail;
87        Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
88    }
89
90    fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
91        self.advance_tail();
92        let span = self.span();
93        self.head = self.tail;
94        Token { lexeme, kind, span }
95    }
96
97    /// Consumes 0 or more whitespace
98    fn skip_whitespace(&mut self) -> &mut Self {
99        while self.peek().is_some_and(char::is_whitespace) {
100            let _ = self.consume();
101        }
102        self
103    }
104
105    /// Marks the start of a new [Token]'s [struct@Span]
106    const fn start_token(&mut self) -> &mut Self {
107        self.head = self.tail;
108        self
109    }
110
111    /// Scans forward until it finds the next Token in the input
112    pub fn scan(&mut self) -> Result<Token, LexError> {
113        use TKind::*;
114        // !"#%&'()*+,-./:;<=>?@[\\]^`{|}~
115        let tok = match self
116            .skip_whitespace()
117            .start_token()
118            .peek()
119            .ok_or_else(|| self.error(EOF))?
120        {
121            '!' => Bang,
122            '"' => return self.string(),
123            '#' => Hash,
124            '$' => Dollar,
125            '%' => Rem,
126            '&' => Amp,
127            '\'' => return self.character(false),
128            '(' => LParen,
129            ')' => RParen,
130            '*' => Star,
131            '+' => Plus,
132            ',' => Comma,
133            '-' => Minus,
134            '.' => Dot,
135            '/' => Slash,
136            '0' => Integer,
137            '1'..='9' => return self.digits::<10>(),
138            ':' => Colon,
139            ';' => Semi,
140            '<' => Lt,
141            '=' => Eq,
142            '>' => Gt,
143            '?' => Question,
144            '@' => At,
145            '[' => LBrack,
146            '\\' => Backslash,
147            ']' => RBrack,
148            '^' => Xor,
149            '`' => Grave,
150            '{' => LCurly,
151            '|' => Bar,
152            '}' => RCurly,
153            '~' => Tilde,
154            '_' => return self.identifier(),
155            'r' => Identifier, // "Raw" string/character
156            c if is_xid_start(c) => return self.identifier(),
157            c => Err(self.error(Unexpected(c)))?,
158        };
159
160        // Handle digraphs
161        let tok = match (tok, self.consume().peek()) {
162            (Integer, Some('b')) => return self.consume().digits::<2>(),
163            (Integer, Some('d')) => return self.consume().digits::<10>(),
164            (Integer, Some('o')) => return self.consume().digits::<8>(),
165            (Integer, Some('x')) => return self.consume().digits::<16>(),
166            (Integer, Some('~')) => return self.consume().digits::<36>(),
167            (Integer, _) => return self.digits::<10>(),
168            (Identifier, Some('\'')) => return self.character(true),
169            (Identifier, Some('#' | '"')) => todo!("Raw strings!"),
170            (Identifier, Some(_)) => return self.identifier(),
171            (Amp, Some('&')) => AmpAmp,
172            (Amp, Some('=')) => AmpEq,
173            (Bang, Some('!')) => BangBang,
174            (Bang, Some('=')) => BangEq,
175            (Bar, Some('|')) => BarBar,
176            (Bar, Some('=')) => BarEq,
177            (Colon, Some(':')) => ColonColon,
178            (Dot, Some('.')) => DotDot,
179            (Eq, Some('=')) => EqEq,
180            (Eq, Some('>')) => FatArrow,
181            (Gt, Some('=')) => GtEq,
182            (Gt, Some('>')) => GtGt,
183            (Hash, Some('!')) => HashBang,
184            (Lt, Some('=')) => LtEq,
185            (Lt, Some('<')) => LtLt,
186            (Minus, Some('=')) => MinusEq,
187            (Minus, Some('>')) => Arrow,
188            (Plus, Some('=')) => PlusEq,
189            (Rem, Some('=')) => RemEq,
190            (Slash, Some('*')) => return Ok(self.block_comment()?.produce(Comment)),
191            (Slash, Some('=')) => SlashEq,
192            (Slash, Some('/')) => return self.line_comment(),
193            (Star, Some('=')) => StarEq,
194            (Xor, Some('=')) => XorEq,
195            (Xor, Some('^')) => XorXor,
196            _ => return Ok(self.produce(tok)),
197        };
198
199        // Handle trigraphs
200        let tok = match (tok, self.consume().peek()) {
201            (HashBang, Some('/')) => return self.line_comment(),
202            (DotDot, Some('.')) => DotDotDot,
203            (DotDot, Some('=')) => DotDotEq,
204            (GtGt, Some('=')) => GtGtEq,
205            (LtLt, Some('=')) => LtLtEq,
206            _ => return Ok(self.produce(tok)),
207        };
208
209        Ok(self.consume().produce(tok))
210    }
211
212    /// Consumes characters until the lexer reaches a newline `'\n'`
213    pub fn line_comment(&mut self) -> Result<Token, LexError> {
214        let kind = match self.consume().peek() {
215            Some('/') => TKind::OutDoc,
216            Some('!') => TKind::InDoc,
217            _ => TKind::Comment,
218        };
219        while self.consume().peek().is_some_and(|c| c != '\n') {}
220        let (lexeme, _) = self.as_str();
221        let lexeme = lexeme
222            .strip_prefix("///")
223            .or_else(|| lexeme.strip_prefix("//!"))
224            .map(|lexeme| lexeme.strip_prefix(" ").unwrap_or(lexeme))
225            .unwrap_or(lexeme);
226
227        Ok(self.produce_with_lexeme(kind, Lexeme::String(lexeme.into())))
228    }
229
230    /// Consumes characters until the lexer reaches the end of a *nested* block comment.
231    /// This allows you to arbitrarily comment out code, even if that code has a block comment.
232    pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
233        self.consume();
234        while let Some(c) = self.take() {
235            match (c, self.peek()) {
236                ('/', Some('*')) => self.block_comment()?,
237                ('*', Some('/')) => return Ok(self.consume()),
238                _ => continue,
239            };
240        }
241        Err(self.error(UnterminatedBlockComment))
242    }
243
244    /// Consumes characters until it reaches a character not in [`is_xid_continue`].
245    ///
246    /// Always consumes the first character.
247    ///
248    /// Maps the result to either a [`TKind::Identifier`] or a [`TKind`] keyword.
249    pub fn identifier(&mut self) -> Result<Token, LexError> {
250        while self.consume().peek().is_some_and(is_xid_continue) {}
251        let (lexeme, _span) = self.as_str();
252        let token = self.produce(TKind::Identifier);
253        Ok(Token {
254            kind: match lexeme {
255                "_" => TKind::Underscore,
256                "as" => TKind::As,
257                "break" => TKind::Break,
258                "catch" => TKind::Catch,
259                "const" => TKind::Const,
260                "continue" => TKind::Continue,
261                "defer" => TKind::Defer,
262                "do" => TKind::Do,
263                "else" => TKind::Else,
264                "enum" => TKind::Enum,
265                "false" => TKind::False,
266                "fn" => TKind::Fn,
267                "for" => TKind::For,
268                "if" => TKind::If,
269                "impl" => TKind::Impl,
270                "in" => TKind::In,
271                "let" => TKind::Let,
272                "loop" => TKind::Loop,
273                "macro" => TKind::Macro,
274                "match" => TKind::Match,
275                "mod" => TKind::Mod,
276                "mut" => TKind::Mut,
277                "pub" => TKind::Pub,
278                "return" => TKind::Return,
279                "static" => TKind::Static,
280                "struct" => TKind::Struct,
281                "true" => TKind::True,
282                "try" => TKind::Try,
283                "type" => TKind::Type,
284                "use" => TKind::Use,
285                "while" => TKind::While,
286                _ => token.kind,
287            },
288            ..token
289        })
290    }
291
292    /// Eagerly parses a character literal starting at the current lexer position.
293    pub fn character(&mut self, as_int: bool) -> Result<Token, LexError> {
294        let c = match self.consume().take() {
295            Some('\\') => self.escape()?,
296            Some(c) => c,
297            None => '\0',
298        };
299        if !self.take().is_some_and(|c| c == '\'') {
300            return Err(self.error(UnterminatedCharacter));
301        }
302        let (kind, lexeme) = match as_int {
303            true => (TKind::Integer, Lexeme::Integer(c as _, 16)),
304            false => (TKind::Character, Lexeme::Char(c)),
305        };
306        Ok(self.produce_with_lexeme(kind, lexeme))
307    }
308
309    // Eagerly parses a string literal starting at the current lexer position.
310    pub fn string(&mut self) -> Result<Token, LexError> {
311        let mut lexeme = String::new();
312        self.consume();
313        loop {
314            lexeme.push(match self.take() {
315                None => Err(self.error(UnterminatedString))?,
316                Some('\\') => self.escape()?,
317                Some('"') => break,
318                Some(c) => c,
319            });
320        }
321        lexeme.shrink_to_fit();
322        Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
323    }
324
325    /// Parses a single escape sequence into its resulting char value.
326    pub fn escape(&mut self) -> Result<char, LexError> {
327        Ok(
328            match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
329                ' ' => '\u{a0}', // Non-breaking space
330                '0' => '\0',     // C0 Null Character
331                'a' => '\x07',   // C0 Acknowledge
332                'b' => '\x08',   // C0 Bell
333                'e' => '\x1b',   // C0 Escape
334                'f' => '\x0c',   // Form Feed
335                'n' => '\n',     // New Line
336                'r' => '\r',     // Carriage Return
337                't' => '\t',     // Tab
338                'u' => self.unicode_escape()?,
339                'x' => self.hex_escape()?,
340                c => c,
341            },
342        )
343    }
344
345    /// Parses two hex-digits and constructs a [char] out of them.
346    pub fn hex_escape(&mut self) -> Result<char, LexError> {
347        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
348        char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)))
349    }
350
351    /// Parses a sequence of `{}`-bracketed hex-digits and constructs a [char] out of them.
352    pub fn unicode_escape(&mut self) -> Result<char, LexError> {
353        self.next_if('{')
354            .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
355        let mut out = 0;
356        while let Some(c) = self.take() {
357            if c == '}' {
358                return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
359            }
360            out = out.saturating_mul(16).saturating_add(
361                c.to_digit(16)
362                    .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?,
363            );
364        }
365        Err(self.error(UnterminatedUnicodeEscape))
366    }
367
368    /// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36.
369    ///
370    /// If the sequence of digits exceeds the bounds of a [u128], the resulting number will wrap
371    /// around 2^128.
372    pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
373        let mut int: u128 = 0;
374        while let Some(c) = self.peek() {
375            int = match c.to_digit(BASE).ok_or(c) {
376                Err('_') => int,
377                Ok(c) => int
378                    .checked_mul(BASE as _)
379                    .and_then(|int| int.checked_add(c as _))
380                    .ok_or_else(|| self.error(IntegerOverflow))?,
381                _ => break,
382            };
383            self.consume();
384        }
385
386        Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
387    }
388
389    /// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36.
390    pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
391        let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
392        if let Some(digit) = digit.to_digit(BASE) {
393            Ok(digit)
394        } else {
395            Err(self.error(InvalidDigitForBase(digit, BASE)))
396        }
397    }
398}