Skip to main content

cl_lexer/
lib.rs

1//! A lobster
2use std::{iter::Peekable, ops::Range, str::CharIndices};
3use unicode_ident::{is_xid_continue, is_xid_start};
4
5use cl_structures::span::Span;
6use cl_token::*;
7
8pub use cl_structures::intern::interned::Symbol;
9
10#[derive(Clone, Copy, Debug, PartialEq, Eq)]
11pub struct LexError {
12    pub pos: Span,
13    pub res: LexFailure,
14}
15
16impl std::error::Error for LexError {}
17impl std::fmt::Display for LexError {
18    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
19        let Self { pos, res } = self;
20        write!(f, "{pos}: {res}")
21    }
22}
23
24#[derive(Clone, Copy, Debug, PartialEq, Eq)]
25pub enum LexFailure {
26    /// Reached end of file
27    EOF,
28    UnexpectedEOF,
29    Unexpected(char),
30    UnterminatedBlockComment,
31    UnterminatedCharacter,
32    UnterminatedString,
33    UnterminatedUnicodeEscape,
34    InvalidUnicodeEscape(u32),
35    InvalidDigitForBase(char, u32),
36    IntegerOverflow,
37}
38use LexFailure::*;
39pub use LexFailure::{EOF, UnexpectedEOF};
40
41impl std::fmt::Display for LexFailure {
42    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
43        match self {
44            Self::EOF => "EOF".fmt(f),
45            Self::UnexpectedEOF => "Unexpected EOF".fmt(f),
46            Self::Unexpected(c) => write!(f, "Character {c:?}"),
47            Self::UnterminatedBlockComment => "Unterminated Block Comment".fmt(f),
48            Self::UnterminatedCharacter => "Unterminated Character".fmt(f),
49            Self::UnterminatedString => "Unterminated String".fmt(f),
50            Self::UnterminatedUnicodeEscape => "Unterminated Unicode Escape".fmt(f),
51            Self::InvalidUnicodeEscape(hex) => {
52                write!(f, "'\\u{{{hex:x}}}' is not a valid UTF-8 codepoint")
53            }
54            Self::InvalidDigitForBase(digit, base) => {
55                write!(f, "Invalid digit {digit} for base {base}")
56            }
57            Self::IntegerOverflow => "Integer literal does not fit in 128 bits".fmt(f),
58        }
59    }
60}
61
62#[derive(Clone, Debug)]
63pub struct Lexer<'t> {
64    path: Symbol,
65    /// The source text
66    text: &'t str,
67    /// A peekable iterator over the source text
68    iter: Peekable<CharIndices<'t>>,
69    /// The start of the current token
70    head: u32,
71    /// The end of the current token
72    tail: u32,
73}
74
75impl<'t> Lexer<'t> {
76    /// Constructs a new Lexer from some text
77    pub fn new(path: Symbol, text: &'t str) -> Self {
78        let iter = text.char_indices().peekable();
79        Self { path, text, iter, head: 0, tail: 0 }
80    }
81
82    /// Gets the [struct@Span] of the current token.
83    ///
84    /// When called from outside [Lexer::scan], this will return
85    /// a zero-sized span marking the current lexer location.
86    pub const fn span(&self) -> Span {
87        Span(self.path, self.head, self.tail)
88    }
89
90    /// Peeks the next character without advancing the lexer
91    pub fn peek(&mut self) -> Option<char> {
92        self.iter.peek().map(|&(_, c)| c)
93    }
94
95    /// Advances the tail to the current character index
96    fn advance_tail(&mut self) {
97        match self.iter.peek() {
98            Some(&(idx, _)) => self.tail = idx as u32,
99            None => self.tail = self.text.len() as _,
100        }
101    }
102
103    /// Takes the last character
104    fn take(&mut self) -> Option<char> {
105        let (_, c) = self.iter.next()?;
106        self.advance_tail();
107        Some(c)
108    }
109
110    fn next_if(&mut self, expected: char) -> Option<char> {
111        let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
112        self.advance_tail();
113        Some(c)
114    }
115
116    /// Consumes the last-peeked character, advancing the tail
117    fn consume(&mut self) -> &mut Self {
118        self.iter.next();
119        self.advance_tail();
120        self
121    }
122
123    /// Produces a [`LexError`] at the start of the current token
124    const fn error(&self, res: LexFailure) -> LexError {
125        LexError { pos: self.span(), res }
126    }
127
128    /// Gets the Lexer's current &[str] lexeme and [struct@Span]
129    fn as_str(&self) -> (&'t str, Span) {
130        let span = self.span();
131        (&self.text[Range::from(span)], span)
132    }
133
134    /// Produces a Token
135    fn produce(&mut self, kind: TKind) -> Token {
136        self.advance_tail();
137        let (lexeme, span) = self.as_str();
138        self.head = self.tail;
139        Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
140    }
141
142    fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
143        self.advance_tail();
144        let span = self.span();
145        self.head = self.tail;
146        Token { lexeme, kind, span }
147    }
148
149    /// Consumes 0 or more whitespace
150    fn skip_whitespace(&mut self) -> &mut Self {
151        while self.peek().is_some_and(char::is_whitespace) {
152            let _ = self.consume();
153        }
154        self
155    }
156
157    const fn start_token(&mut self) -> &mut Self {
158        self.head = self.tail;
159        self
160    }
161
162    /// Scans forward until it finds the next Token in the input
163    pub fn scan(&mut self) -> Result<Token, LexError> {
164        use TKind::*;
165        // !"#%&'()*+,-./:;<=>?@[\\]^`{|}~
166        let tok = match self
167            .skip_whitespace()
168            .start_token()
169            .peek()
170            .ok_or_else(|| self.error(EOF))?
171        {
172            '!' => Bang,
173            '"' => return self.string(),
174            '#' => Hash,
175            '$' => Dollar,
176            '%' => Rem,
177            '&' => Amp,
178            '\'' => return self.character(),
179            '(' => LParen,
180            ')' => RParen,
181            '*' => Star,
182            '+' => Plus,
183            ',' => return self.consume().trailing(Comma),
184            '-' => Minus,
185            '.' => Dot,
186            '/' => Slash,
187            '0' => Integer,
188            '1'..='9' => return self.digits::<10>(),
189            ':' => Colon,
190            ';' => Semi,
191            '<' => Lt,
192            '=' => Eq,
193            '>' => Gt,
194            '?' => Question,
195            '@' => At,
196            '[' => LBrack,
197            '\\' => Backslash,
198            ']' => RBrack,
199            '^' => Xor,
200            '`' => Grave,
201            '{' => LCurly,
202            '|' => Bar,
203            '}' => RCurly,
204            '~' => Tilde,
205            '_' => return self.identifier(),
206            c if is_xid_start(c) => return self.identifier(),
207            c => Err(self.error(Unexpected(c)))?,
208        };
209
210        // Handle digraphs
211        let tok = match (tok, self.consume().peek()) {
212            (Integer, Some('b')) => return self.consume().digits::<2>(),
213            (Integer, Some('d')) => return self.consume().digits::<10>(),
214            (Integer, Some('o')) => return self.consume().digits::<8>(),
215            (Integer, Some('x')) => return self.consume().digits::<16>(),
216            (Integer, Some('~')) => return self.consume().digits::<36>(),
217            (Integer, _) => return self.digits::<10>(),
218            (Amp, Some('&')) => AmpAmp,
219            (Amp, Some('=')) => AmpEq,
220            (Bang, Some('!')) => BangBang,
221            (Bang, Some('=')) => BangEq,
222            (Bar, Some('|')) => BarBar,
223            (Bar, Some('=')) => BarEq,
224            (Colon, Some(':')) => ColonColon,
225            (Dot, Some('.')) => DotDot,
226            (Eq, Some('=')) => EqEq,
227            (Eq, Some('>')) => FatArrow,
228            (Gt, Some('=')) => GtEq,
229            (Gt, Some('>')) => GtGt,
230            (Hash, Some('!')) => HashBang,
231            (Lt, Some('=')) => LtEq,
232            (Lt, Some('<')) => LtLt,
233            (Minus, Some('=')) => MinusEq,
234            (Minus, Some('>')) => Arrow,
235            (Plus, Some('=')) => PlusEq,
236            (Rem, Some('=')) => RemEq,
237            (Slash, Some('*')) => return Ok(self.block_comment()?.produce(Comment)),
238            (Slash, Some('=')) => SlashEq,
239            (Slash, Some('/')) => return self.line_comment(),
240            (Star, Some('=')) => StarEq,
241            (Xor, Some('=')) => XorEq,
242            (Xor, Some('^')) => XorXor,
243            _ => return Ok(self.produce(tok)),
244        };
245
246        // Handle trigraphs
247        let tok = match (tok, self.consume().peek()) {
248            (HashBang, Some('/')) => return self.line_comment(),
249            (DotDot, Some('.')) => DotDotDot,
250            (DotDot, Some('=')) => DotDotEq,
251            (GtGt, Some('=')) => GtGtEq,
252            (LtLt, Some('=')) => LtLtEq,
253            _ => return Ok(self.produce(tok)),
254        };
255
256        Ok(self.consume().produce(tok))
257    }
258
259    /// Elides the trailing [Token] `kind` when it comes before a list terminator.
260    pub fn trailing(&mut self, kind: TKind) -> Result<Token, LexError> {
261        Ok(match self.skip_whitespace().peek() {
262            // Some(')') => self.consume().produce(TKind::RParen), // maybe.
263            // Some(']') => self.consume().produce(TKind::RBrack),
264            Some('}') => self.consume().produce(TKind::RCurly),
265            _ => self.produce(kind),
266        })
267    }
268
269    /// Consumes characters until the lexer reaches a newline `'\n'`
270    pub fn line_comment(&mut self) -> Result<Token, LexError> {
271        let kind = match self.consume().peek() {
272            Some('/') => TKind::OutDoc,
273            Some('!') => TKind::InDoc,
274            _ => TKind::Comment,
275        };
276        while self.consume().peek().is_some_and(|c| c != '\n') {}
277        let (lexeme, _) = self.as_str();
278        let lexeme = lexeme
279            .strip_prefix("///")
280            .or_else(|| lexeme.strip_prefix("//!"))
281            .map(|lexeme| lexeme.strip_prefix(" ").unwrap_or(lexeme))
282            .unwrap_or(lexeme);
283
284        Ok(self.produce_with_lexeme(kind, Lexeme::String(lexeme.into())))
285    }
286
287    /// Consumes characters until the lexer reaches the end of a *nested* block comment.
288    /// This allows you to arbitrarily comment out code, even if that code has a block comment.
289    pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
290        self.consume();
291        while let Some(c) = self.take() {
292            match (c, self.peek()) {
293                ('/', Some('*')) => self.block_comment()?,
294                ('*', Some('/')) => return Ok(self.consume()),
295                _ => continue,
296            };
297        }
298        Err(self.error(UnterminatedBlockComment))
299    }
300
301    /// Consumes characters until it reaches a character not in [`is_xid_continue`].
302    ///
303    /// Always consumes the first character.
304    ///
305    /// Maps the result to either a [`TKind::Identifier`] or a [`TKind`] keyword.
306    pub fn identifier(&mut self) -> Result<Token, LexError> {
307        while self.consume().peek().is_some_and(is_xid_continue) {}
308        let (lexeme, _span) = self.as_str();
309        let token = self.produce(TKind::Identifier);
310        Ok(Token {
311            kind: match lexeme {
312                "as" => TKind::As,
313                "break" => TKind::Break,
314                "const" => TKind::Const,
315                "continue" => TKind::Continue,
316                "do" => TKind::Do,
317                "else" => TKind::Else,
318                "enum" => TKind::Enum,
319                "false" => TKind::False,
320                "fn" => TKind::Fn,
321                "for" => TKind::For,
322                "if" => TKind::If,
323                "impl" => TKind::Impl,
324                "in" => TKind::In,
325                "let" => TKind::Let,
326                "loop" => TKind::Loop,
327                "macro" => TKind::Macro,
328                "match" => TKind::Match,
329                "mod" => TKind::Mod,
330                "mut" => TKind::Mut,
331                "pub" => TKind::Pub,
332                "return" => TKind::Return,
333                "static" => TKind::Static,
334                "struct" => TKind::Struct,
335                "then" => TKind::Do,
336                "true" => TKind::True,
337                "type" => TKind::Type,
338                "use" => TKind::Use,
339                "while" => TKind::While,
340                _ => token.kind,
341            },
342            ..token
343        })
344    }
345
346    /// Eagerly parses a character literal starting at the current lexer position.
347    pub fn character(&mut self) -> Result<Token, LexError> {
348        let c = match self.consume().take() {
349            Some('\\') => self.escape()?,
350            Some(c) => c,
351            None => '\0',
352        };
353        if self.take().is_some_and(|c| c == '\'') {
354            Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
355        } else {
356            Err(self.error(UnterminatedCharacter))
357        }
358    }
359
360    // Eagerly parses a string literal starting at the current lexer position.
361    pub fn string(&mut self) -> Result<Token, LexError> {
362        let mut lexeme = String::new();
363        self.consume();
364        loop {
365            lexeme.push(match self.take() {
366                None => Err(self.error(UnterminatedString))?,
367                Some('\\') => self.escape()?,
368                Some('"') => break,
369                Some(c) => c,
370            });
371        }
372        lexeme.shrink_to_fit();
373        Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
374    }
375
376    /// Parses a single escape sequence into its resulting char value.
377    pub fn escape(&mut self) -> Result<char, LexError> {
378        Ok(
379            match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
380                ' ' => '\u{a0}', // Non-breaking space
381                '0' => '\0',     // C0 Null Character
382                'a' => '\x07',   // C0 Acknowledge
383                'b' => '\x08',   // C0 Bell
384                'e' => '\x1b',   // C0 Escape
385                'f' => '\x0c',   // Form Feed
386                'n' => '\n',     // New Line
387                'r' => '\r',     // Carriage Return
388                't' => '\t',     // Tab
389                'u' => self.unicode_escape()?,
390                'x' => self.hex_escape()?,
391                c => c,
392            },
393        )
394    }
395
396    /// Parses two hex-digits and constructs a [char] out of them.
397    pub fn hex_escape(&mut self) -> Result<char, LexError> {
398        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
399        char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)))
400    }
401
402    /// Parses a sequence of `{}`-bracketed hex-digits and constructs a [char] out of them.
403    pub fn unicode_escape(&mut self) -> Result<char, LexError> {
404        self.next_if('{')
405            .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
406        let mut out = 0;
407        while let Some(c) = self.take() {
408            if c == '}' {
409                return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
410            }
411            out = out.saturating_mul(16).saturating_add(
412                c.to_digit(16)
413                    .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?,
414            );
415        }
416        Err(self.error(UnterminatedUnicodeEscape))
417    }
418
419    /// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36.
420    ///
421    /// If the sequence of digits exceeds the bounds of a [u128], the resulting number will wrap
422    /// around 2^128.
423    pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
424        let mut int: u128 = 0;
425        while let Some(c) = self.peek() {
426            int = match c.to_digit(BASE).ok_or(c) {
427                Err('_') => int,
428                Ok(c) => int
429                    .checked_mul(BASE as _)
430                    .and_then(|int| int.checked_add(c as _))
431                    .ok_or_else(|| self.error(IntegerOverflow))?,
432                _ => break,
433            };
434            self.consume();
435        }
436
437        Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
438    }
439
440    /// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36.
441    pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
442        let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
443        if let Some(digit) = digit.to_digit(BASE) {
444            Ok(digit)
445        } else {
446            Err(self.error(InvalidDigitForBase(digit, BASE)))
447        }
448    }
449}