Skip to main content

cl_lexer/
lib.rs

1//! A lobster
2use std::{iter::Peekable, ops::Range, str::CharIndices};
3use unicode_ident::{is_xid_continue, is_xid_start};
4
5use cl_structures::span::Span;
6use cl_token::*;
7
8pub use cl_structures::intern::interned::Symbol;
9
10#[derive(Clone, Copy, Debug, PartialEq, Eq)]
11pub struct LexError {
12    pub pos: Span,
13    pub res: LexFailure,
14}
15
16impl std::error::Error for LexError {}
17impl std::fmt::Display for LexError {
18    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
19        let Self { pos, res } = self;
20        write!(f, "{pos}: {res}")
21    }
22}
23
24#[derive(Clone, Copy, Debug, PartialEq, Eq)]
25pub enum LexFailure {
26    /// Reached end of file
27    EOF,
28    UnexpectedEOF,
29    Unexpected(char),
30    UnterminatedBlockComment,
31    UnterminatedCharacter,
32    UnterminatedString,
33    UnterminatedUnicodeEscape,
34    InvalidUnicodeEscape(u32),
35    InvalidDigitForBase(char, u32),
36    IntegerOverflow,
37}
38use LexFailure::*;
39pub use LexFailure::{EOF, UnexpectedEOF};
40
41impl std::fmt::Display for LexFailure {
42    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
43        match self {
44            Self::EOF => "EOF".fmt(f),
45            Self::UnexpectedEOF => "Unexpected EOF".fmt(f),
46            Self::Unexpected(c) => write!(f, "Character {c:?}"),
47            Self::UnterminatedBlockComment => "Unterminated Block Comment".fmt(f),
48            Self::UnterminatedCharacter => "Unterminated Character".fmt(f),
49            Self::UnterminatedString => "Unterminated String".fmt(f),
50            Self::UnterminatedUnicodeEscape => "Unterminated Unicode Escape".fmt(f),
51            Self::InvalidUnicodeEscape(hex) => {
52                write!(f, "'\\u{{{hex:x}}}' is not a valid UTF-8 codepoint")
53            }
54            Self::InvalidDigitForBase(digit, base) => {
55                write!(f, "Invalid digit {digit} for base {base}")
56            }
57            Self::IntegerOverflow => "Integer literal does not fit in 128 bits".fmt(f),
58        }
59    }
60}
61
62#[derive(Clone, Debug)]
63pub struct Lexer<'t> {
64    path: Symbol,
65    /// The source text
66    text: &'t str,
67    /// A peekable iterator over the source text
68    iter: Peekable<CharIndices<'t>>,
69    /// The start of the current token
70    head: u32,
71    /// The end of the current token
72    tail: u32,
73}
74
75impl<'t> Lexer<'t> {
76    /// Constructs a new Lexer from some text
77    pub fn new(path: Symbol, text: &'t str) -> Self {
78        let iter = text.char_indices().peekable();
79        Self { path, text, iter, head: 0, tail: 0 }
80    }
81
82    /// Gets the [struct@Span] of the current token.
83    ///
84    /// When called from outside [Lexer::scan], this will return
85    /// a zero-sized span marking the current lexer location.
86    pub const fn span(&self) -> Span {
87        Span(self.path, self.head, self.tail)
88    }
89
90    /// Peeks the next character without advancing the lexer
91    pub fn peek(&mut self) -> Option<char> {
92        self.iter.peek().map(|&(_, c)| c)
93    }
94
95    /// Advances the tail to the current character index
96    fn advance_tail(&mut self) {
97        match self.iter.peek() {
98            Some(&(idx, _)) => self.tail = idx as u32,
99            None => self.tail = self.text.len() as _,
100        }
101    }
102
103    /// Takes the last character
104    fn take(&mut self) -> Option<char> {
105        let (_, c) = self.iter.next()?;
106        self.advance_tail();
107        Some(c)
108    }
109
110    fn next_if(&mut self, expected: char) -> Option<char> {
111        let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
112        self.advance_tail();
113        Some(c)
114    }
115
116    /// Consumes the last-peeked character, advancing the tail
117    fn consume(&mut self) -> &mut Self {
118        self.iter.next();
119        self.advance_tail();
120        self
121    }
122
123    /// Produces a [`LexError`] at the start of the current token
124    const fn error(&self, res: LexFailure) -> LexError {
125        LexError { pos: self.span(), res }
126    }
127
128    /// Gets the Lexer's current &[str] lexeme and [struct@Span]
129    fn as_str(&self) -> (&'t str, Span) {
130        let span = self.span();
131        (&self.text[Range::from(span)], span)
132    }
133
134    /// Produces a Token
135    fn produce(&mut self, kind: TKind) -> Token {
136        self.advance_tail();
137        let (lexeme, span) = self.as_str();
138        self.head = self.tail;
139        Token { lexeme: Lexeme::String(lexeme.to_owned()), kind, span }
140    }
141
142    fn produce_with_lexeme(&mut self, kind: TKind, lexeme: Lexeme) -> Token {
143        self.advance_tail();
144        let span = self.span();
145        self.head = self.tail;
146        Token { lexeme, kind, span }
147    }
148
149    /// Consumes 0 or more whitespace
150    fn skip_whitespace(&mut self) -> &mut Self {
151        while self.peek().is_some_and(char::is_whitespace) {
152            let _ = self.consume();
153        }
154        self
155    }
156
157    const fn start_token(&mut self) -> &mut Self {
158        self.head = self.tail;
159        self
160    }
161
162    /// Scans forward until it finds the next Token in the input
163    pub fn scan(&mut self) -> Result<Token, LexError> {
164        use TKind::*;
165        // !"#%&'()*+,-./:;<=>?@[\\]^`{|}~
166        let tok = match self
167            .skip_whitespace()
168            .start_token()
169            .peek()
170            .ok_or_else(|| self.error(EOF))?
171        {
172            '!' => Bang,
173            '"' => return self.string(),
174            '#' => Hash,
175            '$' => Dollar,
176            '%' => Rem,
177            '&' => Amp,
178            '\'' => return self.character(),
179            '(' => LParen,
180            ')' => RParen,
181            '*' => Star,
182            '+' => Plus,
183            ',' => return self.consume().trailing(Comma),
184            '-' => Minus,
185            '.' => Dot,
186            '/' => Slash,
187            '0' => Integer,
188            '1'..='9' => return self.digits::<10>(),
189            ':' => Colon,
190            ';' => Semi,
191            '<' => Lt,
192            '=' => Eq,
193            '>' => Gt,
194            '?' => Question,
195            '@' => At,
196            '[' => LBrack,
197            '\\' => Backslash,
198            ']' => RBrack,
199            '^' => Xor,
200            '`' => Grave,
201            '{' => LCurly,
202            '|' => Bar,
203            '}' => RCurly,
204            '~' => Tilde,
205            '_' => return self.identifier(),
206            c if is_xid_start(c) => return self.identifier(),
207            c => Err(self.error(Unexpected(c)))?,
208        };
209
210        // Handle digraphs
211        let tok = match (tok, self.consume().peek()) {
212            (Integer, Some('b')) => return self.consume().digits::<2>(),
213            (Integer, Some('d')) => return self.consume().digits::<10>(),
214            (Integer, Some('o')) => return self.consume().digits::<8>(),
215            (Integer, Some('x')) => return self.consume().digits::<16>(),
216            (Integer, Some('~')) => return self.consume().digits::<36>(),
217            (Integer, _) => return self.digits::<10>(),
218            (Amp, Some('&')) => AmpAmp,
219            (Amp, Some('=')) => AmpEq,
220            (Bang, Some('!')) => BangBang,
221            (Bang, Some('=')) => BangEq,
222            (Bar, Some('|')) => BarBar,
223            (Bar, Some('=')) => BarEq,
224            (Colon, Some(':')) => ColonColon,
225            (Dot, Some('.')) => DotDot,
226            (Eq, Some('=')) => EqEq,
227            (Eq, Some('>')) => FatArrow,
228            (Gt, Some('=')) => GtEq,
229            (Gt, Some('>')) => GtGt,
230            (Hash, Some('!')) => HashBang,
231            (Lt, Some('=')) => LtEq,
232            (Lt, Some('<')) => LtLt,
233            (Minus, Some('=')) => MinusEq,
234            (Minus, Some('>')) => Arrow,
235            (Plus, Some('=')) => PlusEq,
236            (Rem, Some('=')) => RemEq,
237            (Slash, Some('*')) => return Ok(self.block_comment()?.produce(Comment)),
238            (Slash, Some('=')) => SlashEq,
239            (Slash, Some('/')) => return self.line_comment(),
240            (Star, Some('=')) => StarEq,
241            (Xor, Some('=')) => XorEq,
242            (Xor, Some('^')) => XorXor,
243            _ => return Ok(self.produce(tok)),
244        };
245
246        // Handle trigraphs
247        let tok = match (tok, self.consume().peek()) {
248            (HashBang, Some('/')) => return self.line_comment(),
249            (DotDot, Some('.')) => DotDotDot,
250            (DotDot, Some('=')) => DotDotEq,
251            (GtGt, Some('=')) => GtGtEq,
252            (LtLt, Some('=')) => LtLtEq,
253            _ => return Ok(self.produce(tok)),
254        };
255
256        Ok(self.consume().produce(tok))
257    }
258
259    /// Elides the trailing [Token] `kind` when it comes before a list terminator.
260    pub fn trailing(&mut self, kind: TKind) -> Result<Token, LexError> {
261        Ok(match self.skip_whitespace().peek() {
262            // Some(')') => self.consume().produce(TKind::RParen), // maybe.
263            // Some(']') => self.consume().produce(TKind::RBrack),
264            Some('}') => self.consume().produce(TKind::RCurly),
265            _ => self.produce(kind),
266        })
267    }
268
269    /// Consumes characters until the lexer reaches a newline `'\n'`
270    pub fn line_comment(&mut self) -> Result<Token, LexError> {
271        let kind = match self.consume().peek() {
272            Some('/') => TKind::OutDoc,
273            Some('!') => TKind::InDoc,
274            _ => TKind::Comment,
275        };
276        while self.consume().peek().is_some_and(|c| c != '\n') {}
277        let (lexeme, _) = self.as_str();
278        let lexeme = lexeme
279            .strip_prefix("///")
280            .or_else(|| lexeme.strip_prefix("//!"))
281            .map(|lexeme| lexeme.strip_prefix(" ").unwrap_or(lexeme))
282            .unwrap_or(lexeme);
283
284        Ok(self.produce_with_lexeme(kind, Lexeme::String(lexeme.into())))
285    }
286
287    /// Consumes characters until the lexer reaches the end of a *nested* block comment.
288    /// This allows you to arbitrarily comment out code, even if that code has a block comment.
289    pub fn block_comment(&mut self) -> Result<&mut Self, LexError> {
290        self.consume();
291        while let Some(c) = self.take() {
292            match (c, self.peek()) {
293                ('/', Some('*')) => self.block_comment()?,
294                ('*', Some('/')) => return Ok(self.consume()),
295                _ => continue,
296            };
297        }
298        Err(self.error(UnterminatedBlockComment))
299    }
300
301    /// Consumes characters until it reaches a character not in [`is_xid_continue`].
302    ///
303    /// Always consumes the first character.
304    ///
305    /// Maps the result to either a [`TKind::Identifier`] or a [`TKind`] keyword.
306    pub fn identifier(&mut self) -> Result<Token, LexError> {
307        while self.consume().peek().is_some_and(is_xid_continue) {}
308        let (lexeme, _span) = self.as_str();
309        let token = self.produce(TKind::Identifier);
310        Ok(Token {
311            kind: match lexeme {
312                "as" => TKind::As,
313                "break" => TKind::Break,
314                "catch" => TKind::Catch,
315                "const" => TKind::Const,
316                "continue" => TKind::Continue,
317                "defer" => TKind::Defer,
318                "do" => TKind::Do,
319                "else" => TKind::Else,
320                "enum" => TKind::Enum,
321                "false" => TKind::False,
322                "fn" => TKind::Fn,
323                "for" => TKind::For,
324                "if" => TKind::If,
325                "impl" => TKind::Impl,
326                "in" => TKind::In,
327                "let" => TKind::Let,
328                "loop" => TKind::Loop,
329                "macro" => TKind::Macro,
330                "match" => TKind::Match,
331                "mod" => TKind::Mod,
332                "mut" => TKind::Mut,
333                "pub" => TKind::Pub,
334                "return" => TKind::Return,
335                "static" => TKind::Static,
336                "struct" => TKind::Struct,
337                "true" => TKind::True,
338                "try" => TKind::Try,
339                "type" => TKind::Type,
340                "use" => TKind::Use,
341                "while" => TKind::While,
342                _ => token.kind,
343            },
344            ..token
345        })
346    }
347
348    /// Eagerly parses a character literal starting at the current lexer position.
349    pub fn character(&mut self) -> Result<Token, LexError> {
350        let c = match self.consume().take() {
351            Some('\\') => self.escape()?,
352            Some(c) => c,
353            None => '\0',
354        };
355        if self.take().is_some_and(|c| c == '\'') {
356            Ok(self.produce_with_lexeme(TKind::Character, Lexeme::Char(c)))
357        } else {
358            Err(self.error(UnterminatedCharacter))
359        }
360    }
361
362    // Eagerly parses a string literal starting at the current lexer position.
363    pub fn string(&mut self) -> Result<Token, LexError> {
364        let mut lexeme = String::new();
365        self.consume();
366        loop {
367            lexeme.push(match self.take() {
368                None => Err(self.error(UnterminatedString))?,
369                Some('\\') => self.escape()?,
370                Some('"') => break,
371                Some(c) => c,
372            });
373        }
374        lexeme.shrink_to_fit();
375        Ok(self.produce_with_lexeme(TKind::String, Lexeme::String(lexeme)))
376    }
377
378    /// Parses a single escape sequence into its resulting char value.
379    pub fn escape(&mut self) -> Result<char, LexError> {
380        Ok(
381            match self.take().ok_or_else(|| self.error(UnexpectedEOF))? {
382                ' ' => '\u{a0}', // Non-breaking space
383                '0' => '\0',     // C0 Null Character
384                'a' => '\x07',   // C0 Acknowledge
385                'b' => '\x08',   // C0 Bell
386                'e' => '\x1b',   // C0 Escape
387                'f' => '\x0c',   // Form Feed
388                'n' => '\n',     // New Line
389                'r' => '\r',     // Carriage Return
390                't' => '\t',     // Tab
391                'u' => self.unicode_escape()?,
392                'x' => self.hex_escape()?,
393                c => c,
394            },
395        )
396    }
397
398    /// Parses two hex-digits and constructs a [char] out of them.
399    pub fn hex_escape(&mut self) -> Result<char, LexError> {
400        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
401        char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)))
402    }
403
404    /// Parses a sequence of `{}`-bracketed hex-digits and constructs a [char] out of them.
405    pub fn unicode_escape(&mut self) -> Result<char, LexError> {
406        self.next_if('{')
407            .ok_or_else(|| self.error(UnterminatedUnicodeEscape))?;
408        let mut out = 0;
409        while let Some(c) = self.take() {
410            if c == '}' {
411                return char::from_u32(out).ok_or_else(|| self.error(InvalidUnicodeEscape(out)));
412            }
413            out = out.saturating_mul(16).saturating_add(
414                c.to_digit(16)
415                    .ok_or_else(|| self.error(InvalidDigitForBase(c, 16)))?,
416            );
417        }
418        Err(self.error(UnterminatedUnicodeEscape))
419    }
420
421    /// Parses a sequence of digits (and underscores) in base `BASE`, where 2 <= `BASE` <= 36.
422    ///
423    /// If the sequence of digits exceeds the bounds of a [u128], the resulting number will wrap
424    /// around 2^128.
425    pub fn digits<const BASE: u32>(&mut self) -> Result<Token, LexError> {
426        let mut int: u128 = 0;
427        while let Some(c) = self.peek() {
428            int = match c.to_digit(BASE).ok_or(c) {
429                Err('_') => int,
430                Ok(c) => int
431                    .checked_mul(BASE as _)
432                    .and_then(|int| int.checked_add(c as _))
433                    .ok_or_else(|| self.error(IntegerOverflow))?,
434                _ => break,
435            };
436            self.consume();
437        }
438
439        Ok(self.produce_with_lexeme(TKind::Integer, Lexeme::Integer(int, BASE)))
440    }
441
442    /// Parses a single digit in base `BASE` as a u32, where 2 <= `BASE` <= 36.
443    pub fn digit<const BASE: u32>(&mut self) -> Result<u32, LexError> {
444        let digit = self.take().ok_or_else(|| self.error(UnexpectedEOF))?;
445        if let Some(digit) = digit.to_digit(BASE) {
446            Ok(digit)
447        } else {
448            Err(self.error(InvalidDigitForBase(digit, BASE)))
449        }
450    }
451}