cl_lexer/
lib.rs

//! Converts a text file into tokens
#![warn(clippy::all)]
#![feature(decl_macro)]
use cl_structures::span::Loc;
use cl_token::{TokenKind as Kind, *};
use std::{
    iter::Peekable,
    str::{CharIndices, FromStr},
};
use unicode_ident::*;

#[cfg(test)]
mod tests;

pub mod lexer_iter {
    //! Iterator over a [`Lexer`], returning [`LResult<Token>`]s
    use super::{
        Lexer, Token,
        error::{LResult, Reason},
    };

    /// Iterator over a [`Lexer`], returning [`LResult<Token>`]s
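    ///
    /// A minimal sketch of collecting every token (iteration ends at end-of-file;
    /// any other error surfaces as an [`Err`]):
    /// ```rust
    /// # use cl_lexer::Lexer;
    /// // Collect all tokens, stopping at the first real error.
    /// let tokens: Result<Vec<_>, _> = Lexer::new("fn main () {}").into_iter().collect();
    /// assert!(tokens.is_ok());
    /// ```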
    pub struct LexerIter<'t> {
        lexer: Lexer<'t>,
    }
    impl Iterator for LexerIter<'_> {
        type Item = LResult<Token>;
        fn next(&mut self) -> Option<Self::Item> {
            match self.lexer.scan() {
                Ok(v) => Some(Ok(v)),
                Err(e) => {
                    if e.reason == Reason::EndOfFile {
                        None
                    } else {
                        Some(Err(e))
                    }
                }
            }
        }
    }
    impl<'t> IntoIterator for Lexer<'t> {
        type Item = LResult<Token>;
        type IntoIter = LexerIter<'t>;
        fn into_iter(self) -> Self::IntoIter {
            LexerIter { lexer: self }
        }
    }
}

/// The Lexer iterates over the characters in a body of text, searching for [Tokens](Token).
///
/// # Examples
/// ```rust
/// # use cl_lexer::Lexer;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Read in your code from somewhere
/// let some_code = "
/// fn main () {
///     // TODO: code goes here!
/// }
/// ";
/// // Create a lexer over your code
/// let mut lexer = Lexer::new(some_code);
/// // Scan for a single token
/// let first_token = lexer.scan()?;
/// println!("{first_token:?}");
/// // Loop over all the rest of the tokens
/// for token in lexer {
/// #   let token: Result<_,()> = Ok(token?);
///     match token {
///         Ok(token) => println!("{token:?}"),
///         Err(e) => eprintln!("{e:?}"),
///     }
/// }
/// # Ok(()) }
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
    /// The source text
    text: &'t str,
    /// A peekable iterator over the source text
    iter: Peekable<CharIndices<'t>>,
    /// The end of the current token
    head: usize,
    /// The (line, col) end of the current token
    head_loc: (u32, u32),
    /// The start of the current token
    tail: usize,
    /// The (line, col) start of the current token
    tail_loc: (u32, u32),
}

impl<'t> Lexer<'t> {
    /// Creates a new [Lexer] over a [str]
    pub fn new(text: &'t str) -> Self {
        Self {
            text,
            iter: text.char_indices().peekable(),
            head: 0,
            head_loc: (1, 1),
            tail: 0,
            tail_loc: (1, 1),
        }
    }

    /// Returns the current line
    pub fn line(&self) -> u32 {
        self.tail_loc.0
    }

    /// Returns the current column
    pub fn col(&self) -> u32 {
        self.tail_loc.1
    }

    /// Returns the current token's lexeme
    fn lexeme(&mut self) -> &'t str {
        &self.text[self.tail..self.head]
    }

    /// Peeks the next character without advancing the lexer
    fn peek(&mut self) -> Option<char> {
        self.iter.peek().map(|(_, c)| *c)
    }

    /// Advances the head (the end of the current token), tracking line and column
    fn advance_tail(&mut self) {
        let (idx, c) = self.iter.peek().copied().unwrap_or((self.text.len(), '\0'));
        let (line, col) = &mut self.head_loc;
        let diff = idx - self.head;

        self.head = idx;
        match c {
            '\n' => {
                *line += 1;
                *col = 1;
            }
            _ => *col += diff as u32,
        }
    }
    /// Takes the next character (the last-peeked one, if any), advancing the head.
    pub fn take(&mut self) -> Option<char> {
        let (_, c) = self.iter.next()?;
        self.advance_tail();
        Some(c)
    }

    /// Takes the next char if it matches the `expected` char
    pub fn next_if(&mut self, expected: char) -> Option<char> {
        let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
        self.advance_tail();
        Some(c)
    }
    /// Consumes the next character without returning it, advancing the head
    pub fn consume(&mut self) -> &mut Self {
        self.iter.next();
        self.advance_tail();
        self
    }

    /// Produces an [Error] at the start of the current token
    fn error(&self, reason: Reason) -> Error {
        Error { reason, line: self.line(), col: self.col() }
    }

    /// Produces a token with the current [lexeme](Lexer::lexeme) as its data
    fn produce(&mut self, kind: Kind) -> LResult<Token> {
        let lexeme = self.lexeme().to_owned();
        self.produce_with(kind, lexeme)
    }

    /// Produces a token with the provided `data`
    fn produce_with(&mut self, kind: Kind, data: impl Into<TokenData>) -> LResult<Token> {
        let loc = self.tail_loc;
        self.tail_loc = self.head_loc;
        self.tail = self.head;
        Ok(Token::new(kind, data, loc.0, loc.1))
    }

    /// Produces a token with no `data`
    fn produce_op(&mut self, kind: Kind) -> LResult<Token> {
        self.produce_with(kind, ())
    }
    /// Consumes zero or more whitespace characters
    fn skip_whitespace(&mut self) -> &mut Self {
        while self.peek().is_some_and(char::is_whitespace) {
            let _ = self.consume();
        }
        self
    }

    /// Starts a new token
    fn start_token(&mut self) -> &mut Self {
        self.tail_loc = self.head_loc;
        self.tail = self.head;
        self
    }

    /// Scans through the text, searching for the next [Token]
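    ///
    /// A small sketch of digraph/trigraph and base-prefix handling (token kinds
    /// come from `cl_token`, so this just debug-prints them):
    /// ```rust
    /// # use cl_lexer::Lexer;
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let mut lexer = Lexer::new("+= >>= 0x1F");
    /// println!("{:?}", lexer.scan()?); // `+=` is a single digraph token
    /// println!("{:?}", lexer.scan()?); // `>>=` is a single trigraph token
    /// println!("{:?}", lexer.scan()?); // `0x1F` is one hexadecimal Literal
    /// # Ok(()) }
    /// ```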
    pub fn scan(&mut self) -> LResult<Token> {
        use TokenKind::*;
        // !"#%&'()*+,-./:;<=>?@[\\]^`{|}~
        let tok = match self
            .skip_whitespace()
            .start_token()
            .peek()
            .ok_or_else(|| self.error(Reason::EndOfFile))?
        {
            '!' => Bang,
            '"' => return self.string(),
            '#' => Hash,
            '%' => Rem,
            '&' => Amp,
            '\'' => return self.character(),
            '(' => LParen,
            ')' => RParen,
            '*' => Star,
            '+' => Plus,
            ',' => Comma,
            '-' => Minus,
            '.' => Dot,
            '/' => Slash,
            '0' => Literal,
            '1'..='9' => return self.digits::<10>(),
            ':' => Colon,
            ';' => Semi,
            '<' => Lt,
            '=' => Eq,
            '>' => Gt,
            '?' => Question,
            '@' => At,
            '[' => LBrack,
            '\\' => Backslash,
            ']' => RBrack,
            '^' => Xor,
            '`' => Grave,
            '{' => LCurly,
            '|' => Bar,
            '}' => RCurly,
            '~' => Tilde,
            '_' => return self.identifier(),
            c if is_xid_start(c) => return self.identifier(),
            e => {
                let err = Err(self.error(Reason::UnexpectedChar(e)));
                let _ = self.consume();
                err?
            }
        };

        // Handle digraphs
        let tok = match (tok, self.consume().peek()) {
            (Literal, Some('b')) => return self.consume().digits::<2>(),
            (Literal, Some('d')) => return self.consume().digits::<10>(),
            (Literal, Some('o')) => return self.consume().digits::<8>(),
            (Literal, Some('x')) => return self.consume().digits::<16>(),
            (Literal, Some('~')) => return self.consume().digits::<36>(),
            (Literal, _) => return self.digits::<10>(),
            (Amp, Some('&')) => AmpAmp,
            (Amp, Some('=')) => AmpEq,
            (Bang, Some('!')) => BangBang,
            (Bang, Some('=')) => BangEq,
            (Bar, Some('|')) => BarBar,
            (Bar, Some('=')) => BarEq,
            (Colon, Some(':')) => ColonColon,
            (Dot, Some('.')) => DotDot,
            (Eq, Some('=')) => EqEq,
            (Eq, Some('>')) => FatArrow,
            (Gt, Some('=')) => GtEq,
            (Gt, Some('>')) => GtGt,
            (Hash, Some('!')) => HashBang,
            (Lt, Some('=')) => LtEq,
            (Lt, Some('<')) => LtLt,
            (Minus, Some('=')) => MinusEq,
            (Minus, Some('>')) => Arrow,
            (Plus, Some('=')) => PlusEq,
            (Rem, Some('=')) => RemEq,
            (Slash, Some('*')) => return self.block_comment()?.produce(Kind::Comment),
            (Slash, Some('/')) => return self.line_comment(),
            (Slash, Some('=')) => SlashEq,
            (Star, Some('=')) => StarEq,
            (Xor, Some('=')) => XorEq,
            (Xor, Some('^')) => XorXor,
            _ => return self.produce_op(tok),
        };

        // Handle trigraphs
        let tok = match (tok, self.consume().peek()) {
            (HashBang, Some('/')) => return self.line_comment(),
            (DotDot, Some('=')) => DotDotEq,
            (GtGt, Some('=')) => GtGtEq,
            (LtLt, Some('=')) => LtLtEq,
            _ => return self.produce_op(tok),
        };

        self.consume().produce_op(tok)
    }
}

/// Comments
impl Lexer<'_> {
    /// Consumes until the next newline '\n', producing a [Comment](Kind::Comment)
    fn line_comment(&mut self) -> LResult<Token> {
        while self.consume().peek().is_some_and(|c| c != '\n') {}
        self.produce(Kind::Comment)
    }

    /// Consumes nested block-comments. Does not produce by itself.
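    ///
    /// Block comments nest, so the whole of `/* outer /* inner */ still outer */`
    /// is consumed as one [Comment](Kind::Comment). A sketch via [Lexer::scan]:
    /// ```rust
    /// # use cl_lexer::Lexer;
    /// let mut lexer = Lexer::new("/* outer /* inner */ still outer */");
    /// assert!(lexer.scan().is_ok()); // one Comment token spanning the whole input
    /// ```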
    fn block_comment(&mut self) -> LResult<&mut Self> {
        self.consume();
        while let Some(c) = self.take() {
            match (c, self.peek()) {
                ('/', Some('*')) => self.block_comment()?,
                ('*', Some('/')) => return Ok(self.consume()),
                _ => continue,
            };
        }
        Err(self.error(Reason::UnmatchedDelimiters('/')))
    }
}

/// Identifiers
impl Lexer<'_> {
    /// Produces an [Identifier](Kind::Identifier) or keyword
    fn identifier(&mut self) -> LResult<Token> {
        while self.consume().peek().is_some_and(is_xid_continue) {}
        if let Ok(keyword) = Kind::from_str(self.lexeme()) {
            self.produce_with(keyword, ())
        } else {
            self.produce(Kind::Identifier)
        }
    }
}

/// Integers
impl Lexer<'_> {
    /// Produces a [Literal](Kind::Literal) with an integer or float value.
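    ///
    /// [Lexer::scan] dispatches here after stripping any base prefix, so e.g.
    /// `0x1F` arrives as `digits::<16>` over `1F` (value 31), and the `0~`
    /// prefix selects base 36.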
    fn digits<const B: u32>(&mut self) -> LResult<Token> {
        let mut value = 0;
        while let Some(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
            value = value * B as u128 + self.digit::<B>()? as u128;
        }
        // TODO: find a better way to handle floats in the tokenizer
        match self.peek() {
            Some('.') => {
                // FIXME: hack: 0.. is not [0.0, '.']
                if let Some('.') = self.clone().consume().take() {
                    return self.produce_with(Kind::Literal, value);
                }
                let mut float = format!("{value}.");
                self.consume();
                while let Some(true) = self.peek().as_ref().map(char::is_ascii_digit) {
                    // `take` (rather than raw `iter.next()`) keeps the head and line/col in sync
                    float.push(self.take().unwrap_or_default());
                }
                let float = f64::from_str(&float).expect("must be parsable as float");
                self.produce_with(Kind::Literal, float)
            }
            _ => self.produce_with(Kind::Literal, value),
        }
    }

    /// Consumes a single digit in base `B`
    fn digit<const B: u32>(&mut self) -> LResult<u32> {
        let digit = self.take().ok_or_else(|| self.error(Reason::EndOfFile))?;
        digit
            .to_digit(B)
            .ok_or_else(|| self.error(Reason::InvalidDigit(digit)))
    }
}

/// Strings and characters
impl Lexer<'_> {
    /// Produces a [Literal](Kind::Literal) with a pre-escaped [String]
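    ///
    /// A sketch, assuming the lexer is positioned at an opening `"`:
    /// ```rust
    /// # use cl_lexer::Lexer;
    /// let mut lexer = Lexer::new(r#""hello\n""#);
    /// // Escapes are resolved here, so the token's data holds a real newline.
    /// assert!(lexer.string().is_ok());
    /// ```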
    pub fn string(&mut self) -> LResult<Token> {
        let mut lexeme = String::new();
        let mut depth = 0;
        self.consume();
        loop {
            lexeme.push(match self.take() {
                None => Err(self.error(Reason::UnmatchedDelimiters('"')))?,
                Some('\\') => self.unescape()?,
                Some('"') if depth == 0 => break,
                Some(c @ '{') => {
                    depth += 1;
                    c
                }
                Some(c @ '}') => {
                    depth -= 1;
                    c
                }
                Some(c) => c,
            })
        }
        lexeme.shrink_to_fit();
        self.produce_with(Kind::Literal, lexeme)
    }

    /// Produces a [Literal](Kind::Literal) with a pre-escaped [char]
    fn character(&mut self) -> LResult<Token> {
        let c = match self.consume().take() {
            Some('\\') => self.unescape()?,
            Some(c) => c,
            None => '\0',
        };
        if self.take().is_some_and(|c| c == '\'') {
            self.produce_with(Kind::Literal, c)
        } else {
            Err(self.error(Reason::UnmatchedDelimiters('\'')))
        }
    }

    /// Unescapes a single character
    #[rustfmt::skip]
    fn unescape(&mut self) -> LResult<char> {
        Ok(match self.take().ok_or_else(|| self.error(Reason::EndOfFile))? {
            ' ' => '\u{a0}',
            '0' => '\0',
            'a' => '\x07',
            'b' => '\x08',
            'e' => '\x1b',
            'f' => '\x0c',
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            'u' => self.unicode_escape()?,
            'x' => self.hex_escape()?,
            chr => chr,
        })
    }
    /// Unescapes a single 2-digit hex escape
    fn hex_escape(&mut self) -> LResult<char> {
        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
        char::from_u32(out).ok_or_else(|| self.error(Reason::BadUnicode(out)))
    }

    /// Unescapes a single \u{} unicode escape
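    ///
    /// A sketch, assuming the leading `\u` has already been consumed:
    /// ```rust
    /// # use cl_lexer::Lexer;
    /// let mut lexer = Lexer::new("{1f600}");
    /// assert_eq!(lexer.unicode_escape().ok(), Some('\u{1f600}'));
    /// ```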
    pub fn unicode_escape(&mut self) -> LResult<char> {
        self.next_if('{')
            .ok_or_else(|| self.error(Reason::InvalidEscape('u')))?;
        let mut out = 0;
        while let Some(c) = self.take() {
            if c == '}' {
                return char::from_u32(out).ok_or_else(|| self.error(Reason::BadUnicode(out)));
            }
            out = out * 16
                + c.to_digit(16)
                    .ok_or_else(|| self.error(Reason::InvalidDigit(c)))?;
        }
        Err(self.error(Reason::UnmatchedDelimiters('}')))
    }
}

impl<'t> From<&Lexer<'t>> for Loc {
    fn from(value: &Lexer<'t>) -> Self {
        Loc(value.line(), value.col())
    }
}

use error::{Error, LResult, Reason};
pub mod error {
    //! [Error] type for the [Lexer](super::Lexer)
    use std::fmt::Display;

    /// Result type with [Err] = [Error]
    pub type LResult<T> = Result<T, Error>;
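    /// An error produced by the [Lexer](super::Lexer), tagged with the
    /// (line, col) at which it occurred.
    ///
    /// A sketch of the rendered form (format per the [Display] impl below):
    /// ```rust
    /// # use cl_lexer::error::{Error, Reason};
    /// let err = Error { reason: Reason::EndOfFile, line: 1, col: 5 };
    /// assert_eq!(err.to_string(), "1:5: Reached end of input");
    /// ```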
    #[derive(Clone, Debug, PartialEq, Eq)]
    pub struct Error {
        pub reason: Reason,
        pub line: u32,
        pub col: u32,
    }
    /// The reason for the [Error]
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub enum Reason {
        /// Found an opening delimiter of type [char], but not the expected closing delimiter
        UnmatchedDelimiters(char),
        /// Found a character that doesn't belong to any [TokenKind](cl_token::TokenKind)
        UnexpectedChar(char),
        /// Found a character that's not valid in an escape sequence while looking for an escape
        /// sequence
        UnknownEscape(char),
        /// Escape sequence contains invalid hexadecimal digit or unmatched braces
        InvalidEscape(char),
        /// Character is not a valid digit in the requested base
        InvalidDigit(char),
        /// Unicode escape does not map to a valid unicode code-point
        BadUnicode(u32),
        /// Reached end of input
        EndOfFile,
    }
    impl Error {
        /// Returns the [Reason] for this error
        pub fn reason(&self) -> &Reason {
            &self.reason
        }
        /// Returns the (line, col) where the error happened
        pub fn location(&self) -> (u32, u32) {
            (self.line, self.col)
        }
    }
    impl std::error::Error for Error {}
    impl Display for Error {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "{}:{}: {}", self.line, self.col, self.reason)
        }
    }
    impl Display for Reason {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            match self {
                Reason::UnmatchedDelimiters(c) => write!(f, "Unmatched `{c:?}` in input"),
                Reason::UnexpectedChar(c) => write!(f, "Character `{c:?}` not expected"),
                Reason::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
                Reason::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
                Reason::InvalidDigit(c) => write!(f, "`{c:?}` is not a valid digit"),
                Reason::BadUnicode(c) => write!(f, "`\\u{{{c:x}}}` is not valid unicode"),
                Reason::EndOfFile => write!(f, "Reached end of input"),
            }
        }
    }
}