// cl_lexer/lib.rs

1//! Converts a text file into tokens
2#![warn(clippy::all)]
3#![feature(decl_macro)]
4use cl_structures::span::Loc;
5use cl_token::{TokenKind as Kind, *};
6use std::{
7    iter::Peekable,
8    str::{Chars, FromStr},
9};
10use unicode_ident::*;
11
12#[cfg(test)]
13mod tests;
14
15pub mod lexer_iter {
16    //! Iterator over a [`Lexer`], returning [`LResult<Token>`]s
17    use super::{
18        error::{LResult, Reason},
19        Lexer, Token,
20    };
21
22    /// Iterator over a [`Lexer`], returning [`LResult<Token>`]s
23    pub struct LexerIter<'t> {
24        lexer: Lexer<'t>,
25    }
26    impl Iterator for LexerIter<'_> {
27        type Item = LResult<Token>;
28        fn next(&mut self) -> Option<Self::Item> {
29            match self.lexer.scan() {
30                Ok(v) => Some(Ok(v)),
31                Err(e) => {
32                    if e.reason == Reason::EndOfFile {
33                        None
34                    } else {
35                        Some(Err(e))
36                    }
37                }
38            }
39        }
40    }
41    impl<'t> IntoIterator for Lexer<'t> {
42        type Item = LResult<Token>;
43        type IntoIter = LexerIter<'t>;
44        fn into_iter(self) -> Self::IntoIter {
45            LexerIter { lexer: self }
46        }
47    }
48}
49
/// The Lexer iterates over the characters in a body of text, searching for [Tokens](Token).
///
/// # Examples
/// ```rust
/// # use cl_lexer::Lexer;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // Read in your code from somewhere
/// let some_code = "
/// fn main () {
///     // TODO: code goes here!
/// }
/// ";
/// // Create a lexer over your code
/// let mut lexer = Lexer::new(some_code);
/// // Scan for a single token
/// let first_token = lexer.scan()?;
/// println!("{first_token:?}");
/// // Loop over all the rest of the tokens
/// for token in lexer {
/// #   let token: Result<_,()> = Ok(token?);
///     match token {
///         Ok(token) => println!("{token:?}"),
///         Err(e) => eprintln!("{e:?}"),
///     }
/// }
/// # Ok(()) }
/// ```
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
    /// Peekable stream of characters from the source text
    iter: Peekable<Chars<'t>>,
    /// Offset, in characters, of the start of the token currently being scanned
    start: usize,
    /// (line, column) of `start`; both counters are 1-indexed
    start_loc: (u32, u32),
    /// Offset, in characters, one past the most recently consumed character
    current: usize,
    /// (line, column) of `current`; both counters are 1-indexed
    current_loc: (u32, u32),
}
85
impl<'t> Lexer<'t> {
    /// Creates a new [Lexer] over a [str]
    pub fn new(text: &'t str) -> Self {
        Self {
            iter: text.chars().peekable(),
            start: 0,
            start_loc: (1, 1),
            current: 0,
            current_loc: (1, 1),
        }
    }
    /// Scans through the text, searching for the next [Token]
    ///
    /// Dispatches on the first non-whitespace character; multi-character
    /// operators, comments, and literals are finished by helper methods.
    pub fn scan(&mut self) -> LResult<Token> {
        match self.skip_whitespace().peek()? {
            '{' => self.consume()?.produce_op(Kind::LCurly),
            '}' => self.consume()?.produce_op(Kind::RCurly),
            '[' => self.consume()?.produce_op(Kind::LBrack),
            ']' => self.consume()?.produce_op(Kind::RBrack),
            '(' => self.consume()?.produce_op(Kind::LParen),
            ')' => self.consume()?.produce_op(Kind::RParen),
            '&' => self.consume()?.amp(),
            '@' => self.consume()?.produce_op(Kind::At),
            '\\' => self.consume()?.produce_op(Kind::Backslash),
            '!' => self.consume()?.bang(),
            '|' => self.consume()?.bar(),
            ':' => self.consume()?.colon(),
            ',' => self.consume()?.produce_op(Kind::Comma),
            '.' => self.consume()?.dot(),
            '=' => self.consume()?.equal(),
            '`' => self.consume()?.produce_op(Kind::Grave),
            '>' => self.consume()?.greater(),
            '#' => self.consume()?.hash(),
            '<' => self.consume()?.less(),
            '-' => self.consume()?.minus(),
            '+' => self.consume()?.plus(),
            '?' => self.consume()?.produce_op(Kind::Question),
            '%' => self.consume()?.rem(),
            ';' => self.consume()?.produce_op(Kind::Semi),
            '/' => self.consume()?.slash(),
            '*' => self.consume()?.star(),
            '~' => self.consume()?.produce_op(Kind::Tilde),
            '^' => self.consume()?.xor(),
            '0' => self.consume()?.int_with_base(),
            '1'..='9' => self.digits::<10>(),
            '"' => self.consume()?.string(),
            '\'' => self.consume()?.character(),
            '_' => self.identifier(),
            i if is_xid_start(i) => self.identifier(),
            e => {
                // Build the error first (so it reports the pre-consume
                // location), then consume the offending character so the
                // caller can keep scanning after reporting the error.
                let err = Err(Error::unexpected_char(e, self.line(), self.col()));
                let _ = self.consume();
                err
            }
        }
    }
    /// Returns the line (1-indexed) where the current token begins
    pub fn line(&self) -> u32 {
        self.start_loc.0
    }
    /// Returns the column (1-indexed) where the current token begins
    pub fn col(&self) -> u32 {
        self.start_loc.1
    }
    /// Consumes and returns the next character, or an end-of-file [Error]
    fn next(&mut self) -> LResult<char> {
        let out = self.peek();
        self.consume()?;
        out
    }
    /// Returns the next character without consuming it
    fn peek(&mut self) -> LResult<char> {
        self.iter
            .peek()
            .copied()
            .ok_or(Error::end_of_file(self.line(), self.col()))
    }
    /// Emits a [Token] of the given [Kind] at the stored start location,
    /// then re-anchors the start markers for the next token.
    fn produce(&mut self, kind: Kind, data: impl Into<TokenData>) -> LResult<Token> {
        let loc = self.start_loc;
        self.start_loc = self.current_loc;
        self.start = self.current;
        Ok(Token::new(kind, data, loc.0, loc.1))
    }
    /// [`produce`](Self::produce)s a [Token] with no payload (operators etc.)
    fn produce_op(&mut self, kind: Kind) -> LResult<Token> {
        self.produce(kind, ())
    }
    /// Advances past any whitespace, then re-anchors the token start markers
    fn skip_whitespace(&mut self) -> &mut Self {
        while let Ok(c) = self.peek() {
            if !c.is_whitespace() {
                break;
            }
            let _ = self.consume();
        }
        self.start = self.current;
        self.start_loc = self.current_loc;
        self
    }
    /// Consumes one character, keeping the offset and (line, column)
    /// bookkeeping in sync. Errors at end of input.
    fn consume(&mut self) -> LResult<&mut Self> {
        self.current += 1;
        match self.iter.next() {
            Some('\n') => {
                // Newline: advance the line counter, reset the column to 1
                let (line, col) = &mut self.current_loc;
                *line += 1;
                *col = 1;
            }
            Some(_) => self.current_loc.1 += 1,
            None => Err(Error::end_of_file(self.line(), self.col()))?,
        }
        Ok(self)
    }
}
194/// Digraphs and trigraphs
195impl Lexer<'_> {
196    fn amp(&mut self) -> LResult<Token> {
197        match self.peek() {
198            Ok('&') => self.consume()?.produce_op(Kind::AmpAmp),
199            Ok('=') => self.consume()?.produce_op(Kind::AmpEq),
200            _ => self.produce_op(Kind::Amp),
201        }
202    }
203    fn bang(&mut self) -> LResult<Token> {
204        match self.peek() {
205            Ok('!') => self.consume()?.produce_op(Kind::BangBang),
206            Ok('=') => self.consume()?.produce_op(Kind::BangEq),
207            _ => self.produce_op(Kind::Bang),
208        }
209    }
210    fn bar(&mut self) -> LResult<Token> {
211        match self.peek() {
212            Ok('|') => self.consume()?.produce_op(Kind::BarBar),
213            Ok('=') => self.consume()?.produce_op(Kind::BarEq),
214            _ => self.produce_op(Kind::Bar),
215        }
216    }
217    fn colon(&mut self) -> LResult<Token> {
218        match self.peek() {
219            Ok(':') => self.consume()?.produce_op(Kind::ColonColon),
220            _ => self.produce_op(Kind::Colon),
221        }
222    }
223    fn dot(&mut self) -> LResult<Token> {
224        match self.peek() {
225            Ok('.') => {
226                if let Ok('=') = self.consume()?.peek() {
227                    self.consume()?.produce_op(Kind::DotDotEq)
228                } else {
229                    self.produce_op(Kind::DotDot)
230                }
231            }
232            _ => self.produce_op(Kind::Dot),
233        }
234    }
235    fn equal(&mut self) -> LResult<Token> {
236        match self.peek() {
237            Ok('=') => self.consume()?.produce_op(Kind::EqEq),
238            Ok('>') => self.consume()?.produce_op(Kind::FatArrow),
239            _ => self.produce_op(Kind::Eq),
240        }
241    }
242    fn greater(&mut self) -> LResult<Token> {
243        match self.peek() {
244            Ok('=') => self.consume()?.produce_op(Kind::GtEq),
245            Ok('>') => {
246                if let Ok('=') = self.consume()?.peek() {
247                    self.consume()?.produce_op(Kind::GtGtEq)
248                } else {
249                    self.produce_op(Kind::GtGt)
250                }
251            }
252            _ => self.produce_op(Kind::Gt),
253        }
254    }
255    fn hash(&mut self) -> LResult<Token> {
256        match self.peek() {
257            Ok('!') => self.consume()?.hashbang(),
258            _ => self.produce_op(Kind::Hash),
259        }
260    }
261    fn hashbang(&mut self) -> LResult<Token> {
262        match self.peek() {
263            Ok('/' | '\'') => self.line_comment(),
264            _ => self.produce_op(Kind::HashBang),
265        }
266    }
267    fn less(&mut self) -> LResult<Token> {
268        match self.peek() {
269            Ok('=') => self.consume()?.produce_op(Kind::LtEq),
270            Ok('<') => {
271                if let Ok('=') = self.consume()?.peek() {
272                    self.consume()?.produce_op(Kind::LtLtEq)
273                } else {
274                    self.produce_op(Kind::LtLt)
275                }
276            }
277            _ => self.produce_op(Kind::Lt),
278        }
279    }
280    fn minus(&mut self) -> LResult<Token> {
281        match self.peek() {
282            Ok('=') => self.consume()?.produce_op(Kind::MinusEq),
283            Ok('>') => self.consume()?.produce_op(Kind::Arrow),
284            _ => self.produce_op(Kind::Minus),
285        }
286    }
287    fn plus(&mut self) -> LResult<Token> {
288        match self.peek() {
289            Ok('=') => self.consume()?.produce_op(Kind::PlusEq),
290            _ => self.produce_op(Kind::Plus),
291        }
292    }
293    fn rem(&mut self) -> LResult<Token> {
294        match self.peek() {
295            Ok('=') => self.consume()?.produce_op(Kind::RemEq),
296            _ => self.produce_op(Kind::Rem),
297        }
298    }
299    fn slash(&mut self) -> LResult<Token> {
300        match self.peek() {
301            Ok('=') => self.consume()?.produce_op(Kind::SlashEq),
302            Ok('/') => self.consume()?.line_comment(),
303            Ok('*') => self.consume()?.block_comment(),
304            _ => self.produce_op(Kind::Slash),
305        }
306    }
307    fn star(&mut self) -> LResult<Token> {
308        match self.peek() {
309            Ok('=') => self.consume()?.produce_op(Kind::StarEq),
310            _ => self.produce_op(Kind::Star),
311        }
312    }
313    fn xor(&mut self) -> LResult<Token> {
314        match self.peek() {
315            Ok('=') => self.consume()?.produce_op(Kind::XorEq),
316            Ok('^') => self.consume()?.produce_op(Kind::XorXor),
317            _ => self.produce_op(Kind::Xor),
318        }
319    }
320}
321/// Comments
322impl Lexer<'_> {
323    fn line_comment(&mut self) -> LResult<Token> {
324        let mut comment = String::new();
325        while Ok('\n') != self.peek() {
326            comment.push(self.next()?);
327        }
328        self.produce(Kind::Comment, comment)
329    }
330    fn block_comment(&mut self) -> LResult<Token> {
331        let mut comment = String::new();
332        while let Ok(c) = self.next() {
333            if '*' == c && Ok('/') == self.peek() {
334                break;
335            }
336            comment.push(c);
337        }
338        self.consume()?.produce(Kind::Comment, comment)
339    }
340}
341/// Identifiers
342impl Lexer<'_> {
343    fn identifier(&mut self) -> LResult<Token> {
344        let mut out = String::from(self.xid_start()?);
345        while let Ok(c) = self.xid_continue() {
346            out.push(c)
347        }
348        if let Ok(keyword) = Kind::from_str(&out) {
349            self.produce(keyword, ())
350        } else {
351            self.produce(Kind::Identifier, TokenData::String(out))
352        }
353    }
354    fn xid_start(&mut self) -> LResult<char> {
355        match self.peek()? {
356            xid if xid == '_' || is_xid_start(xid) => {
357                self.consume()?;
358                Ok(xid)
359            }
360            bad => Err(Error::not_identifier(bad, self.line(), self.col())),
361        }
362    }
363    fn xid_continue(&mut self) -> LResult<char> {
364        match self.peek()? {
365            xid if is_xid_continue(xid) => {
366                self.consume()?;
367                Ok(xid)
368            }
369            bad => Err(Error::not_identifier(bad, self.line(), self.col())),
370        }
371    }
372}
373/// Integers
374impl Lexer<'_> {
375    fn int_with_base(&mut self) -> LResult<Token> {
376        match self.peek() {
377            Ok('x') => self.consume()?.digits::<16>(),
378            Ok('d') => self.consume()?.digits::<10>(),
379            Ok('o') => self.consume()?.digits::<8>(),
380            Ok('b') => self.consume()?.digits::<2>(),
381            Ok('0'..='9' | '.') => self.digits::<10>(),
382            _ => self.produce(Kind::Literal, 0),
383        }
384    }
385    fn digits<const B: u32>(&mut self) -> LResult<Token> {
386        let mut value = 0;
387        while let Ok(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
388            value = value * B as u128 + self.digit::<B>()? as u128;
389        }
390        // TODO: find a better way to handle floats in the tokenizer
391        match self.peek() {
392            Ok('.') => {
393                // FIXME: hack: 0.. is not [0.0, '.']
394                if let Ok('.') = self.clone().consume()?.next() {
395                    return self.produce(Kind::Literal, value);
396                }
397                let mut float = format!("{value}.");
398                self.consume()?;
399                while let Ok(true) = self.peek().as_ref().map(char::is_ascii_digit) {
400                    float.push(self.iter.next().unwrap_or_default());
401                }
402                let float = f64::from_str(&float).expect("must be parsable as float");
403                self.produce(Kind::Literal, float)
404            }
405            _ => self.produce(Kind::Literal, value),
406        }
407    }
408    fn digit<const B: u32>(&mut self) -> LResult<u32> {
409        let digit = self.peek()?;
410        self.consume()?;
411        digit
412            .to_digit(B)
413            .ok_or(Error::invalid_digit(digit, self.line(), self.col()))
414    }
415}
416/// Strings and characters
417impl Lexer<'_> {
418    fn string(&mut self) -> LResult<Token> {
419        let mut value = String::new();
420        while '"'
421            != self
422                .peek()
423                .map_err(|e| e.mask_reason(Reason::UnmatchedDelimiters('"')))?
424        {
425            value.push(self.unescape()?)
426        }
427        self.consume()?.produce(Kind::Literal, value)
428    }
429    fn character(&mut self) -> LResult<Token> {
430        let out = self.unescape()?;
431        match self.peek()? {
432            '\'' => self.consume()?.produce(Kind::Literal, out),
433            _ => Err(Error::unmatched_delimiters('\'', self.line(), self.col())),
434        }
435    }
436    /// Unescape a single character
437    fn unescape(&mut self) -> LResult<char> {
438        match self.next() {
439            Ok('\\') => (),
440            other => return other,
441        }
442        Ok(match self.next()? {
443            'a' => '\x07',
444            'b' => '\x08',
445            'f' => '\x0c',
446            'n' => '\n',
447            'r' => '\r',
448            't' => '\t',
449            'x' => self.hex_escape()?,
450            'u' => self.unicode_escape()?,
451            '0' => '\0',
452            chr => chr,
453        })
454    }
455    /// unescape a single 2-digit hex escape
456    fn hex_escape(&mut self) -> LResult<char> {
457        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
458        char::from_u32(out).ok_or(Error::bad_unicode(out, self.line(), self.col()))
459    }
460    /// unescape a single \u{} unicode escape
461    fn unicode_escape(&mut self) -> LResult<char> {
462        let mut out = 0;
463        let Ok('{') = self.peek() else {
464            return Err(Error::invalid_escape('u', self.line(), self.col()));
465        };
466        self.consume()?;
467        while let Ok(c) = self.peek() {
468            match c {
469                '}' => {
470                    self.consume()?;
471                    return char::from_u32(out).ok_or(Error::bad_unicode(
472                        out,
473                        self.line(),
474                        self.col(),
475                    ));
476                }
477                _ => out = (out << 4) + self.digit::<16>()?,
478            }
479        }
480        Err(Error::invalid_escape('u', self.line(), self.col()))
481    }
482}
483
484impl<'t> From<&Lexer<'t>> for Loc {
485    fn from(value: &Lexer<'t>) -> Self {
486        Loc(value.line(), value.col())
487    }
488}
489
490use error::{Error, LResult, Reason};
pub mod error {
    //! [Error] type for the [Lexer](super::Lexer)
    use std::fmt::Display;

    /// Result type with [Err] = [Error]
    pub type LResult<T> = Result<T, Error>;
    /// A lexical error, tagged with the location in the input where it occurred
    #[derive(Clone, Debug, PartialEq, Eq)]
    pub struct Error {
        /// The [Reason] this error occurred
        pub reason: Reason,
        /// The line (1-indexed) where the error occurred
        pub line: u32,
        /// The column (1-indexed) where the error occurred
        pub col: u32,
    }
    /// The reason for the [Error]
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub enum Reason {
        /// Found an opening delimiter of type [char], but not the expected closing delimiter
        UnmatchedDelimiters(char),
        /// Found a character that doesn't belong to any [TokenKind](cl_token::TokenKind)
        UnexpectedChar(char),
        /// Found a character that's not valid in identifiers while looking for an identifier
        NotIdentifier(char),
        /// Found a character that's not valid in an escape sequence while looking for an escape
        /// sequence
        UnknownEscape(char),
        /// Escape sequence contains invalid hexadecimal digit or unmatched braces
        InvalidEscape(char),
        /// Character is not a valid digit in the requested base
        InvalidDigit(char),
        /// Base conversion requested, but the base character was not in the set of known
        /// characters
        UnknownBase(char),
        /// Unicode escape does not map to a valid unicode code-point
        BadUnicode(u32),
        /// Reached end of input
        EndOfFile,
    }
    // Generates one located constructor on [Error] per rule below;
    // see the `error_impl` macro definition further down for the shape.
    error_impl! {
        unmatched_delimiters(c: char) => Reason::UnmatchedDelimiters(c),
        unexpected_char(c: char) => Reason::UnexpectedChar(c),
        not_identifier(c: char) => Reason::NotIdentifier(c),
        unknown_escape(e: char) => Reason::UnknownEscape(e),
        invalid_escape(e: char) => Reason::InvalidEscape(e),
        invalid_digit(digit: char) => Reason::InvalidDigit(digit),
        unknown_base(base: char) => Reason::UnknownBase(base),
        bad_unicode(value: u32) => Reason::BadUnicode(value),
        end_of_file => Reason::EndOfFile,
    }
    impl Error {
        /// Changes the [Reason] of this error, keeping its location
        pub(super) fn mask_reason(self, reason: Reason) -> Self {
            Self { reason, ..self }
        }
        /// Returns the [Reason] for this error
        pub fn reason(&self) -> &Reason {
            &self.reason
        }
        /// Returns the (line, col) where the error happened
        pub fn location(&self) -> (u32, u32) {
            (self.line, self.col)
        }
    }
    /// For each `name(payload…) => reason` rule, generates a `pub(super)`
    /// constructor `Error::name(payload…, line, col)` building that [Reason]
    /// at the given location.
    macro error_impl ($($fn:ident$(( $($p:ident: $t:ty),* ))? => $reason:expr),*$(,)?) {
        #[allow(dead_code)]
        impl Error {
            $(pub(super) fn $fn ($($($p: $t),*,)? line: u32, col: u32) -> Self {
                Self { reason: $reason, line, col }
            })*
        }
    }
    impl std::error::Error for Error {}
    impl Display for Error {
        // Formats as `line:col: reason`
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "{}:{}: {}", self.line, self.col, self.reason)
        }
    }
    impl Display for Reason {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            match self {
                Reason::UnmatchedDelimiters(c) => write! {f, "Unmatched `{c}` in input"},
                Reason::UnexpectedChar(c) => write!(f, "Character `{c}` not expected"),
                Reason::NotIdentifier(c) => write!(f, "Character `{c}` not valid in identifiers"),
                Reason::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
                Reason::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
                Reason::InvalidDigit(c) => write!(f, "`{c}` is not a valid digit"),
                Reason::UnknownBase(c) => write!(f, "`0{c}`... is not a valid base"),
                Reason::BadUnicode(c) => write!(f, "`{c}` is not a valid unicode code-point"),
                Reason::EndOfFile => write!(f, "Reached end of input"),
            }
        }
    }
}