1#![warn(clippy::all)]
3#![feature(decl_macro)]
4use cl_structures::span::Loc;
5use cl_token::{TokenKind as Kind, *};
6use std::{
7    iter::Peekable,
8    str::{CharIndices, FromStr},
9};
10use unicode_ident::*;
11
12#[cfg(test)]
13mod tests;
14
15pub mod lexer_iter {
16    use super::{
18        Lexer, Token,
19        error::{LResult, Reason},
20    };
21
22    pub struct LexerIter<'t> {
24        lexer: Lexer<'t>,
25    }
26    impl Iterator for LexerIter<'_> {
27        type Item = LResult<Token>;
28        fn next(&mut self) -> Option<Self::Item> {
29            match self.lexer.scan() {
30                Ok(v) => Some(Ok(v)),
31                Err(e) => {
32                    if e.reason == Reason::EndOfFile {
33                        None
34                    } else {
35                        Some(Err(e))
36                    }
37                }
38            }
39        }
40    }
41    impl<'t> IntoIterator for Lexer<'t> {
42        type Item = LResult<Token>;
43        type IntoIter = LexerIter<'t>;
44        fn into_iter(self) -> Self::IntoIter {
45            LexerIter { lexer: self }
46        }
47    }
48}
49
/// Scans a borrowed source string, tracking both where the current token
/// starts (the "tail") and how far scanning has progressed (the "head").
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
    /// The source text being scanned.
    text: &'t str,
    /// Peekable iterator over the `(byte index, char)` pairs of `text`.
    iter: Peekable<CharIndices<'t>>,
    /// Byte index of the scan head; the end of the current lexeme.
    head: usize,
    /// `(line, column)` tracked alongside `head`. Starts at `(1, 1)`.
    head_loc: (u32, u32),
    /// Byte index at which the current token starts.
    tail: usize,
    /// `(line, column)` at which the current token starts. Starts at `(1, 1)`.
    tail_loc: (u32, u32),
}
92
93impl<'t> Lexer<'t> {
94    pub fn new(text: &'t str) -> Self {
96        Self {
97            text,
98            iter: text.char_indices().peekable(),
99            head: 0,
100            head_loc: (1, 1),
101            tail: 0,
102            tail_loc: (1, 1),
103        }
104    }
105
106    pub fn line(&self) -> u32 {
108        self.tail_loc.0
109    }
110
111    pub fn col(&self) -> u32 {
113        self.tail_loc.1
114    }
115
116    fn lexeme(&mut self) -> &'t str {
118        &self.text[self.tail..self.head]
119    }
120
121    fn peek(&mut self) -> Option<char> {
123        self.iter.peek().map(|(_, c)| *c)
124    }
125
126    fn advance_tail(&mut self) {
128        let (idx, c) = self.iter.peek().copied().unwrap_or((self.text.len(), '\0'));
129        let (line, col) = &mut self.head_loc;
130        let diff = idx - self.head;
131
132        self.head = idx;
133        match c {
134            '\n' => {
135                *line += 1;
136                *col = 1;
137            }
138            _ => *col += diff as u32,
139        }
140    }
141
142    pub fn take(&mut self) -> Option<char> {
144        let (_, c) = self.iter.next()?;
145        self.advance_tail();
146        Some(c)
147    }
148
149    pub fn next_if(&mut self, expected: char) -> Option<char> {
151        let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
152        self.advance_tail();
153        Some(c)
154    }
155
156    pub fn consume(&mut self) -> &mut Self {
158        self.iter.next();
159        self.advance_tail();
160        self
161    }
162
163    fn error(&self, reason: Reason) -> Error {
165        Error { reason, line: self.line(), col: self.col() }
166    }
167
168    fn produce(&mut self, kind: Kind) -> LResult<Token> {
170        let lexeme = self.lexeme().to_owned();
171        self.produce_with(kind, lexeme)
172    }
173
174    fn produce_with(&mut self, kind: Kind, data: impl Into<TokenData>) -> LResult<Token> {
176        let loc = self.tail_loc;
177        self.tail_loc = self.head_loc;
178        self.tail = self.head;
179        Ok(Token::new(kind, data, loc.0, loc.1))
180    }
181
182    fn produce_op(&mut self, kind: Kind) -> LResult<Token> {
184        self.produce_with(kind, ())
185    }
186
187    fn skip_whitespace(&mut self) -> &mut Self {
189        while self.peek().is_some_and(char::is_whitespace) {
190            let _ = self.consume();
191        }
192        self
193    }
194
195    fn start_token(&mut self) -> &mut Self {
197        self.tail_loc = self.head_loc;
198        self.tail = self.head;
199        self
200    }
201
202    pub fn scan(&mut self) -> LResult<Token> {
204        use TokenKind::*;
205        let tok = match self
207            .skip_whitespace()
208            .start_token()
209            .peek()
210            .ok_or_else(|| self.error(Reason::EndOfFile))?
211        {
212            '!' => Bang,
213            '"' => return self.string(),
214            '#' => Hash,
215            '%' => Rem,
216            '&' => Amp,
217            '\'' => return self.character(),
218            '(' => LParen,
219            ')' => RParen,
220            '*' => Star,
221            '+' => Plus,
222            ',' => Comma,
223            '-' => Minus,
224            '.' => Dot,
225            '/' => Slash,
226            '0' => TokenKind::Literal,
227            '1'..='9' => return self.digits::<10>(),
228            ':' => Colon,
229            ';' => Semi,
230            '<' => Lt,
231            '=' => Eq,
232            '>' => Gt,
233            '?' => Question,
234            '@' => At,
235            '[' => LBrack,
236            '\\' => Backslash,
237            ']' => RBrack,
238            '^' => Xor,
239            '`' => Grave,
240            '{' => LCurly,
241            '|' => Bar,
242            '}' => RCurly,
243            '~' => Tilde,
244            '_' => return self.identifier(),
245            c if is_xid_start(c) => return self.identifier(),
246            e => {
247                let err = Err(self.error(Reason::UnexpectedChar(e)));
248                let _ = self.consume();
249                err?
250            }
251        };
252
253        let tok = match (tok, self.consume().peek()) {
255            (Literal, Some('b')) => return self.consume().digits::<2>(),
256            (Literal, Some('d')) => return self.consume().digits::<10>(),
257            (Literal, Some('o')) => return self.consume().digits::<8>(),
258            (Literal, Some('x')) => return self.consume().digits::<16>(),
259            (Literal, Some('~')) => return self.consume().digits::<36>(),
260            (Literal, _) => return self.digits::<10>(),
261            (Amp, Some('&')) => AmpAmp,
262            (Amp, Some('=')) => AmpEq,
263            (Bang, Some('!')) => BangBang,
264            (Bang, Some('=')) => BangEq,
265            (Bar, Some('|')) => BarBar,
266            (Bar, Some('=')) => BarEq,
267            (Colon, Some(':')) => ColonColon,
268            (Dot, Some('.')) => DotDot,
269            (Eq, Some('=')) => EqEq,
270            (Eq, Some('>')) => FatArrow,
271            (Gt, Some('=')) => GtEq,
272            (Gt, Some('>')) => GtGt,
273            (Hash, Some('!')) => HashBang,
274            (Lt, Some('=')) => LtEq,
275            (Lt, Some('<')) => LtLt,
276            (Minus, Some('=')) => MinusEq,
277            (Minus, Some('>')) => Arrow,
278            (Plus, Some('=')) => PlusEq,
279            (Rem, Some('=')) => RemEq,
280            (Slash, Some('*')) => return self.block_comment()?.produce(Kind::Comment),
281            (Slash, Some('/')) => return self.line_comment(),
282            (Slash, Some('=')) => SlashEq,
283            (Star, Some('=')) => StarEq,
284            (Xor, Some('=')) => XorEq,
285            (Xor, Some('^')) => XorXor,
286            _ => return self.produce_op(tok),
287        };
288
289        let tok = match (tok, self.consume().peek()) {
291            (HashBang, Some('/')) => return self.line_comment(),
292            (DotDot, Some('=')) => DotDotEq,
293            (GtGt, Some('=')) => GtGtEq,
294            (LtLt, Some('=')) => LtLtEq,
295            _ => return self.produce_op(tok),
296        };
297
298        self.consume().produce_op(tok)
299    }
300}
301
302impl Lexer<'_> {
304    fn line_comment(&mut self) -> LResult<Token> {
306        while self.consume().peek().is_some_and(|c| c != '\n') {}
307        self.produce(Kind::Comment)
308    }
309
310    fn block_comment(&mut self) -> LResult<&mut Self> {
312        self.consume();
313        while let Some(c) = self.take() {
314            match (c, self.peek()) {
315                ('/', Some('*')) => self.block_comment()?,
316                ('*', Some('/')) => return Ok(self.consume()),
317                _ => continue,
318            };
319        }
320        Err(self.error(Reason::UnmatchedDelimiters('/')))
321    }
322}
323
324impl Lexer<'_> {
326    fn identifier(&mut self) -> LResult<Token> {
328        while self.consume().peek().is_some_and(is_xid_continue) {}
329        if let Ok(keyword) = Kind::from_str(self.lexeme()) {
330            self.produce_with(keyword, ())
331        } else {
332            self.produce(Kind::Identifier)
333        }
334    }
335}
336
337impl Lexer<'_> {
339    fn digits<const B: u32>(&mut self) -> LResult<Token> {
341        let mut value = 0;
342        while let Some(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
343            value = value * B as u128 + self.digit::<B>()? as u128;
344        }
345        match self.peek() {
347            Some('.') => {
348                if let Some('.') = self.clone().consume().take() {
350                    return self.produce_with(Kind::Literal, value);
351                }
352                let mut float = format!("{value}.");
353                self.consume();
354                while let Some(true) = self.peek().as_ref().map(char::is_ascii_digit) {
355                    float.push(self.iter.next().map(|(_, c)| c).unwrap_or_default());
356                }
357                let float = f64::from_str(&float).expect("must be parsable as float");
358                self.produce_with(Kind::Literal, float)
359            }
360            _ => self.produce_with(Kind::Literal, value),
361        }
362    }
363
364    fn digit<const B: u32>(&mut self) -> LResult<u32> {
366        let digit = self.take().ok_or_else(|| self.error(Reason::EndOfFile))?;
367        digit
368            .to_digit(B)
369            .ok_or_else(|| self.error(Reason::InvalidDigit(digit)))
370    }
371}
372
373impl Lexer<'_> {
375    pub fn string(&mut self) -> Result<Token, Error> {
377        let mut lexeme = String::new();
378        let mut depth = 0;
379        self.consume();
380        loop {
381            lexeme.push(match self.take() {
382                None => Err(self.error(Reason::UnmatchedDelimiters('"')))?,
383                Some('\\') => self.unescape()?,
384                Some('"') if depth == 0 => break,
385                Some(c @ '{') => {
386                    depth += 1;
387                    c
388                }
389                Some(c @ '}') => {
390                    depth -= 1;
391                    c
392                }
393                Some(c) => c,
394            })
395        }
396        lexeme.shrink_to_fit();
397        self.produce_with(Kind::Literal, lexeme)
398    }
399
400    fn character(&mut self) -> Result<Token, Error> {
402        let c = match self.consume().take() {
403            Some('\\') => self.unescape()?,
404            Some(c) => c,
405            None => '\0',
406        };
407        if self.take().is_some_and(|c| c == '\'') {
408            self.produce_with(Kind::Literal, c)
409        } else {
410            Err(self.error(Reason::UnmatchedDelimiters('\'')))
411        }
412    }
413
414    #[rustfmt::skip]
416    fn unescape(&mut self) -> LResult<char> {
417        Ok(match self.take().ok_or_else(|| self.error(Reason::EndOfFile))? {
418            ' ' => '\u{a0}',
419            '0' => '\0',
420            'a' => '\x07',
421            'b' => '\x08',
422            'e' => '\x1b',
423            'f' => '\x0c',
424            'n' => '\n',
425            'r' => '\r',
426            't' => '\t',
427            'u' => self.unicode_escape()?,
428            'x' => self.hex_escape()?,
429            chr => chr,
430        })
431    }
432    fn hex_escape(&mut self) -> LResult<char> {
434        let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
435        char::from_u32(out).ok_or_else(|| self.error(Reason::BadUnicode(out)))
436    }
437
438    pub fn unicode_escape(&mut self) -> Result<char, Error> {
440        self.next_if('{')
441            .ok_or_else(|| self.error(Reason::InvalidEscape('u')))?;
442        let mut out = 0;
443        while let Some(c) = self.take() {
444            if c == '}' {
445                return char::from_u32(out).ok_or_else(|| self.error(Reason::BadUnicode(out)));
446            }
447            out = out * 16
448                + c.to_digit(16)
449                    .ok_or_else(|| self.error(Reason::InvalidDigit(c)))?;
450        }
451        Err(self.error(Reason::UnmatchedDelimiters('}')))
452    }
453}
454
455impl<'t> From<&Lexer<'t>> for Loc {
456    fn from(value: &Lexer<'t>) -> Self {
457        Loc(value.line(), value.col())
458    }
459}
460
461use error::{Error, LResult, Reason};
pub mod error {
    //! The error type produced by the lexer, and the reasons for it.
    use std::fmt::{self, Display, Formatter};

    /// Shorthand for a [Result] whose error type is [Error].
    pub type LResult<T> = Result<T, Error>;

    /// A lexing failure, together with the line and column it was reported at.
    #[derive(Clone, Debug, PartialEq, Eq)]
    pub struct Error {
        pub reason: Reason,
        pub line: u32,
        pub col: u32,
    }

    /// The reason for an [Error].
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub enum Reason {
        /// The given delimiter was never matched
        UnmatchedDelimiters(char),
        /// The given character was not expected
        UnexpectedChar(char),
        /// The given character is not a known escape sequence
        UnknownEscape(char),
        /// The escape sequence starting with the given character is malformed
        InvalidEscape(char),
        /// The given character is not a valid digit
        InvalidDigit(char),
        /// The given value is not valid unicode
        BadUnicode(u32),
        /// The end of the input was reached
        EndOfFile,
    }

    impl Error {
        /// Borrows the [Reason] for this error.
        pub fn reason(&self) -> &Reason {
            let Self { reason, .. } = self;
            reason
        }

        /// Returns the `(line, col)` at which this error was reported.
        pub fn location(&self) -> (u32, u32) {
            let Self { line, col, .. } = self;
            (*line, *col)
        }
    }

    impl std::error::Error for Error {}

    impl Display for Error {
        fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
            let Self { reason, line, col } = self;
            write!(f, "{line}:{col}: {reason}")
        }
    }

    impl Display for Reason {
        fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
            match *self {
                Self::UnmatchedDelimiters(c) => write!(f, "Unmatched `{c:?}` in input"),
                Self::UnexpectedChar(c) => write!(f, "Character `{c:?}` not expected"),
                Self::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
                Self::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
                Self::InvalidDigit(c) => write!(f, "`{c:?}` is not a valid digit"),
                Self::BadUnicode(c) => write!(f, "`\\u{{{c:x}}}` is not valid unicode"),
                Self::EndOfFile => f.write_str("Reached end of input"),
            }
        }
    }
}