1#![warn(clippy::all)]
3#![feature(decl_macro)]
4use cl_structures::span::Loc;
5use cl_token::{TokenKind as Kind, *};
6use std::{
7 iter::Peekable,
8 str::{CharIndices, FromStr},
9};
10use unicode_ident::*;
11
12#[cfg(test)]
13mod tests;
14
15pub mod lexer_iter {
16 use super::{
18 Lexer, Token,
19 error::{LResult, Reason},
20 };
21
22 pub struct LexerIter<'t> {
24 lexer: Lexer<'t>,
25 }
26 impl Iterator for LexerIter<'_> {
27 type Item = LResult<Token>;
28 fn next(&mut self) -> Option<Self::Item> {
29 match self.lexer.scan() {
30 Ok(v) => Some(Ok(v)),
31 Err(e) => {
32 if e.reason == Reason::EndOfFile {
33 None
34 } else {
35 Some(Err(e))
36 }
37 }
38 }
39 }
40 }
41 impl<'t> IntoIterator for Lexer<'t> {
42 type Item = LResult<Token>;
43 type IntoIter = LexerIter<'t>;
44 fn into_iter(self) -> Self::IntoIter {
45 LexerIter { lexer: self }
46 }
47 }
48}
49
/// Lexer state over a borrowed source string.
///
/// The lexer keeps two cursors into `text`: the *tail*, which marks where the
/// token currently being scanned starts, and the *head*, which marks how far
/// the lexer has read. The slice between them is the current lexeme.
#[derive(Clone, Debug)]
pub struct Lexer<'t> {
    /// The source text being lexed.
    text: &'t str,
    /// Peekable `(byte index, char)` iterator over `text`.
    iter: Peekable<CharIndices<'t>>,
    /// Byte index into `text` of the read position (end of the current lexeme).
    head: usize,
    /// `(line, column)` corresponding to `head`.
    head_loc: (u32, u32),
    /// Byte index into `text` where the current token starts.
    tail: usize,
    /// `(line, column)` corresponding to `tail`.
    tail_loc: (u32, u32),
}
92
93impl<'t> Lexer<'t> {
94 pub fn new(text: &'t str) -> Self {
96 Self {
97 text,
98 iter: text.char_indices().peekable(),
99 head: 0,
100 head_loc: (1, 1),
101 tail: 0,
102 tail_loc: (1, 1),
103 }
104 }
105
106 pub fn line(&self) -> u32 {
108 self.tail_loc.0
109 }
110
111 pub fn col(&self) -> u32 {
113 self.tail_loc.1
114 }
115
116 fn lexeme(&mut self) -> &'t str {
118 &self.text[self.tail..self.head]
119 }
120
121 fn peek(&mut self) -> Option<char> {
123 self.iter.peek().map(|(_, c)| *c)
124 }
125
126 fn advance_tail(&mut self) {
128 let (idx, c) = self.iter.peek().copied().unwrap_or((self.text.len(), '\0'));
129 let (line, col) = &mut self.head_loc;
130 let diff = idx - self.head;
131
132 self.head = idx;
133 match c {
134 '\n' => {
135 *line += 1;
136 *col = 1;
137 }
138 _ => *col += diff as u32,
139 }
140 }
141
142 pub fn take(&mut self) -> Option<char> {
144 let (_, c) = self.iter.next()?;
145 self.advance_tail();
146 Some(c)
147 }
148
149 pub fn next_if(&mut self, expected: char) -> Option<char> {
151 let (_, c) = self.iter.next_if(|&(_, c)| c == expected)?;
152 self.advance_tail();
153 Some(c)
154 }
155
156 pub fn consume(&mut self) -> &mut Self {
158 self.iter.next();
159 self.advance_tail();
160 self
161 }
162
163 fn error(&self, reason: Reason) -> Error {
165 Error { reason, line: self.line(), col: self.col() }
166 }
167
168 fn produce(&mut self, kind: Kind) -> LResult<Token> {
170 let lexeme = self.lexeme().to_owned();
171 self.produce_with(kind, lexeme)
172 }
173
174 fn produce_with(&mut self, kind: Kind, data: impl Into<TokenData>) -> LResult<Token> {
176 let loc = self.tail_loc;
177 self.tail_loc = self.head_loc;
178 self.tail = self.head;
179 Ok(Token::new(kind, data, loc.0, loc.1))
180 }
181
182 fn produce_op(&mut self, kind: Kind) -> LResult<Token> {
184 self.produce_with(kind, ())
185 }
186
187 fn skip_whitespace(&mut self) -> &mut Self {
189 while self.peek().is_some_and(char::is_whitespace) {
190 let _ = self.consume();
191 }
192 self
193 }
194
195 fn start_token(&mut self) -> &mut Self {
197 self.tail_loc = self.head_loc;
198 self.tail = self.head;
199 self
200 }
201
202 pub fn scan(&mut self) -> LResult<Token> {
204 use TokenKind::*;
205 let tok = match self
207 .skip_whitespace()
208 .start_token()
209 .peek()
210 .ok_or_else(|| self.error(Reason::EndOfFile))?
211 {
212 '!' => Bang,
213 '"' => return self.string(),
214 '#' => Hash,
215 '%' => Rem,
216 '&' => Amp,
217 '\'' => return self.character(),
218 '(' => LParen,
219 ')' => RParen,
220 '*' => Star,
221 '+' => Plus,
222 ',' => Comma,
223 '-' => Minus,
224 '.' => Dot,
225 '/' => Slash,
226 '0' => TokenKind::Literal,
227 '1'..='9' => return self.digits::<10>(),
228 ':' => Colon,
229 ';' => Semi,
230 '<' => Lt,
231 '=' => Eq,
232 '>' => Gt,
233 '?' => Question,
234 '@' => At,
235 '[' => LBrack,
236 '\\' => Backslash,
237 ']' => RBrack,
238 '^' => Xor,
239 '`' => Grave,
240 '{' => LCurly,
241 '|' => Bar,
242 '}' => RCurly,
243 '~' => Tilde,
244 '_' => return self.identifier(),
245 c if is_xid_start(c) => return self.identifier(),
246 e => {
247 let err = Err(self.error(Reason::UnexpectedChar(e)));
248 let _ = self.consume();
249 err?
250 }
251 };
252
253 let tok = match (tok, self.consume().peek()) {
255 (Literal, Some('b')) => return self.consume().digits::<2>(),
256 (Literal, Some('d')) => return self.consume().digits::<10>(),
257 (Literal, Some('o')) => return self.consume().digits::<8>(),
258 (Literal, Some('x')) => return self.consume().digits::<16>(),
259 (Literal, Some('~')) => return self.consume().digits::<36>(),
260 (Literal, _) => return self.digits::<10>(),
261 (Amp, Some('&')) => AmpAmp,
262 (Amp, Some('=')) => AmpEq,
263 (Bang, Some('!')) => BangBang,
264 (Bang, Some('=')) => BangEq,
265 (Bar, Some('|')) => BarBar,
266 (Bar, Some('=')) => BarEq,
267 (Colon, Some(':')) => ColonColon,
268 (Dot, Some('.')) => DotDot,
269 (Eq, Some('=')) => EqEq,
270 (Eq, Some('>')) => FatArrow,
271 (Gt, Some('=')) => GtEq,
272 (Gt, Some('>')) => GtGt,
273 (Hash, Some('!')) => HashBang,
274 (Lt, Some('=')) => LtEq,
275 (Lt, Some('<')) => LtLt,
276 (Minus, Some('=')) => MinusEq,
277 (Minus, Some('>')) => Arrow,
278 (Plus, Some('=')) => PlusEq,
279 (Rem, Some('=')) => RemEq,
280 (Slash, Some('*')) => return self.block_comment()?.produce(Kind::Comment),
281 (Slash, Some('/')) => return self.line_comment(),
282 (Slash, Some('=')) => SlashEq,
283 (Star, Some('=')) => StarEq,
284 (Xor, Some('=')) => XorEq,
285 (Xor, Some('^')) => XorXor,
286 _ => return self.produce_op(tok),
287 };
288
289 let tok = match (tok, self.consume().peek()) {
291 (HashBang, Some('/')) => return self.line_comment(),
292 (DotDot, Some('=')) => DotDotEq,
293 (GtGt, Some('=')) => GtGtEq,
294 (LtLt, Some('=')) => LtLtEq,
295 _ => return self.produce_op(tok),
296 };
297
298 self.consume().produce_op(tok)
299 }
300}
301
302impl Lexer<'_> {
304 fn line_comment(&mut self) -> LResult<Token> {
306 while self.consume().peek().is_some_and(|c| c != '\n') {}
307 self.produce(Kind::Comment)
308 }
309
310 fn block_comment(&mut self) -> LResult<&mut Self> {
312 self.consume();
313 while let Some(c) = self.take() {
314 match (c, self.peek()) {
315 ('/', Some('*')) => self.block_comment()?,
316 ('*', Some('/')) => return Ok(self.consume()),
317 _ => continue,
318 };
319 }
320 Err(self.error(Reason::UnmatchedDelimiters('/')))
321 }
322}
323
324impl Lexer<'_> {
326 fn identifier(&mut self) -> LResult<Token> {
328 while self.consume().peek().is_some_and(is_xid_continue) {}
329 if let Ok(keyword) = Kind::from_str(self.lexeme()) {
330 self.produce_with(keyword, ())
331 } else {
332 self.produce(Kind::Identifier)
333 }
334 }
335}
336
337impl Lexer<'_> {
339 fn digits<const B: u32>(&mut self) -> LResult<Token> {
341 let mut value = 0;
342 while let Some(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
343 value = value * B as u128 + self.digit::<B>()? as u128;
344 }
345 match self.peek() {
347 Some('.') => {
348 if let Some('.') = self.clone().consume().take() {
350 return self.produce_with(Kind::Literal, value);
351 }
352 let mut float = format!("{value}.");
353 self.consume();
354 while let Some(true) = self.peek().as_ref().map(char::is_ascii_digit) {
355 float.push(self.iter.next().map(|(_, c)| c).unwrap_or_default());
356 }
357 let float = f64::from_str(&float).expect("must be parsable as float");
358 self.produce_with(Kind::Literal, float)
359 }
360 _ => self.produce_with(Kind::Literal, value),
361 }
362 }
363
364 fn digit<const B: u32>(&mut self) -> LResult<u32> {
366 let digit = self.take().ok_or_else(|| self.error(Reason::EndOfFile))?;
367 digit
368 .to_digit(B)
369 .ok_or_else(|| self.error(Reason::InvalidDigit(digit)))
370 }
371}
372
373impl Lexer<'_> {
375 pub fn string(&mut self) -> Result<Token, Error> {
377 let mut lexeme = String::new();
378 let mut depth = 0;
379 self.consume();
380 loop {
381 lexeme.push(match self.take() {
382 None => Err(self.error(Reason::UnmatchedDelimiters('"')))?,
383 Some('\\') => self.unescape()?,
384 Some('"') if depth == 0 => break,
385 Some(c @ '{') => {
386 depth += 1;
387 c
388 }
389 Some(c @ '}') => {
390 depth -= 1;
391 c
392 }
393 Some(c) => c,
394 })
395 }
396 lexeme.shrink_to_fit();
397 self.produce_with(Kind::Literal, lexeme)
398 }
399
400 fn character(&mut self) -> Result<Token, Error> {
402 let c = match self.consume().take() {
403 Some('\\') => self.unescape()?,
404 Some(c) => c,
405 None => '\0',
406 };
407 if self.take().is_some_and(|c| c == '\'') {
408 self.produce_with(Kind::Literal, c)
409 } else {
410 Err(self.error(Reason::UnmatchedDelimiters('\'')))
411 }
412 }
413
414 #[rustfmt::skip]
416 fn unescape(&mut self) -> LResult<char> {
417 Ok(match self.take().ok_or_else(|| self.error(Reason::EndOfFile))? {
418 ' ' => '\u{a0}',
419 '0' => '\0',
420 'a' => '\x07',
421 'b' => '\x08',
422 'e' => '\x1b',
423 'f' => '\x0c',
424 'n' => '\n',
425 'r' => '\r',
426 't' => '\t',
427 'u' => self.unicode_escape()?,
428 'x' => self.hex_escape()?,
429 chr => chr,
430 })
431 }
432 fn hex_escape(&mut self) -> LResult<char> {
434 let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
435 char::from_u32(out).ok_or_else(|| self.error(Reason::BadUnicode(out)))
436 }
437
438 pub fn unicode_escape(&mut self) -> Result<char, Error> {
440 self.next_if('{')
441 .ok_or_else(|| self.error(Reason::InvalidEscape('u')))?;
442 let mut out = 0;
443 while let Some(c) = self.take() {
444 if c == '}' {
445 return char::from_u32(out).ok_or_else(|| self.error(Reason::BadUnicode(out)));
446 }
447 out = out * 16
448 + c.to_digit(16)
449 .ok_or_else(|| self.error(Reason::InvalidDigit(c)))?;
450 }
451 Err(self.error(Reason::UnmatchedDelimiters('}')))
452 }
453}
454
455impl<'t> From<&Lexer<'t>> for Loc {
456 fn from(value: &Lexer<'t>) -> Self {
457 Loc(value.line(), value.col())
458 }
459}
460
461use error::{Error, LResult, Reason};
pub mod error {
    //! The lexer's error type, and the reasons lexing can fail.
    use std::fmt::{self, Display, Formatter};

    /// Result alias for operations that can fail with a lexer [`Error`].
    pub type LResult<T> = Result<T, Error>;

    /// A lexing failure: why it happened, and the line and column it is
    /// reported at.
    #[derive(Clone, Debug, PartialEq, Eq)]
    pub struct Error {
        pub reason: Reason,
        pub line: u32,
        pub col: u32,
    }

    /// The reason an [`Error`] was produced.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub enum Reason {
        /// A delimiter in the input has no matching counterpart.
        UnmatchedDelimiters(char),
        /// A character was found where it is not expected.
        UnexpectedChar(char),
        /// The character after a backslash is not a known escape sequence.
        UnknownEscape(char),
        /// An escape sequence beginning with this character is malformed.
        InvalidEscape(char),
        /// The character is not a valid digit.
        InvalidDigit(char),
        /// The value is not valid unicode.
        BadUnicode(u32),
        /// The end of the input was reached.
        EndOfFile,
    }

    impl Error {
        /// Returns the [`Reason`] for this error.
        pub fn reason(&self) -> &Reason {
            &self.reason
        }

        /// Returns the `(line, col)` this error is reported at.
        pub fn location(&self) -> (u32, u32) {
            (self.line, self.col)
        }
    }

    impl std::error::Error for Error {}

    impl Display for Error {
        /// Formats as `line:col: reason`.
        fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
            let Self { reason, line, col } = self;
            write!(f, "{line}:{col}: {reason}")
        }
    }

    impl Display for Reason {
        fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
            match *self {
                Self::UnmatchedDelimiters(c) => write!(f, "Unmatched `{c:?}` in input"),
                Self::UnexpectedChar(c) => write!(f, "Character `{c:?}` not expected"),
                Self::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
                Self::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
                Self::InvalidDigit(c) => write!(f, "`{c:?}` is not a valid digit"),
                Self::BadUnicode(c) => write!(f, "`\\u{{{c:x}}}` is not valid unicode"),
                Self::EndOfFile => write!(f, "Reached end of input"),
            }
        }
    }
}