1#![warn(clippy::all)]
3#![feature(decl_macro)]
4use cl_structures::span::Loc;
5use cl_token::{TokenKind as Kind, *};
6use std::{
7 iter::Peekable,
8 str::{Chars, FromStr},
9};
10use unicode_ident::*;
11
12#[cfg(test)]
13mod tests;
14
15pub mod lexer_iter {
16 use super::{
18 error::{LResult, Reason},
19 Lexer, Token,
20 };
21
22 pub struct LexerIter<'t> {
24 lexer: Lexer<'t>,
25 }
26 impl Iterator for LexerIter<'_> {
27 type Item = LResult<Token>;
28 fn next(&mut self) -> Option<Self::Item> {
29 match self.lexer.scan() {
30 Ok(v) => Some(Ok(v)),
31 Err(e) => {
32 if e.reason == Reason::EndOfFile {
33 None
34 } else {
35 Some(Err(e))
36 }
37 }
38 }
39 }
40 }
41 impl<'t> IntoIterator for Lexer<'t> {
42 type Item = LResult<Token>;
43 type IntoIter = LexerIter<'t>;
44 fn into_iter(self) -> Self::IntoIter {
45 LexerIter { lexer: self }
46 }
47 }
48}
49
/// Scans source text character by character and produces tokens.
///
/// Positions are tracked twice: once for the start of the token currently
/// being scanned, and once for the next character to be consumed.
#[derive(Debug, Clone)]
pub struct Lexer<'t> {
    /// The characters of the input that have not been consumed yet.
    iter: Peekable<Chars<'t>>,
    /// Number of characters consumed before the current token began.
    start: usize,
    /// `(line, column)` at which the current token began.
    start_loc: (u32, u32),
    /// Number of characters consumed so far.
    current: usize,
    /// `(line, column)` of the next character to be consumed.
    current_loc: (u32, u32),
}
85
86impl<'t> Lexer<'t> {
87 pub fn new(text: &'t str) -> Self {
89 Self {
90 iter: text.chars().peekable(),
91 start: 0,
92 start_loc: (1, 1),
93 current: 0,
94 current_loc: (1, 1),
95 }
96 }
97 pub fn scan(&mut self) -> LResult<Token> {
99 match self.skip_whitespace().peek()? {
100 '{' => self.consume()?.produce_op(Kind::LCurly),
101 '}' => self.consume()?.produce_op(Kind::RCurly),
102 '[' => self.consume()?.produce_op(Kind::LBrack),
103 ']' => self.consume()?.produce_op(Kind::RBrack),
104 '(' => self.consume()?.produce_op(Kind::LParen),
105 ')' => self.consume()?.produce_op(Kind::RParen),
106 '&' => self.consume()?.amp(),
107 '@' => self.consume()?.produce_op(Kind::At),
108 '\\' => self.consume()?.produce_op(Kind::Backslash),
109 '!' => self.consume()?.bang(),
110 '|' => self.consume()?.bar(),
111 ':' => self.consume()?.colon(),
112 ',' => self.consume()?.produce_op(Kind::Comma),
113 '.' => self.consume()?.dot(),
114 '=' => self.consume()?.equal(),
115 '`' => self.consume()?.produce_op(Kind::Grave),
116 '>' => self.consume()?.greater(),
117 '#' => self.consume()?.hash(),
118 '<' => self.consume()?.less(),
119 '-' => self.consume()?.minus(),
120 '+' => self.consume()?.plus(),
121 '?' => self.consume()?.produce_op(Kind::Question),
122 '%' => self.consume()?.rem(),
123 ';' => self.consume()?.produce_op(Kind::Semi),
124 '/' => self.consume()?.slash(),
125 '*' => self.consume()?.star(),
126 '~' => self.consume()?.produce_op(Kind::Tilde),
127 '^' => self.consume()?.xor(),
128 '0' => self.consume()?.int_with_base(),
129 '1'..='9' => self.digits::<10>(),
130 '"' => self.consume()?.string(),
131 '\'' => self.consume()?.character(),
132 '_' => self.identifier(),
133 i if is_xid_start(i) => self.identifier(),
134 e => {
135 let err = Err(Error::unexpected_char(e, self.line(), self.col()));
136 let _ = self.consume();
137 err
138 }
139 }
140 }
141 pub fn line(&self) -> u32 {
143 self.start_loc.0
144 }
145 pub fn col(&self) -> u32 {
147 self.start_loc.1
148 }
149 fn next(&mut self) -> LResult<char> {
150 let out = self.peek();
151 self.consume()?;
152 out
153 }
154 fn peek(&mut self) -> LResult<char> {
155 self.iter
156 .peek()
157 .copied()
158 .ok_or(Error::end_of_file(self.line(), self.col()))
159 }
160 fn produce(&mut self, kind: Kind, data: impl Into<TokenData>) -> LResult<Token> {
161 let loc = self.start_loc;
162 self.start_loc = self.current_loc;
163 self.start = self.current;
164 Ok(Token::new(kind, data, loc.0, loc.1))
165 }
166 fn produce_op(&mut self, kind: Kind) -> LResult<Token> {
167 self.produce(kind, ())
168 }
169 fn skip_whitespace(&mut self) -> &mut Self {
170 while let Ok(c) = self.peek() {
171 if !c.is_whitespace() {
172 break;
173 }
174 let _ = self.consume();
175 }
176 self.start = self.current;
177 self.start_loc = self.current_loc;
178 self
179 }
180 fn consume(&mut self) -> LResult<&mut Self> {
181 self.current += 1;
182 match self.iter.next() {
183 Some('\n') => {
184 let (line, col) = &mut self.current_loc;
185 *line += 1;
186 *col = 1;
187 }
188 Some(_) => self.current_loc.1 += 1,
189 None => Err(Error::end_of_file(self.line(), self.col()))?,
190 }
191 Ok(self)
192 }
193}
194impl Lexer<'_> {
196 fn amp(&mut self) -> LResult<Token> {
197 match self.peek() {
198 Ok('&') => self.consume()?.produce_op(Kind::AmpAmp),
199 Ok('=') => self.consume()?.produce_op(Kind::AmpEq),
200 _ => self.produce_op(Kind::Amp),
201 }
202 }
203 fn bang(&mut self) -> LResult<Token> {
204 match self.peek() {
205 Ok('!') => self.consume()?.produce_op(Kind::BangBang),
206 Ok('=') => self.consume()?.produce_op(Kind::BangEq),
207 _ => self.produce_op(Kind::Bang),
208 }
209 }
210 fn bar(&mut self) -> LResult<Token> {
211 match self.peek() {
212 Ok('|') => self.consume()?.produce_op(Kind::BarBar),
213 Ok('=') => self.consume()?.produce_op(Kind::BarEq),
214 _ => self.produce_op(Kind::Bar),
215 }
216 }
217 fn colon(&mut self) -> LResult<Token> {
218 match self.peek() {
219 Ok(':') => self.consume()?.produce_op(Kind::ColonColon),
220 _ => self.produce_op(Kind::Colon),
221 }
222 }
223 fn dot(&mut self) -> LResult<Token> {
224 match self.peek() {
225 Ok('.') => {
226 if let Ok('=') = self.consume()?.peek() {
227 self.consume()?.produce_op(Kind::DotDotEq)
228 } else {
229 self.produce_op(Kind::DotDot)
230 }
231 }
232 _ => self.produce_op(Kind::Dot),
233 }
234 }
235 fn equal(&mut self) -> LResult<Token> {
236 match self.peek() {
237 Ok('=') => self.consume()?.produce_op(Kind::EqEq),
238 Ok('>') => self.consume()?.produce_op(Kind::FatArrow),
239 _ => self.produce_op(Kind::Eq),
240 }
241 }
242 fn greater(&mut self) -> LResult<Token> {
243 match self.peek() {
244 Ok('=') => self.consume()?.produce_op(Kind::GtEq),
245 Ok('>') => {
246 if let Ok('=') = self.consume()?.peek() {
247 self.consume()?.produce_op(Kind::GtGtEq)
248 } else {
249 self.produce_op(Kind::GtGt)
250 }
251 }
252 _ => self.produce_op(Kind::Gt),
253 }
254 }
255 fn hash(&mut self) -> LResult<Token> {
256 match self.peek() {
257 Ok('!') => self.consume()?.hashbang(),
258 _ => self.produce_op(Kind::Hash),
259 }
260 }
261 fn hashbang(&mut self) -> LResult<Token> {
262 match self.peek() {
263 Ok('/' | '\'') => self.line_comment(),
264 _ => self.produce_op(Kind::HashBang),
265 }
266 }
267 fn less(&mut self) -> LResult<Token> {
268 match self.peek() {
269 Ok('=') => self.consume()?.produce_op(Kind::LtEq),
270 Ok('<') => {
271 if let Ok('=') = self.consume()?.peek() {
272 self.consume()?.produce_op(Kind::LtLtEq)
273 } else {
274 self.produce_op(Kind::LtLt)
275 }
276 }
277 _ => self.produce_op(Kind::Lt),
278 }
279 }
280 fn minus(&mut self) -> LResult<Token> {
281 match self.peek() {
282 Ok('=') => self.consume()?.produce_op(Kind::MinusEq),
283 Ok('>') => self.consume()?.produce_op(Kind::Arrow),
284 _ => self.produce_op(Kind::Minus),
285 }
286 }
287 fn plus(&mut self) -> LResult<Token> {
288 match self.peek() {
289 Ok('=') => self.consume()?.produce_op(Kind::PlusEq),
290 _ => self.produce_op(Kind::Plus),
291 }
292 }
293 fn rem(&mut self) -> LResult<Token> {
294 match self.peek() {
295 Ok('=') => self.consume()?.produce_op(Kind::RemEq),
296 _ => self.produce_op(Kind::Rem),
297 }
298 }
299 fn slash(&mut self) -> LResult<Token> {
300 match self.peek() {
301 Ok('=') => self.consume()?.produce_op(Kind::SlashEq),
302 Ok('/') => self.consume()?.line_comment(),
303 Ok('*') => self.consume()?.block_comment(),
304 _ => self.produce_op(Kind::Slash),
305 }
306 }
307 fn star(&mut self) -> LResult<Token> {
308 match self.peek() {
309 Ok('=') => self.consume()?.produce_op(Kind::StarEq),
310 _ => self.produce_op(Kind::Star),
311 }
312 }
313 fn xor(&mut self) -> LResult<Token> {
314 match self.peek() {
315 Ok('=') => self.consume()?.produce_op(Kind::XorEq),
316 Ok('^') => self.consume()?.produce_op(Kind::XorXor),
317 _ => self.produce_op(Kind::Xor),
318 }
319 }
320}
321impl Lexer<'_> {
323 fn line_comment(&mut self) -> LResult<Token> {
324 let mut comment = String::new();
325 while Ok('\n') != self.peek() {
326 comment.push(self.next()?);
327 }
328 self.produce(Kind::Comment, comment)
329 }
330 fn block_comment(&mut self) -> LResult<Token> {
331 let mut comment = String::new();
332 while let Ok(c) = self.next() {
333 if '*' == c && Ok('/') == self.peek() {
334 break;
335 }
336 comment.push(c);
337 }
338 self.consume()?.produce(Kind::Comment, comment)
339 }
340}
341impl Lexer<'_> {
343 fn identifier(&mut self) -> LResult<Token> {
344 let mut out = String::from(self.xid_start()?);
345 while let Ok(c) = self.xid_continue() {
346 out.push(c)
347 }
348 if let Ok(keyword) = Kind::from_str(&out) {
349 self.produce(keyword, ())
350 } else {
351 self.produce(Kind::Identifier, TokenData::String(out))
352 }
353 }
354 fn xid_start(&mut self) -> LResult<char> {
355 match self.peek()? {
356 xid if xid == '_' || is_xid_start(xid) => {
357 self.consume()?;
358 Ok(xid)
359 }
360 bad => Err(Error::not_identifier(bad, self.line(), self.col())),
361 }
362 }
363 fn xid_continue(&mut self) -> LResult<char> {
364 match self.peek()? {
365 xid if is_xid_continue(xid) => {
366 self.consume()?;
367 Ok(xid)
368 }
369 bad => Err(Error::not_identifier(bad, self.line(), self.col())),
370 }
371 }
372}
373impl Lexer<'_> {
375 fn int_with_base(&mut self) -> LResult<Token> {
376 match self.peek() {
377 Ok('x') => self.consume()?.digits::<16>(),
378 Ok('d') => self.consume()?.digits::<10>(),
379 Ok('o') => self.consume()?.digits::<8>(),
380 Ok('b') => self.consume()?.digits::<2>(),
381 Ok('0'..='9' | '.') => self.digits::<10>(),
382 _ => self.produce(Kind::Literal, 0),
383 }
384 }
385 fn digits<const B: u32>(&mut self) -> LResult<Token> {
386 let mut value = 0;
387 while let Ok(true) = self.peek().as_ref().map(char::is_ascii_alphanumeric) {
388 value = value * B as u128 + self.digit::<B>()? as u128;
389 }
390 match self.peek() {
392 Ok('.') => {
393 if let Ok('.') = self.clone().consume()?.next() {
395 return self.produce(Kind::Literal, value);
396 }
397 let mut float = format!("{value}.");
398 self.consume()?;
399 while let Ok(true) = self.peek().as_ref().map(char::is_ascii_digit) {
400 float.push(self.iter.next().unwrap_or_default());
401 }
402 let float = f64::from_str(&float).expect("must be parsable as float");
403 self.produce(Kind::Literal, float)
404 }
405 _ => self.produce(Kind::Literal, value),
406 }
407 }
408 fn digit<const B: u32>(&mut self) -> LResult<u32> {
409 let digit = self.peek()?;
410 self.consume()?;
411 digit
412 .to_digit(B)
413 .ok_or(Error::invalid_digit(digit, self.line(), self.col()))
414 }
415}
416impl Lexer<'_> {
418 fn string(&mut self) -> LResult<Token> {
419 let mut value = String::new();
420 while '"'
421 != self
422 .peek()
423 .map_err(|e| e.mask_reason(Reason::UnmatchedDelimiters('"')))?
424 {
425 value.push(self.unescape()?)
426 }
427 self.consume()?.produce(Kind::Literal, value)
428 }
429 fn character(&mut self) -> LResult<Token> {
430 let out = self.unescape()?;
431 match self.peek()? {
432 '\'' => self.consume()?.produce(Kind::Literal, out),
433 _ => Err(Error::unmatched_delimiters('\'', self.line(), self.col())),
434 }
435 }
436 fn unescape(&mut self) -> LResult<char> {
438 match self.next() {
439 Ok('\\') => (),
440 other => return other,
441 }
442 Ok(match self.next()? {
443 'a' => '\x07',
444 'b' => '\x08',
445 'f' => '\x0c',
446 'n' => '\n',
447 'r' => '\r',
448 't' => '\t',
449 'x' => self.hex_escape()?,
450 'u' => self.unicode_escape()?,
451 '0' => '\0',
452 chr => chr,
453 })
454 }
455 fn hex_escape(&mut self) -> LResult<char> {
457 let out = (self.digit::<16>()? << 4) + self.digit::<16>()?;
458 char::from_u32(out).ok_or(Error::bad_unicode(out, self.line(), self.col()))
459 }
460 fn unicode_escape(&mut self) -> LResult<char> {
462 let mut out = 0;
463 let Ok('{') = self.peek() else {
464 return Err(Error::invalid_escape('u', self.line(), self.col()));
465 };
466 self.consume()?;
467 while let Ok(c) = self.peek() {
468 match c {
469 '}' => {
470 self.consume()?;
471 return char::from_u32(out).ok_or(Error::bad_unicode(
472 out,
473 self.line(),
474 self.col(),
475 ));
476 }
477 _ => out = (out << 4) + self.digit::<16>()?,
478 }
479 }
480 Err(Error::invalid_escape('u', self.line(), self.col()))
481 }
482}
483
484impl<'t> From<&Lexer<'t>> for Loc {
485 fn from(value: &Lexer<'t>) -> Self {
486 Loc(value.line(), value.col())
487 }
488}
489
490use error::{Error, LResult, Reason};
491pub mod error {
492 use std::fmt::Display;
494
495 pub type LResult<T> = Result<T, Error>;
497 #[derive(Clone, Debug, PartialEq, Eq)]
498 pub struct Error {
499 pub reason: Reason,
500 pub line: u32,
501 pub col: u32,
502 }
503 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
505 pub enum Reason {
506 UnmatchedDelimiters(char),
508 UnexpectedChar(char),
510 NotIdentifier(char),
512 UnknownEscape(char),
515 InvalidEscape(char),
517 InvalidDigit(char),
519 UnknownBase(char),
522 BadUnicode(u32),
524 EndOfFile,
526 }
527 error_impl! {
528 unmatched_delimiters(c: char) => Reason::UnmatchedDelimiters(c),
529 unexpected_char(c: char) => Reason::UnexpectedChar(c),
530 not_identifier(c: char) => Reason::NotIdentifier(c),
531 unknown_escape(e: char) => Reason::UnknownEscape(e),
532 invalid_escape(e: char) => Reason::InvalidEscape(e),
533 invalid_digit(digit: char) => Reason::InvalidDigit(digit),
534 unknown_base(base: char) => Reason::UnknownBase(base),
535 bad_unicode(value: u32) => Reason::BadUnicode(value),
536 end_of_file => Reason::EndOfFile,
537 }
538 impl Error {
539 pub(super) fn mask_reason(self, reason: Reason) -> Self {
541 Self { reason, ..self }
542 }
543 pub fn reason(&self) -> &Reason {
545 &self.reason
546 }
547 pub fn location(&self) -> (u32, u32) {
549 (self.line, self.col)
550 }
551 }
552 macro error_impl ($($fn:ident$(( $($p:ident: $t:ty),* ))? => $reason:expr),*$(,)?) {
553 #[allow(dead_code)]
554 impl Error {
555 $(pub(super) fn $fn ($($($p: $t),*,)? line: u32, col: u32) -> Self {
556 Self { reason: $reason, line, col }
557 })*
558 }
559 }
560 impl std::error::Error for Error {}
561 impl Display for Error {
562 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
563 write!(f, "{}:{}: {}", self.line, self.col, self.reason)
564 }
565 }
566 impl Display for Reason {
567 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
568 match self {
569 Reason::UnmatchedDelimiters(c) => write! {f, "Unmatched `{c}` in input"},
570 Reason::UnexpectedChar(c) => write!(f, "Character `{c}` not expected"),
571 Reason::NotIdentifier(c) => write!(f, "Character `{c}` not valid in identifiers"),
572 Reason::UnknownEscape(c) => write!(f, "`\\{c}` is not a known escape sequence"),
573 Reason::InvalidEscape(c) => write!(f, "Escape sequence `\\{c}`... is malformed"),
574 Reason::InvalidDigit(c) => write!(f, "`{c}` is not a valid digit"),
575 Reason::UnknownBase(c) => write!(f, "`0{c}`... is not a valid base"),
576 Reason::BadUnicode(c) => write!(f, "`{c}` is not a valid unicode code-point"),
577 Reason::EndOfFile => write!(f, "Reached end of input"),
578 }
579 }
580 }
581}