nixd
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "Lexer.h"
2
3#include "nixf/Basic/Range.h"
4
5#include <cassert>
6#include <cctype>
7
8using namespace nixf;
9using namespace tok;
10
11namespace {
12
bool isUriSchemeChar(char Ch) {
  // These characters are valid URI scheme char (RFC 3986: ALPHA / DIGIT /
  // "+" / "-" / ".").
  // Cast to unsigned char: passing a plain char that may be negative to
  // std::isalnum is undefined behavior.
  return std::isalnum(static_cast<unsigned char>(Ch)) || Ch == '+' ||
         Ch == '-' || Ch == '.';
}
17
bool isUriPathChar(char Ch) {
  // These characters are valid URI path char.
  // Cast to unsigned char: std::isalnum on a negative plain char is UB.
  return std::isalnum(static_cast<unsigned char>(Ch)) || Ch == '%' ||
         Ch == '/' || Ch == '?' || Ch == ':' || Ch == '@' || Ch == '&' ||
         Ch == '=' || Ch == '+' || Ch == '$' || Ch == ',' || Ch == '-' ||
         Ch == '_' || Ch == '.' || Ch == '!' || Ch == '~' || Ch == '*' ||
         Ch == '\'';
}
25
bool isPathChar(char Ch) {
  // These characters are valid path char: [a-zA-Z0-9._-+].
  // Cast to unsigned char: std::isdigit/std::isalpha on a negative plain
  // char is undefined behavior.
  return std::isdigit(static_cast<unsigned char>(Ch)) ||
         std::isalpha(static_cast<unsigned char>(Ch)) || Ch == '.' ||
         Ch == '_' || Ch == '-' || Ch == '+';
}
31
bool isIdentifierChar(char Ch) {
  // Identifier continuation chars: [a-zA-Z0-9_'-].
  // Cast to unsigned char: std::isdigit/std::isalpha on a negative plain
  // char is undefined behavior.
  return std::isdigit(static_cast<unsigned char>(Ch)) ||
         std::isalpha(static_cast<unsigned char>(Ch)) || Ch == '_' ||
         Ch == '\'' || Ch == '-';
}
36
37} // namespace
38
39using DK = Diagnostic::DiagnosticKind;
41
42std::optional<LexerCursorRange> Lexer::consumePrefix(std::string_view Prefix) {
43 LexerCursor Begin = cur();
44 if (peekPrefix(Prefix)) {
45 consume(Prefix.length());
46 return LexerCursorRange{Begin, cur()};
47 }
48 return std::nullopt;
49}
50
51std::optional<LexerCursorRange> Lexer::consumeManyOf(std::string_view Chars) {
52 if (eof())
53 return std::nullopt;
54 if (Chars.find(peekUnwrap()) != std::string_view::npos) {
55 auto Start = Cur;
56 while (!eof() && Chars.find(peekUnwrap()) != std::string_view::npos) {
57 consume();
58 }
59 return LexerCursorRange{Start, Cur};
60 }
61 return std::nullopt;
62}
63
64std::optional<char> Lexer::consumeOneOf(std::string_view Chars) {
65 if (eof())
66 return std::nullopt;
67 if (Chars.find(peekUnwrap()) != std::string_view::npos) {
68 char Ret = peekUnwrap();
69 consume();
70 return Ret;
71 }
72 return std::nullopt;
73}
74
75bool Lexer::consumeOne(char C) {
76 if (eof())
77 return false;
78 if (peek() == C) {
79 consume();
80 return true;
81 }
82 return false;
83}
84
85std::optional<LexerCursorRange> Lexer::consumeManyPathChar() {
86 if (eof())
87 return std::nullopt;
88 if (auto Ch = peek(); Ch && isPathChar(*Ch)) {
89 auto Start = Cur;
90 do {
91 consume();
92 Ch = peek();
93 } while (Ch && isPathChar(*Ch));
94 return LexerCursorRange{Start, Cur};
95 }
96 return std::nullopt;
97}
98
99bool Lexer::peekPrefix(std::string_view Prefix) {
100 if (Cur.Offset + Prefix.length() > Src.length())
101 return false;
102 if (remain().starts_with(Prefix)) {
103 return true;
104 }
105 return false;
106}
107
108bool Lexer::consumeWhitespaces() {
109 if (auto Ch = peek(); Ch && !std::isspace(*Ch))
110 return false;
111 do {
112 consume();
113 } while (!eof() && std::isspace(peekUnwrap()));
114 return true;
115}
116
// Consume one comment: a block comment `/* ... */` or a line comment
// `# ...`. Returns true iff a comment was consumed. An unterminated
// block comment emits DK_UnterminatedBComment with an "insert */" fix-it.
bool Lexer::consumeComments() {
  if (eof())
    return false;
  if (std::optional<LexerCursorRange> BeginRange = consumePrefix("/*")) {
    // Consume block comments until we meet '*/'
    while (true) {
      if (eof()) {
        // There is no '*/' to terminate comments
        // NOTE(review): the extraction appears to have dropped the second
        // argument of this emplace_back (presumably the diagnostic's
        // source range) — confirm against the original file.
        Diagnostic &Diag = Diags.emplace_back(DK::DK_UnterminatedBComment,
        Diag.note(NK::NK_BCommentBegin, *BeginRange);
        Diag.fix("insert */").edit(TextEdit::mkInsertion(cur(), "*/"));
        return true;
      }
      if (consumePrefix("*/"))
        // We found the ending '*/'
        return true;
      consume(); // Consume a character (block comment body).
    }
  } else if (consumePrefix("#")) {
    // Single line comments, consume blocks until we meet EOF or '\n' or '\r'
    while (true) {
      if (eof() || consumeEOL()) {
        return true;
      }
      consume(); // Consume a character (single line comment body).
    }
  }
  return false;
}
147
148void Lexer::consumeTrivia() {
149 while (true) {
150 if (eof())
151 return;
152 if (consumeWhitespaces() || consumeComments())
153 continue;
154 return;
155 }
156}
157
158bool Lexer::lexFloatExp() {
159 // accept ([Ee][+-]?[0-9]+)?, the exponential part (after `.` of a float)
160 if (std::optional<char> ECh = consumeOneOf("Ee")) {
161 // [+-]?
162 consumeOneOf("+-");
163 // [0-9]+
164 if (!consumeManyDigits()) {
165 // not matching [0-9]+, error
166 Diags.emplace_back(DK::DK_FloatNoExp, curRange()) << std::string(1, *ECh);
167 return false;
168 }
169 }
170
171 return true;
172}
173
174void Lexer::lexNumbers() {
175 // numbers
176 //
177 // currently libexpr accepts:
178 // INT [0-9]+
179 // FLOAT (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)?
180 //
181 // regex 'FLOAT' rejects floats like 00.0
182 //
183 // nix-repl> 000.3
184 // error: attempt to call something which is not a function but an integer
185 //
186 // at «string»:1:1:
187 //
188 // 1| 000.3
189 // | ^
190 //
191 // however, we accept [0-9]+\.[0-9]*([Ee][+-]?[0-9]+)?
192 // and issues a warning if it has leading zeros
193 // [0-9]+
194 auto Ch = consumeManyDigits();
195 assert(Ch.has_value() && "lexNumbers() must be called with a digit start");
196 if (peek() == '.') {
197 // float
198 Tok = tok_float;
199 consume();
200 // [0-9]*
201 consumeManyDigits();
202 lexFloatExp();
203 // Checking that if the float token has leading zeros.
204 std::string_view Prefix = Src.substr(Ch->lCur().Offset, 2);
205 if (Prefix.starts_with("0") && Prefix != "0.")
206 Diags.emplace_back(DK::DK_FloatLeadingZero, *Ch) << std::string(Prefix);
207 } else {
208 Tok = tok_int;
209 }
210}
211
212bool Lexer::consumePathStart() {
213 // PATH_CHAR [a-zA-Z0-9\.\_\-\+]
214 // PATH {PATH_CHAR}*(\/{PATH_CHAR}+)+\/?
215 // PATH_SEG {PATH_CHAR}*\/
216 //
217
218 // Path, starts with any valid path char, and must contain slashs
219 // Here, we look ahead characters, the must be valid path char
220 // And also check if it contains a slash.
221 LexerCursor Saved = cur();
222
223 // {PATH_CHAR}*
224 consumeManyPathChar();
225
226 // Check if there is a slash, and also a path char right after such slash.
227 // If so, it is a path_fragment
228 if (consumeOne('/')) {
229 // Now, check if we are on a normal path char.
230 if (auto Ch = peek(); Ch && isPathChar(*Ch))
231 return true;
232 // Or, look ahead to see if is a dollar curly. ${
233 // This should be parsed as path-interpolation.
234 if (peekPrefix("${"))
235 return true;
236 }
237
238 // Otherwise, it is not a path, restore cursor.
239 Cur = Saved;
240 return false;
241}
242
243bool Lexer::consumeURI() {
244 // URI
245 // [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+
246 //
247
248 LexerCursor Saved = cur();
249 // URI, starts with any valid URI scheme char, and must contain a colon
250 // Here, we look ahead characters, the must be valid path char
251 // And also check if it contains a colon.
252
253 while (!eof() && isUriSchemeChar(peekUnwrap()))
254 consume();
255
256 // Check if there is a colon, and also a URI path char right after such colon.
257 // If so, it is a uri
258 if (!eof() && peekUnwrap() == ':') {
259 consume();
260 if (!eof() && isUriPathChar(peekUnwrap())) {
261 do
262 consume();
263 while (!eof() && isUriPathChar(peekUnwrap()));
264 return true;
265 }
266 }
267
268 Cur = Saved;
269 return false;
270}
271
272bool Lexer::consumeSPath() {
273 // <{PATH_CHAR}+(\/{PATH_CHAR}+)*>
274 LexerCursor Saved = cur();
275
276 if (peek() == '<')
277 consume();
278
279 if (!eof() && isPathChar(peekUnwrap())) {
280 // {PATH_CHAR}+
281 while (!eof() && isPathChar(peekUnwrap()))
282 consume();
283 // (\/{PATH_CHAR}+)*
284 while (true) {
285 // \/
286 if (peek() == '/') {
287 consume();
288 // {PATH_CHAR}+
289 if (!eof() && isPathChar(peekUnwrap())) {
290 while (!eof() && isPathChar(peekUnwrap()))
291 consume();
292 continue;
293 }
294 }
295 break;
296 }
297 if (peek() == '>') {
298 consume();
299 return true;
300 }
301 }
302
303 Cur = Saved;
304 return false;
305}
306
307void Lexer::lexIdentifier() {
308 // identifier: [a-zA-Z_][a-zA-Z0-9_\'\-]*,
309 consume();
310 while (!eof() && isIdentifierChar(peekUnwrap()))
311 consume();
312}
313
// Re-classify the identifier token just lexed (`tokStr()`) as a keyword
// token (tok_kw_*) when its text exactly matches a keyword.
void Lexer::maybeKW() {
  // For complex language this should be done on automaton or hashtable.
  // But actually there are few keywords in nix language, so we just do
  // comparison.
#define TOK_KEYWORD(NAME)                                                      \
  if (tokStr() == #NAME) {                                                     \
    Tok = tok_kw_##NAME;                                                       \
    return;                                                                    \
  }
  // NOTE(review): a line appears to be missing here in the extraction —
  // presumably an include of the token-kinds list that instantiates
  // TOK_KEYWORD for each keyword. As shown, the macro is defined and
  // immediately undefined, making this a no-op; confirm against the
  // original source.
#undef TOK_KEYWORD
}
326
  // NOTE(review): the extraction dropped this function's signature line
  // (`Token Lexer::lexPath()` per the symbol index); the body below is
  // kept verbatim.
  //
  // Lex one piece of a path context: a path fragment, a `${`
  // interpolation start, or tok_path_end when neither follows.
  //
  // Accept all characters, except ${, or "
  // aaa/b//c
  // Path
  //   PathFragment aaa/  <- lex()
  //   PathFragment b//c  <- lexPath()
  startToken();
  Tok = tok_path_end;
  if (eof()) {
    return finishToken();
  }

  if (consumePrefix("${")) {
    Tok = tok_dollar_curly;
    return finishToken();
  }

  if (isPathChar(peekUnwrap()) || peekUnwrap() == '/') {
    // Greedily take path chars and slashes until an interpolation.
    Tok = tok_path_fragment;
    while (!eof() && (isPathChar(peekUnwrap()) || peekUnwrap() == '/')) {
      // Encountered an interpolation, stop here
      if (peekPrefix("${"))
        break;
      consume();
    }
    return finishToken();
  }
  return finishToken();
}
356
  // NOTE(review): the extraction dropped this function's signature line
  // (`Token Lexer::lexString()` per the symbol index); the body below is
  // kept verbatim.
  //
  // Lex one piece inside a double-quoted string: closing `"`, an escape,
  // a `${` interpolation start, or a plain string part.
  //
  // Accept all characters, except ${, or "
  startToken();
  if (eof()) {
    Tok = tok_eof;
    return finishToken();
  }
  switch (peekUnwrap()) {
  case '"':
    consume();
    Tok = tok_dquote;
    break;
  case '\\':
    // Consume two characters, for escaping
    // NOTE: we may not want to break out Unicode wchar here, but libexpr does
    // such ignoring
    consume(2);
    Tok = tok_string_escape;
    break;
  case '$':
    if (consumePrefix("${")) {
      Tok = tok_dollar_curly;
      break;
    }

    // Otherwise, consider it is a part of string.
    [[fallthrough]];
  default:
    // Plain string content: run until an escape, closing quote, or
    // interpolation start.
    Tok = tok_string_part;
    for (; !eof();) {
      // '\' escape
      if (peekUnwrap() == '\\')
        break;
      if (peekUnwrap() == '"')
        break;
      // double-$, or \$, escapes ${.
      // We will handle escaping on Sema
      if (consumePrefix("$${"))
        continue;
      // Encountered a string interpolation, stop here
      if (peekPrefix("${"))
        break;
      consume();
    }
  }
  return finishToken();
}
404
  // NOTE(review): the extraction dropped this function's signature line
  // (`Token Lexer::lexIndString()` per the symbol index); the body below
  // is kept verbatim.
  //
  // Lex one piece inside an indented (`'' ... ''`) string: the closing
  // `''`, an `''$`/`'''`/`''\` escape, a `${` interpolation start, or a
  // plain string part.
  startToken();
  if (eof()) {
    Tok = tok_eof;
    return finishToken();
  }
  if (consumePrefix("''")) {
    if (consumePrefix("$") || consumePrefix("'")) {
      Tok = tok_string_escape;
    } else if (consumePrefix("\\")) {
      // ''\ escapes any character
      consume();
      Tok = tok_string_escape;
    } else {
      Tok = tok_quote2;
    }
    return finishToken();
  }

  if (consumePrefix("${")) {
    Tok = tok_dollar_curly;
    return finishToken();
  }

  // Plain string content: run until `''` or an interpolation start.
  Tok = tok_string_part;
  for (; !eof();) {
    if (peekPrefix("''"))
      break;
    // double-$, or \$, escapes ${.
    // We will handle escaping on Sema
    if (consumePrefix("$${"))
      continue;
    // Encountered a string interpolation, stop here
    if (peekPrefix("${"))
      break;
    consume();
  }
  return finishToken();
}
444
  // NOTE(review): the extraction dropped this function's signature line
  // (`Token Lexer::lex()` per the symbol index); the body below is kept
  // verbatim.
  //
  // Main entry point: skip trivia, then classify the next token by its
  // first character (path / URI / number / identifier-keyword / spath /
  // operator or punctuation).
  //
  // eat leading trivia
  consumeTrivia();
  startToken();

  std::optional<char> Ch = peek();

  if (!Ch) {
    Tok = tok_eof;
    return finishToken();
  }

  // Determine if this is a path, or identifier.
  // a/b (including 1/2) should be considered as a whole path, not (a / b)
  if (isPathChar(*Ch) || *Ch == '/') {
    if (consumePathStart()) {
      // Form a concrete token, this is a path part.
      Tok = tok_path_fragment;
      return finishToken();
    }
  }

  // Determine if this is a URI.
  // NOTE(review): std::isalpha/std::isdigit on a plain char is UB for
  // negative values; consider casting *Ch to unsigned char.
  if (std::isalpha(*Ch)) {
    if (consumeURI()) {
      Tok = tok_uri;
      return finishToken();
    }
  }

  if (std::isdigit(*Ch)) {
    lexNumbers();
    return finishToken();
  }

  if (std::isalpha(*Ch) || *Ch == '_') {

    // So, this is not a path/URI, it should be an identifier.
    lexIdentifier();
    Tok = tok_id;
    maybeKW();
    return finishToken();
  }

  if (*Ch == '<') {
    // Perhaps this is an "SPATH".
    // e.g. <nixpkgs>
    // <{PATH_CHAR}+(\/{PATH_CHAR}+)*>
    if (consumeSPath()) {
      Tok = tok_spath;
      return finishToken();
    }
  }

  // Operators and punctuation.
  switch (*Ch) {
  case '\'':
    if (consumePrefix("''"))
      Tok = tok_quote2;
    break;
  case '+':
    if (consumePrefix("++")) {
      Tok = tok_op_concat;
    } else {
      consume();
      Tok = tok_op_add;
    }
    break;
  case '-':
    if (consumePrefix("->")) {
      Tok = tok_op_impl;
    } else {
      consume();
      Tok = tok_op_negate;
    }
    break;
  case '*':
    consume();
    Tok = tok_op_mul;
    break;
  case '/':
    if (consumePrefix("//")) {
      Tok = tok_op_update;
    } else {
      consume();
      Tok = tok_op_div;
    }
    break;
  case '|':
    // NOTE(review): there is no `else` between these two checks — after
    // consuming "||" the second consumePrefix can also consume a
    // following "|>" (e.g. on input "|||>"), merging two operators into
    // one token. Confirm whether `else if` was intended.
    if (consumePrefix("||"))
      Tok = tok_op_or;
    if (consumePrefix("|>"))
      Tok = tok_op_pipe_into;
    break;
  case '!':
    if (consumePrefix("!=")) {
      Tok = tok_op_neq;
    } else {
      consume();
      Tok = tok_op_not;
    }
    break;
  case '<':
    if (consumePrefix("<=")) {
      Tok = tok_op_le;
    } else if (consumePrefix("<|")) {
      Tok = tok_op_pipe_from;
    } else {
      consume();
      Tok = tok_op_lt;
    }
    break;
  case '>':
    if (consumePrefix(">=")) {
      Tok = tok_op_ge;
    } else {
      consume();
      Tok = tok_op_gt;
    }
    break;
  case '&':
    if (consumePrefix("&&")) {
      Tok = tok_op_and;
      break;
    }
    break;
  case '"':
    consume();
    Tok = tok_dquote;
    break;
  case '}':
    consume();
    Tok = tok_r_curly;
    break;
  case '.':
    if (consumePrefix("...")) {
      Tok = tok_ellipsis;
      break;
    } else {
      consume();
      Tok = tok_dot;
      break;
    }
  case '@':
    consume();
    Tok = tok_at;
    break;
  case ':':
    consume();
    Tok = tok_colon;
    break;
  case '?':
    consume();
    Tok = tok_question;
    break;
  case ';':
    consume();
    Tok = tok_semi_colon;
    break;
  case '=':
    if (consumePrefix("==")) {
      Tok = tok_op_eq;
      break;
    } else {
      consume();
      Tok = tok_eq;
      break;
    }
  case '{':
    consume();
    Tok = tok_l_curly;
    break;
  case '(':
    consume();
    Tok = tok_l_paren;
    break;
  case ')':
    consume();
    Tok = tok_r_paren;
    break;
  case '[':
    consume();
    Tok = tok_l_bracket;
    break;
  case ']':
    consume();
    Tok = tok_r_bracket;
    break;
  case ',':
    consume();
    Tok = tok_comma;
    break;
  case '$':
    if (consumePrefix("${")) {
      Tok = tok_dollar_curly;
      break;
    }
    break;
  }
  // Unclassified character: consume it so the lexer always makes
  // progress (Tok presumably starts as tok_unknown via startToken() —
  // TODO confirm).
  if (Tok == tok_unknown)
    consume();
  return finishToken();
}
Diagnostic::DiagnosticKind DK
Definition Lexer.cpp:39
Lexer declaration. The lexer is a "stateful" lexer and is highly tied to the parser.
Note & note(Note::NoteKind Kind, LexerCursorRange Range)
Definition Diagnostic.h:197
Fix & fix(std::string Message)
Definition Diagnostic.h:203
Fix & edit(TextEdit Edit)
Definition Diagnostic.h:65
A point in the source file.
Definition Range.h:57
Token lex()
Definition Lexer.cpp:445
Token lexIndString()
Definition Lexer.cpp:405
Token lexPath()
Definition Lexer.cpp:327
Token lexString()
Definition Lexer.cpp:357
const LexerCursor & cur() const
Definition Lexer.h:131
NoteKind
Internal kind.
Definition Diagnostic.h:117
static TextEdit mkInsertion(LexerCursor P, std::string NewText)
Definition Diagnostic.h:35
A token, with its kind and its range in the source code.
Definition Token.h:55