nixd
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "Lexer.h"
2
3#include "nixf/Basic/Range.h"
4
5#include <cassert>
6#include <cctype>
7
8using namespace nixf;
9using namespace tok;
10
namespace {

/// \returns true if \p Ch is an alphanumeric character according to
/// std::isalnum.
///
/// The <cctype> classifiers have undefined behaviour when given a value that
/// is neither EOF nor representable as `unsigned char`. Plain `char` may be
/// signed, so every non-ASCII byte of UTF-8 input would be passed as a
/// negative number. Converting to `unsigned char` first makes the call
/// well-defined for every possible byte.
bool isAlnumChar(char Ch) {
  return std::isalnum(static_cast<unsigned char>(Ch)) != 0;
}

/// \returns true if \p Ch may appear in a URI scheme: [a-zA-Z0-9+\-.]
bool isUriSchemeChar(char Ch) {
  return isAlnumChar(Ch) || Ch == '+' || Ch == '-' || Ch == '.';
}

/// \returns true if \p Ch may appear in the part of a URI after the colon:
/// [a-zA-Z0-9%/?:@&=+$,\-_.!~*']
bool isUriPathChar(char Ch) {
  return isAlnumChar(Ch) || Ch == '%' || Ch == '/' || Ch == '?' ||
         Ch == ':' || Ch == '@' || Ch == '&' || Ch == '=' || Ch == '+' ||
         Ch == '$' || Ch == ',' || Ch == '-' || Ch == '_' || Ch == '.' ||
         Ch == '!' || Ch == '~' || Ch == '*' || Ch == '\'';
}

/// \returns true if \p Ch is a path character: [a-zA-Z0-9._\-+]
/// Note that '/' is deliberately not part of this set.
bool isPathChar(char Ch) {
  return isAlnumChar(Ch) || Ch == '.' || Ch == '_' || Ch == '-' || Ch == '+';
}

/// \returns true if \p Ch may continue an identifier: [a-zA-Z0-9_'\-]
bool isIdentifierChar(char Ch) {
  return isAlnumChar(Ch) || Ch == '_' || Ch == '\'' || Ch == '-';
}

} // namespace
38
39using DK = Diagnostic::DiagnosticKind;
41
42std::optional<LexerCursorRange> Lexer::consumePrefix(std::string_view Prefix) {
43 LexerCursor Begin = cur();
44 if (peekPrefix(Prefix)) {
45 consume(Prefix.length());
46 return LexerCursorRange{Begin, cur()};
47 }
48 return std::nullopt;
49}
50
51std::optional<LexerCursorRange> Lexer::consumeManyOf(std::string_view Chars) {
52 if (eof())
53 return std::nullopt;
54 if (Chars.find(peekUnwrap()) != std::string_view::npos) {
55 auto Start = Cur;
56 while (!eof() && Chars.find(peekUnwrap()) != std::string_view::npos) {
57 consume();
58 }
59 return LexerCursorRange{Start, Cur};
60 }
61 return std::nullopt;
62}
63
64std::optional<char> Lexer::consumeOneOf(std::string_view Chars) {
65 if (eof())
66 return std::nullopt;
67 if (Chars.find(peekUnwrap()) != std::string_view::npos) {
68 char Ret = peekUnwrap();
69 consume();
70 return Ret;
71 }
72 return std::nullopt;
73}
74
75bool Lexer::consumeOne(char C) {
76 if (eof())
77 return false;
78 if (peek() == C) {
79 consume();
80 return true;
81 }
82 return false;
83}
84
85std::optional<LexerCursorRange> Lexer::consumeManyPathChar() {
86 if (eof())
87 return std::nullopt;
88 if (auto Ch = peek(); Ch && isPathChar(*Ch)) {
89 auto Start = Cur;
90 do {
91 consume();
92 Ch = peek();
93 } while (Ch && isPathChar(*Ch));
94 return LexerCursorRange{Start, Cur};
95 }
96 return std::nullopt;
97}
98
99bool Lexer::peekPrefix(std::string_view Prefix) {
100 if (Cur.Offset + Prefix.length() > Src.length())
101 return false;
102 if (remain().starts_with(Prefix)) {
103 return true;
104 }
105 return false;
106}
107
108bool Lexer::consumeWhitespaces() {
109 if (auto Ch = peek(); Ch && !std::isspace(*Ch))
110 return false;
111 do {
112 consume();
113 } while (!eof() && std::isspace(peekUnwrap()));
114 return true;
115}
116
117bool Lexer::consumeComments() {
118 if (eof())
119 return false;
120 if (std::optional<LexerCursorRange> BeginRange = consumePrefix("/*")) {
121 // Consume block comments until we meet '*/'
122 while (true) {
123 if (eof()) {
124 // There is no '*/' to terminate comments
125 Diagnostic &Diag = Diags.emplace_back(DK::DK_UnterminatedBComment,
126 LexerCursorRange{cur()});
127 Diag.note(NK::NK_BCommentBegin, *BeginRange);
128 Diag.fix("insert */").edit(TextEdit::mkInsertion(cur(), "*/"));
129 return true;
130 }
131 if (consumePrefix("*/"))
132 // We found the ending '*/'
133 return true;
134 consume(); // Consume a character (block comment body).
135 }
136 } else if (consumePrefix("#")) {
137 // Single line comments, consume blocks until we meet EOF or '\n' or '\r'
138 while (true) {
139 if (eof() || consumeEOL()) {
140 return true;
141 }
142 consume(); // Consume a character (single line comment body).
143 }
144 }
145 return false;
146}
147
148void Lexer::consumeTrivia() {
149 while (true) {
150 if (eof())
151 return;
152 if (consumeWhitespaces() || consumeComments())
153 continue;
154 return;
155 }
156}
157
158bool Lexer::lexFloatExp() {
159 // accept ([Ee][+-]?[0-9]+)?, the exponential part (after `.` of a float)
160 if (std::optional<char> ECh = consumeOneOf("Ee")) {
161 // [+-]?
162 consumeOneOf("+-");
163 // [0-9]+
164 if (!consumeManyDigits()) {
165 // not matching [0-9]+, error
166 Diags.emplace_back(DK::DK_FloatNoExp, curRange()) << std::string(1, *ECh);
167 return false;
168 }
169 }
170
171 return true;
172}
173
174void Lexer::lexNumbers() {
175 // numbers
176 //
177 // currently libexpr accepts:
178 // INT [0-9]+
179 // FLOAT (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)?
180 //
181 // regex 'FLOAT' rejects floats like 00.0
182 //
183 // nix-repl> 000.3
184 // error: attempt to call something which is not a function but an integer
185 //
186 // at «string»:1:1:
187 //
188 // 1| 000.3
189 // | ^
190 //
191 // however, we accept [0-9]+\.[0-9]*([Ee][+-]?[0-9]+)?
192 // and issues a warning if it has leading zeros
193 // [0-9]+
194 auto Ch = consumeManyDigits();
195 assert(Ch.has_value() && "lexNumbers() must be called with a digit start");
196 if (peek() == '.') {
197 // float
198 Tok = tok_float;
199 consume();
200 // [0-9]*
201 consumeManyDigits();
202 lexFloatExp();
203 // Checking that if the float token has leading zeros.
204 std::string_view Prefix = Src.substr(Ch->lCur().Offset, 2);
205 if (Prefix.starts_with("0") && Prefix != "0.")
206 Diags.emplace_back(DK::DK_FloatLeadingZero, *Ch) << std::string(Prefix);
207 } else {
208 Tok = tok_int;
209 }
210}
211
212bool Lexer::consumePathStart() {
213 // PATH_CHAR [a-zA-Z0-9\.\_\-\+]
214 // PATH {PATH_CHAR}*(\/{PATH_CHAR}+)+\/?
215 // HOME_PATH ~\/{PATH_CHAR}+
216 // PATH_SEG {PATH_CHAR}*\/
217 //
218
219 // Path, starts with any valid path char, or a leading "~/" home path, and
220 // must contain slashs Here, we look ahead characters, the must be valid path
221 // char And also check if it contains a slash.
222 LexerCursor Saved = cur();
223
224 // Nix accepts paths under the user's home directory, e.g. ~/foo. The '~'
225 // marker is only valid as a path start when immediately followed by '/'.
226 // Keep '~foo/bar' and interior '~' segments rejected, matching libexpr.
227 if (consumeOne('~')) {
228 if (consumeOne('/')) {
229 if (auto Ch = peek(); Ch && isPathChar(*Ch))
230 return true;
231 if (peekPrefix("${"))
232 return true;
233 }
234 Cur = Saved;
235 return false;
236 }
237
238 // {PATH_CHAR}*
239 consumeManyPathChar();
240
241 // Check if there is a slash, and also a path char right after such slash.
242 // If so, it is a path_fragment
243 if (consumeOne('/')) {
244 // Now, check if we are on a normal path char.
245 if (auto Ch = peek(); Ch && isPathChar(*Ch))
246 return true;
247 // Or, look ahead to see if is a dollar curly. ${
248 // This should be parsed as path-interpolation.
249 if (peekPrefix("${"))
250 return true;
251 }
252
253 // Otherwise, it is not a path, restore cursor.
254 Cur = Saved;
255 return false;
256}
257
258bool Lexer::consumeURI() {
259 // URI
260 // [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+
261 //
262
263 LexerCursor Saved = cur();
264 // URI, starts with any valid URI scheme char, and must contain a colon
265 // Here, we look ahead characters, the must be valid path char
266 // And also check if it contains a colon.
267
268 while (!eof() && isUriSchemeChar(peekUnwrap()))
269 consume();
270
271 // Check if there is a colon, and also a URI path char right after such colon.
272 // If so, it is a uri
273 if (!eof() && peekUnwrap() == ':') {
274 consume();
275 if (!eof() && isUriPathChar(peekUnwrap())) {
276 do
277 consume();
278 while (!eof() && isUriPathChar(peekUnwrap()));
279 return true;
280 }
281 }
282
283 Cur = Saved;
284 return false;
285}
286
287bool Lexer::consumeSPath() {
288 // <{PATH_CHAR}+(\/{PATH_CHAR}+)*>
289 LexerCursor Saved = cur();
290
291 if (peek() == '<')
292 consume();
293
294 if (!eof() && isPathChar(peekUnwrap())) {
295 // {PATH_CHAR}+
296 while (!eof() && isPathChar(peekUnwrap()))
297 consume();
298 // (\/{PATH_CHAR}+)*
299 while (true) {
300 // \/
301 if (peek() == '/') {
302 consume();
303 // {PATH_CHAR}+
304 if (!eof() && isPathChar(peekUnwrap())) {
305 while (!eof() && isPathChar(peekUnwrap()))
306 consume();
307 continue;
308 }
309 }
310 break;
311 }
312 if (peek() == '>') {
313 consume();
314 return true;
315 }
316 }
317
318 Cur = Saved;
319 return false;
320}
321
322void Lexer::lexIdentifier() {
323 // identifier: [a-zA-Z_][a-zA-Z0-9_\'\-]*,
324 consume();
325 while (!eof() && isIdentifierChar(peekUnwrap()))
326 consume();
327}
328
329void Lexer::maybeKW() {
330 // For complex language this should be done on automaton or hashtable.
331 // But actually there are few keywords in nix language, so we just do
332 // comparison.
333#define TOK_KEYWORD(NAME) \
334 if (tokStr() == #NAME) { \
335 Tok = tok_kw_##NAME; \
336 return; \
337 }
338#include "TokenKinds.inc"
339#undef TOK_KEYWORD
340}
341
343 // Accept all characters, except ${, or "
344 // aaa/b//c
345 // Path
346 // PathFragment aaa/ <- lex()
347 // PathFragment b//c <- lexPath()
348 startToken();
349 Tok = tok_path_end;
350 if (eof()) {
351 return finishToken();
352 }
353
354 if (consumePrefix("${")) {
355 Tok = tok_dollar_curly;
356 return finishToken();
357 }
358
359 if (peekPrefix("~/")) {
360 Tok = tok_path_fragment;
361 consume(2);
362 while (!eof() && (isPathChar(peekUnwrap()) || peekUnwrap() == '/')) {
363 // Encountered an interpolation, stop here
364 if (peekPrefix("${"))
365 break;
366 consume();
367 }
368 return finishToken();
369 }
370
371 if (isPathChar(peekUnwrap()) || peekUnwrap() == '/') {
372 Tok = tok_path_fragment;
373 while (!eof() && (isPathChar(peekUnwrap()) || peekUnwrap() == '/')) {
374 // Encountered an interpolation, stop here
375 if (peekPrefix("${"))
376 break;
377 consume();
378 }
379 return finishToken();
380 }
381 return finishToken();
382}
383
385 // Accept all characters, except ${, or "
386 startToken();
387 if (eof()) {
388 Tok = tok_eof;
389 return finishToken();
390 }
391 switch (peekUnwrap()) {
392 case '"':
393 consume();
394 Tok = tok_dquote;
395 break;
396 case '\\':
397 // Consume two characters, for escaping
398 // NOTE: we may not want to break out Unicode wchar here, but libexpr does
399 // such ignoring
400 consume(2);
401 Tok = tok_string_escape;
402 break;
403 case '$':
404 if (consumePrefix("${")) {
405 Tok = tok_dollar_curly;
406 break;
407 }
408
409 // Otherwise, consider it is a part of string.
410 [[fallthrough]];
411 default:
412 Tok = tok_string_part;
413 for (; !eof();) {
414 // '\' escape
415 if (peekUnwrap() == '\\')
416 break;
417 if (peekUnwrap() == '"')
418 break;
419 // double-$, or \$, escapes ${.
420 // We will handle escaping on Sema
421 if (consumePrefix("$${"))
422 continue;
423 // Encountered a string interpolation, stop here
424 if (peekPrefix("${"))
425 break;
426 consume();
427 }
428 }
429 return finishToken();
430}
431
433 startToken();
434 if (eof()) {
435 Tok = tok_eof;
436 return finishToken();
437 }
438 if (consumePrefix("''")) {
439 if (consumePrefix("$") || consumePrefix("'")) {
440 Tok = tok_string_escape;
441 } else if (consumePrefix("\\")) {
442 // ''\ escapes any character
443 consume();
444 Tok = tok_string_escape;
445 } else {
446 Tok = tok_quote2;
447 }
448 return finishToken();
449 }
450
451 if (consumePrefix("${")) {
452 Tok = tok_dollar_curly;
453 return finishToken();
454 }
455
456 Tok = tok_string_part;
457 for (; !eof();) {
458 if (peekPrefix("''"))
459 break;
460 // double-$, or \$, escapes ${.
461 // We will handle escaping on Sema
462 if (consumePrefix("$${"))
463 continue;
464 // Encountered a string interpolation, stop here
465 if (peekPrefix("${"))
466 break;
467 consume();
468 }
469 return finishToken();
470}
471
473 // eat leading trivia
474 consumeTrivia();
475 startToken();
476
477 std::optional<char> Ch = peek();
478
479 if (!Ch) {
480 Tok = tok_eof;
481 return finishToken();
482 }
483
484 // Determine if this is a path, or identifier.
485 // a/b (including 1/2) should be considered as a whole path, not (a / b)
486 if (isPathChar(*Ch) || *Ch == '/' || *Ch == '~') {
487 if (consumePathStart()) {
488 // Form a concret token, this is a path part.
489 Tok = tok_path_fragment;
490 return finishToken();
491 }
492 }
493
494 // Determine if this is a URI.
495 if (std::isalpha(*Ch)) {
496 if (consumeURI()) {
497 Tok = tok_uri;
498 return finishToken();
499 }
500 }
501
502 if (std::isdigit(*Ch)) {
503 lexNumbers();
504 return finishToken();
505 }
506
507 if (std::isalpha(*Ch) || *Ch == '_') {
508
509 // So, this is not a path/URI, it should be an identifier.
510 lexIdentifier();
511 Tok = tok_id;
512 maybeKW();
513 return finishToken();
514 }
515
516 if (*Ch == '<') {
517 // Perhaps this is an "SPATH".
518 // e.g. <nixpkgs>
519 // <{PATH_CHAR}+(\/{PATH_CHAR}+)*>
520 if (consumeSPath()) {
521 Tok = tok_spath;
522 return finishToken();
523 }
524 }
525
526 switch (*Ch) {
527 case '\'':
528 if (consumePrefix("''"))
529 Tok = tok_quote2;
530 break;
531 case '+':
532 if (consumePrefix("++")) {
533 Tok = tok_op_concat;
534 } else {
535 consume();
536 Tok = tok_op_add;
537 }
538 break;
539 case '-':
540 if (consumePrefix("->")) {
541 Tok = tok_op_impl;
542 } else {
543 consume();
544 Tok = tok_op_negate;
545 }
546 break;
547 case '*':
548 consume();
549 Tok = tok_op_mul;
550 break;
551 case '/':
552 if (consumePrefix("//")) {
553 Tok = tok_op_update;
554 } else {
555 consume();
556 Tok = tok_op_div;
557 }
558 break;
559 case '|':
560 if (consumePrefix("||"))
561 Tok = tok_op_or;
562 if (consumePrefix("|>"))
563 Tok = tok_op_pipe_into;
564 break;
565 case '!':
566 if (consumePrefix("!=")) {
567 Tok = tok_op_neq;
568 } else {
569 consume();
570 Tok = tok_op_not;
571 }
572 break;
573 case '<':
574 if (consumePrefix("<=")) {
575 Tok = tok_op_le;
576 } else if (consumePrefix("<|")) {
577 Tok = tok_op_pipe_from;
578 } else {
579 consume();
580 Tok = tok_op_lt;
581 }
582 break;
583 case '>':
584 if (consumePrefix(">=")) {
585 Tok = tok_op_ge;
586 } else {
587 consume();
588 Tok = tok_op_gt;
589 }
590 break;
591 case '&':
592 if (consumePrefix("&&")) {
593 Tok = tok_op_and;
594 break;
595 }
596 break;
597 case '"':
598 consume();
599 Tok = tok_dquote;
600 break;
601 case '}':
602 consume();
603 Tok = tok_r_curly;
604 break;
605 case '.':
606 if (consumePrefix("...")) {
607 Tok = tok_ellipsis;
608 break;
609 } else {
610 consume();
611 Tok = tok_dot;
612 break;
613 }
614 case '@':
615 consume();
616 Tok = tok_at;
617 break;
618 case ':':
619 consume();
620 Tok = tok_colon;
621 break;
622 case '?':
623 consume();
624 Tok = tok_question;
625 break;
626 case ';':
627 consume();
628 Tok = tok_semi_colon;
629 break;
630 case '=':
631 if (consumePrefix("==")) {
632 Tok = tok_op_eq;
633 break;
634 } else {
635 consume();
636 Tok = tok_eq;
637 break;
638 }
639 case '{':
640 consume();
641 Tok = tok_l_curly;
642 break;
643 case '(':
644 consume();
645 Tok = tok_l_paren;
646 break;
647 case ')':
648 consume();
649 Tok = tok_r_paren;
650 break;
651 case '[':
652 consume();
653 Tok = tok_l_bracket;
654 break;
655 case ']':
656 consume();
657 Tok = tok_r_bracket;
658 break;
659 case ',':
660 consume();
661 Tok = tok_comma;
662 break;
663 case '$':
664 if (consumePrefix("${")) {
665 Tok = tok_dollar_curly;
666 break;
667 }
668 break;
669 }
670 if (Tok == tok_unknown)
671 consume();
672 return finishToken();
673}
Note::NoteKind NK
Definition Lexer.cpp:40
Diagnostic::DiagnosticKind DK
Definition Lexer.cpp:39
Lexer declaration. The lexer is a "stateful" lexer and highly tied to parser.
Note & note(Note::NoteKind Kind, LexerCursorRange Range)
Definition Diagnostic.h:197
Fix & fix(std::string Message)
Definition Diagnostic.h:203
Fix & edit(TextEdit Edit)
Definition Diagnostic.h:65
A point in the source file.
Definition Range.h:57
Token lex()
Definition Lexer.cpp:472
Token lexIndString()
Definition Lexer.cpp:432
Token lexPath()
Definition Lexer.cpp:342
Token lexString()
Definition Lexer.cpp:384
const LexerCursor & cur() const
Definition Lexer.h:131
NoteKind
Internal kind.
Definition Diagnostic.h:117
static TextEdit mkInsertion(LexerCursor P, std::string NewText)
Definition Diagnostic.h:35
A token. With it's kind, and the range in source code.
Definition Token.h:14