nixd
Loading...
Searching...
No Matches
Lexer.cpp
Go to the documentation of this file.
1#include "Lexer.h"
2
3#include "nixf/Basic/Range.h"
4
5#include <cassert>
6#include <cctype>
7
8using namespace nixf;
9using namespace tok;
10
11namespace {
12
/// Whether \p Ch may appear in a URI scheme: [a-zA-Z0-9+\-.].
bool isUriSchemeChar(char Ch) {
  // Cast to unsigned char: passing a negative plain char (e.g. a UTF-8
  // continuation byte) to std::isalnum is undefined behavior.
  return std::isalnum(static_cast<unsigned char>(Ch)) != 0 || Ch == '+' ||
         Ch == '-' || Ch == '.';
}
17
/// Whether \p Ch may appear in a URI path:
/// [a-zA-Z0-9%/?:@&=+$,\-_.!~*'].
bool isUriPathChar(char Ch) {
  // Cast to unsigned char: negative plain char values are UB for std::isalnum.
  if (std::isalnum(static_cast<unsigned char>(Ch)))
    return true;
  // Punctuation characters permitted in a URI path.
  return std::string_view{"%/?:@&=+$,-_.!~*'"}.find(Ch) !=
         std::string_view::npos;
}
25
/// Whether \p Ch may appear in a Nix path: [a-zA-Z0-9._\-+].
bool isPathChar(char Ch) {
  // isalnum == isdigit || isalpha; cast to unsigned char because passing a
  // negative plain char (e.g. a UTF-8 byte) is undefined behavior.
  return std::isalnum(static_cast<unsigned char>(Ch)) != 0 || Ch == '.' ||
         Ch == '_' || Ch == '-' || Ch == '+';
}
31
/// Whether \p Ch may appear after the first character of an identifier:
/// [a-zA-Z0-9_'\-].
bool isIdentifierChar(char Ch) {
  // isalnum == isdigit || isalpha; cast avoids UB on negative char values.
  return std::isalnum(static_cast<unsigned char>(Ch)) != 0 || Ch == '_' ||
         Ch == '\'' || Ch == '-';
}
36
37} // namespace
38
39using DK = Diagnostic::DiagnosticKind;
41
42std::optional<LexerCursorRange> Lexer::consumePrefix(std::string_view Prefix) {
43 LexerCursor Begin = cur();
44 if (peekPrefix(Prefix)) {
45 consume(Prefix.length());
46 return LexerCursorRange{Begin, cur()};
47 }
48 return std::nullopt;
49}
50
51std::optional<LexerCursorRange> Lexer::consumeManyOf(std::string_view Chars) {
52 if (eof())
53 return std::nullopt;
54 if (Chars.find(peekUnwrap()) != std::string_view::npos) {
55 auto Start = Cur;
56 while (!eof() && Chars.find(peekUnwrap()) != std::string_view::npos) {
57 consume();
58 }
59 return LexerCursorRange{Start, Cur};
60 }
61 return std::nullopt;
62}
63
64std::optional<char> Lexer::consumeOneOf(std::string_view Chars) {
65 if (eof())
66 return std::nullopt;
67 if (Chars.find(peekUnwrap()) != std::string_view::npos) {
68 char Ret = peekUnwrap();
69 consume();
70 return Ret;
71 }
72 return std::nullopt;
73}
74
75bool Lexer::consumeOne(char C) {
76 if (eof())
77 return false;
78 if (peek() == C) {
79 consume();
80 return true;
81 }
82 return false;
83}
84
85std::optional<LexerCursorRange> Lexer::consumeManyPathChar() {
86 if (eof())
87 return std::nullopt;
88 if (auto Ch = peek(); Ch && isPathChar(*Ch)) {
89 auto Start = Cur;
90 do {
91 consume();
92 Ch = peek();
93 } while (Ch && isPathChar(*Ch));
94 return LexerCursorRange{Start, Cur};
95 }
96 return std::nullopt;
97}
98
99bool Lexer::peekPrefix(std::string_view Prefix) {
100 if (Cur.Offset + Prefix.length() > Src.length())
101 return false;
102 if (remain().starts_with(Prefix)) {
103 return true;
104 }
105 return false;
106}
107
108bool Lexer::consumeWhitespaces() {
109 if (auto Ch = peek(); Ch && !std::isspace(*Ch))
110 return false;
111 do {
112 consume();
113 } while (!eof() && std::isspace(peekUnwrap()));
114 return true;
115}
116
/// Consume one comment if the cursor sits on a comment opener.
/// Handles block comments `/* ... */` and line comments starting with `#`.
/// \returns true if a comment was consumed, false otherwise.
bool Lexer::consumeComments() {
  if (eof())
    return false;
  if (std::optional<LexerCursorRange> BeginRange = consumePrefix("/*")) {
    // Consume block comments until we meet '*/'
    while (true) {
      if (eof()) {
        // There is no '*/' to terminate comments: diagnose, point at where
        // the comment began, and offer a fix inserting the terminator.
        // NOTE(review): the second argument of this emplace_back (original
        // line 126 — presumably the diagnostic's source range) was lost in
        // extraction; restore it from the upstream source before compiling.
        Diagnostic &Diag = Diags.emplace_back(DK::DK_UnterminatedBComment,
        Diag.note(NK::NK_BCommentBegin, *BeginRange);
        Diag.fix("insert */").edit(TextEdit::mkInsertion(cur(), "*/"));
        return true;
      }
      if (consumePrefix("*/"))
        // We found the ending '*/'
        return true;
      consume(); // Consume a character (block comment body).
    }
  } else if (consumePrefix("#")) {
    // Single line comments, consume blocks until we meet EOF or '\n' or '\r'
    while (true) {
      if (eof() || consumeEOL()) {
        return true;
      }
      consume(); // Consume a character (single line comment body).
    }
  }
  return false;
}
147
148void Lexer::consumeTrivia() {
149 while (true) {
150 if (eof())
151 return;
152 if (consumeWhitespaces() || consumeComments())
153 continue;
154 return;
155 }
156}
157
158bool Lexer::lexFloatExp() {
159 // accept ([Ee][+-]?[0-9]+)?, the exponential part (after `.` of a float)
160 if (std::optional<char> ECh = consumeOneOf("Ee")) {
161 // [+-]?
162 consumeOneOf("+-");
163 // [0-9]+
164 if (!consumeManyDigits()) {
165 // not matching [0-9]+, error
166 Diags.emplace_back(DK::DK_FloatNoExp, curRange()) << std::string(1, *ECh);
167 return false;
168 }
169 }
170
171 return true;
172}
173
174void Lexer::lexNumbers() {
175 // numbers
176 //
177 // currently libexpr accepts:
178 // INT [0-9]+
179 // FLOAT (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)?
180 //
181 // regex 'FLOAT' rejects floats like 00.0
182 //
183 // nix-repl> 000.3
184 // error: attempt to call something which is not a function but an integer
185 //
186 // at «string»:1:1:
187 //
188 // 1| 000.3
189 // | ^
190 //
191 // however, we accept [0-9]+\.[0-9]*([Ee][+-]?[0-9]+)?
192 // and issues a warning if it has leading zeros
193 // [0-9]+
194 auto Ch = consumeManyDigits();
195 assert(Ch.has_value() && "lexNumbers() must be called with a digit start");
196 if (peek() == '.') {
197 // float
198 Tok = tok_float;
199 consume();
200 // [0-9]*
201 consumeManyDigits();
202 lexFloatExp();
203 // Checking that if the float token has leading zeros.
204 std::string_view Prefix = Src.substr(Ch->lCur().Offset, 2);
205 if (Prefix.starts_with("0") && Prefix != "0.")
206 Diags.emplace_back(DK::DK_FloatLeadingZero, *Ch) << std::string(Prefix);
207 } else {
208 Tok = tok_int;
209 }
210}
211
212bool Lexer::consumePathStart() {
213 // PATH_CHAR [a-zA-Z0-9\.\_\-\+]
214 // PATH {PATH_CHAR}*(\/{PATH_CHAR}+)+\/?
215 // PATH_SEG {PATH_CHAR}*\/
216 //
217
218 // Path, starts with any valid path char, and must contain slashs
219 // Here, we look ahead characters, the must be valid path char
220 // And also check if it contains a slash.
221 LexerCursor Saved = cur();
222
223 // {PATH_CHAR}*
224 consumeManyPathChar();
225
226 // Check if there is a slash, and also a path char right after such slash.
227 // If so, it is a path_fragment
228 if (consumeOne('/')) {
229 // Now, check if we are on a normal path char.
230 if (auto Ch = peek(); Ch && isPathChar(*Ch))
231 return true;
232 // Or, look ahead to see if is a dollar curly. ${
233 // This should be parsed as path-interpolation.
234 if (peekPrefix("${"))
235 return true;
236 }
237
238 // Otherwise, it is not a path, restore cursor.
239 Cur = Saved;
240 return false;
241}
242
243bool Lexer::consumeURI() {
244 // URI
245 // [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+
246 //
247
248 LexerCursor Saved = cur();
249 // URI, starts with any valid URI scheme char, and must contain a colon
250 // Here, we look ahead characters, the must be valid path char
251 // And also check if it contains a colon.
252
253 while (!eof() && isUriSchemeChar(peekUnwrap()))
254 consume();
255
256 // Check if there is a colon, and also a URI path char right after such colon.
257 // If so, it is a uri
258 if (!eof() && peekUnwrap() == ':') {
259 consume();
260 if (!eof() && isUriPathChar(peekUnwrap())) {
261 do
262 consume();
263 while (!eof() && isUriPathChar(peekUnwrap()));
264 return true;
265 }
266 }
267
268 Cur = Saved;
269 return false;
270}
271
272bool Lexer::consumeSPath() {
273 // <{PATH_CHAR}+(\/{PATH_CHAR}+)*>
274 LexerCursor Saved = cur();
275
276 if (peek() == '<')
277 consume();
278
279 if (!eof() && isPathChar(peekUnwrap())) {
280 // {PATH_CHAR}+
281 while (!eof() && isPathChar(peekUnwrap()))
282 consume();
283 // (\/{PATH_CHAR}+)*
284 while (true) {
285 // \/
286 if (peek() == '/') {
287 consume();
288 // {PATH_CHAR}+
289 if (!eof() && isPathChar(peekUnwrap())) {
290 while (!eof() && isPathChar(peekUnwrap()))
291 consume();
292 continue;
293 }
294 }
295 break;
296 }
297 if (peek() == '>') {
298 consume();
299 return true;
300 }
301 }
302
303 Cur = Saved;
304 return false;
305}
306
307void Lexer::lexIdentifier() {
308 // identifier: [a-zA-Z_][a-zA-Z0-9_\'\-]*,
309 consume();
310 while (!eof() && isIdentifierChar(peekUnwrap()))
311 consume();
312}
313
/// If the just-lexed identifier text matches a Nix keyword, retag Tok as the
/// corresponding tok_kw_* kind; otherwise leave Tok unchanged.
void Lexer::maybeKW() {
  // For complex language this should be done on automaton or hashtable.
  // But actually there are few keywords in nix language, so we just do
  // comparison.
#define TOK_KEYWORD(NAME)                                                      \
  if (tokStr() == #NAME) {                                                     \
    Tok = tok_kw_##NAME;                                                       \
    return;                                                                    \
  }
  // NOTE(review): original line 323 (between the macro definition and the
  // #undef) is missing from this extraction — presumably an #include of the
  // keyword X-macro list that expands TOK_KEYWORD once per keyword; restore
  // it from the upstream source.
#undef TOK_KEYWORD
}
326
// NOTE(review): the function signature (original line 327) is missing from
// this extraction; per the index at the end of this file it is
// `Token Lexer::lexPath()`. Restore it from the upstream source.
//
// Lexes the continuation of a path: emits tok_path_fragment for runs of
// path chars and slashes, tok_dollar_curly for "${", and tok_path_end when
// neither follows.
  // Accept all characters, except ${, or "
  // aaa/b//c
  // Path
  //    PathFragment aaa/  <- lex()
  //    PathFragment b//c  <- lexPath()
  startToken();
  Tok = tok_path_end;
  if (eof()) {
    return finishToken();
  }

  if (consumePrefix("${")) {
    Tok = tok_dollar_curly;
    return finishToken();
  }

  if (isPathChar(peekUnwrap()) || peekUnwrap() == '/') {
    Tok = tok_path_fragment;
    while (!eof() && (isPathChar(peekUnwrap()) || peekUnwrap() == '/')) {
      // Encountered an interpolation, stop here
      if (peekPrefix("${"))
        break;
      consume();
    }
    return finishToken();
  }
  return finishToken();
}
356
// NOTE(review): the function signature (original line 357) is missing from
// this extraction; per the index at the end of this file it is
// `Token Lexer::lexString()`. Restore it from the upstream source.
//
// Lexes inside a double-quoted string: emits the closing tok_dquote,
// tok_string_escape for backslash escapes, tok_dollar_curly for "${", and
// tok_string_part for plain string content up to the next special sequence.
  // Accept all characters, except ${, or "
  startToken();
  if (eof()) {
    Tok = tok_eof;
    return finishToken();
  }
  switch (peekUnwrap()) {
  case '"':
    consume();
    Tok = tok_dquote;
    break;
  case '\\':
    // Consume two characters, for escaping
    // NOTE: we may not want to break out Unicode wchar here, but libexpr does
    // such ignoring
    consume(2);
    Tok = tok_string_escape;
    break;
  case '$':
    if (consumePrefix("${")) {
      Tok = tok_dollar_curly;
      break;
    }

    // Otherwise, consider it is a part of string.
    [[fallthrough]];
  default:
    Tok = tok_string_part;
    for (; !eof();) {
      // '\' escape
      if (peekUnwrap() == '\\')
        break;
      if (peekUnwrap() == '"')
        break;
      // double-$, or \$, escapes ${.
      // We will handle escaping on Sema
      if (consumePrefix("$${"))
        continue;
      // Encountered a string interpolation, stop here
      if (peekPrefix("${"))
        break;
      consume();
    }
  }
  return finishToken();
}
404
// NOTE(review): the function signature (original line 405) is missing from
// this extraction; per the index at the end of this file it is
// `Token Lexer::lexIndString()`. Restore it from the upstream source.
//
// Lexes inside an indented ('' ... '') string: emits tok_quote2 for the
// closing quotes, tok_string_escape for ''$ / ''\ / ''' escapes,
// tok_dollar_curly for "${", and tok_string_part for plain content.
  startToken();
  if (eof()) {
    Tok = tok_eof;
    return finishToken();
  }
  if (consumePrefix("''")) {
    Tok = tok_quote2;
    // ''$, ''\, and ''' are escape sequences, not a string terminator.
    if (consumePrefix("$") || consumePrefix("\\") || consumePrefix("'"))
      Tok = tok_string_escape;
    return finishToken();
  }

  if (consumePrefix("${")) {
    Tok = tok_dollar_curly;
    return finishToken();
  }

  Tok = tok_string_part;
  for (; !eof();) {
    if (peekPrefix("''"))
      break;
    // double-$, or \$, escapes ${.
    // We will handle escaping on Sema
    if (consumePrefix("$${"))
      continue;
    // Encountered a string interpolation, stop here
    if (peekPrefix("${"))
      break;
    consume();
  }
  return finishToken();
}
438
// NOTE(review): the function signature (original line 439) is missing from
// this extraction; per the index at the end of this file it is
// `Token Lexer::lex()`. Restore it from the upstream source.
//
// The main entry point: skips leading trivia, then lexes exactly one token,
// trying the maximal-munch candidates in order: path start, URI, number,
// identifier/keyword, search path, then operators and punctuation.
//
// NOTE(review): *Ch is passed directly to std::isalpha/std::isdigit below;
// negative plain-char values (UTF-8 bytes) make that UB — consider casting
// to unsigned char.
  // eat leading trivia
  consumeTrivia();
  startToken();

  std::optional<char> Ch = peek();

  if (!Ch) {
    Tok = tok_eof;
    return finishToken();
  }

  // Determine if this is a path, or identifier.
  // a/b (including 1/2) should be considered as a whole path, not (a / b)
  if (isPathChar(*Ch) || *Ch == '/') {
    if (consumePathStart()) {
      // Form a concrete token, this is a path part.
      Tok = tok_path_fragment;
      return finishToken();
    }
  }

  // Determine if this is a URI.
  if (std::isalpha(*Ch)) {
    if (consumeURI()) {
      Tok = tok_uri;
      return finishToken();
    }
  }

  if (std::isdigit(*Ch)) {
    lexNumbers();
    return finishToken();
  }

  if (std::isalpha(*Ch) || *Ch == '_') {

    // So, this is not a path/URI, it should be an identifier.
    lexIdentifier();
    Tok = tok_id;
    maybeKW();
    return finishToken();
  }

  if (*Ch == '<') {
    // Perhaps this is an "SPATH".
    // e.g. <nixpkgs>
    // <{PATH_CHAR}+(\/{PATH_CHAR}+)*>
    if (consumeSPath()) {
      Tok = tok_spath;
      return finishToken();
    }
  }

  // Operators and punctuation (one- and two-character tokens).
  switch (*Ch) {
  case '\'':
    if (consumePrefix("''"))
      Tok = tok_quote2;
    break;
  case '+':
    if (consumePrefix("++")) {
      Tok = tok_op_concat;
    } else {
      consume();
      Tok = tok_op_add;
    }
    break;
  case '-':
    if (consumePrefix("->")) {
      Tok = tok_op_impl;
    } else {
      consume();
      Tok = tok_op_negate;
    }
    break;
  case '*':
    consume();
    Tok = tok_op_mul;
    break;
  case '/':
    if (consumePrefix("//")) {
      Tok = tok_op_update;
    } else {
      consume();
      Tok = tok_op_div;
    }
    break;
  case '|':
    // NOTE(review): these two checks are not chained with `else`; after
    // consuming "||" the lexer still probes for "|>", so "|||>" would lex as
    // a single pipe token — confirm against upstream whether an `else if`
    // was intended.
    if (consumePrefix("||"))
      Tok = tok_op_or;
    if (consumePrefix("|>"))
      Tok = tok_op_pipe_into;
    break;
  case '!':
    if (consumePrefix("!=")) {
      Tok = tok_op_neq;
    } else {
      consume();
      Tok = tok_op_not;
    }
    break;
  case '<':
    if (consumePrefix("<=")) {
      Tok = tok_op_le;
    } else if (consumePrefix("<|")) {
      Tok = tok_op_pipe_from;
    } else {
      consume();
      Tok = tok_op_lt;
    }
    break;
  case '>':
    if (consumePrefix(">=")) {
      Tok = tok_op_ge;
    } else {
      consume();
      Tok = tok_op_gt;
    }
    break;
  case '&':
    if (consumePrefix("&&")) {
      Tok = tok_op_and;
      break;
    }
    break;
  case '"':
    consume();
    Tok = tok_dquote;
    break;
  case '}':
    consume();
    Tok = tok_r_curly;
    break;
  case '.':
    if (consumePrefix("...")) {
      Tok = tok_ellipsis;
      break;
    } else {
      consume();
      Tok = tok_dot;
      break;
    }
  case '@':
    consume();
    Tok = tok_at;
    break;
  case ':':
    consume();
    Tok = tok_colon;
    break;
  case '?':
    consume();
    Tok = tok_question;
    break;
  case ';':
    consume();
    Tok = tok_semi_colon;
    break;
  case '=':
    if (consumePrefix("==")) {
      Tok = tok_op_eq;
      break;
    } else {
      consume();
      Tok = tok_eq;
      break;
    }
  case '{':
    consume();
    Tok = tok_l_curly;
    break;
  case '(':
    consume();
    Tok = tok_l_paren;
    break;
  case ')':
    consume();
    Tok = tok_r_paren;
    break;
  case '[':
    consume();
    Tok = tok_l_bracket;
    break;
  case ']':
    consume();
    Tok = tok_r_bracket;
    break;
  case ',':
    consume();
    Tok = tok_comma;
    break;
  case '$':
    if (consumePrefix("${")) {
      Tok = tok_dollar_curly;
      break;
    }
    break;
  }
  if (Tok == tok_unknown)
    consume(); // Always make progress, even on an unrecognized character.
  return finishToken();
}
Diagnostic::DiagnosticKind DK
Definition Lexer.cpp:39
Lexer declaration. The lexer is a "stateful" lexer and highly tied to parser.
Note & note(Note::NoteKind Kind, LexerCursorRange Range)
Definition Diagnostic.h:197
Fix & fix(std::string Message)
Definition Diagnostic.h:203
Fix & edit(TextEdit Edit)
Definition Diagnostic.h:65
A point in the source file.
Definition Range.h:57
Token lex()
Definition Lexer.cpp:439
Token lexIndString()
Definition Lexer.cpp:405
Token lexPath()
Definition Lexer.cpp:327
Token lexString()
Definition Lexer.cpp:357
const LexerCursor & cur() const
Definition Lexer.h:131
NoteKind
Internal kind.
Definition Diagnostic.h:117
static TextEdit mkInsertion(LexerCursor P, std::string NewText)
Definition Diagnostic.h:35
A token. With it's kind, and the range in source code.
Definition Token.h:55