/**
  * DJinja lexer
  *
  * Copyright:
  *     Copyright (c) 2018, Maxim Tyapkin.
  * Authors:
  *     Maxim Tyapkin
  * License:
  *     This software is licensed under the terms of the BSD 3-clause license.
  *     The full terms of the license can be found in the LICENSE.md file.
  */

module djinja.lexer;


private
{
    import djinja.exception : JinjaException;

    import std.conv : to;
    import std.traits : EnumMembers;
    import std.utf;
    import std.range;
}


/// Kind of a token produced by `Lexer.nextToken`.
enum Type
{
    Unknown,
    Raw,
    Keyword,
    Operator,

    StmtBegin,
    StmtEnd,
    ExprBegin,
    ExprEnd,
    CmntBegin,
    CmntEnd,
    CmntInline,

    Ident,
    Integer,
    Float,
    Boolean,
    String,

    LParen,
    RParen,
    LSParen,
    RSParen,
    LBrace,
    RBrace,

    Dot,
    Comma,
    Colon,

    EOL,
    EOF,
}


/// Reserved words of the template language; the enum value is the spelling.
enum Keyword : string
{
    Unknown = "",
    For = "for",
    Recursive = "recursive",
    EndFor = "endfor",
    If = "if",
    ElIf = "elif",
    Else = "else",
    EndIf = "endif",
    Block = "block",
    EndBlock = "endblock",
    Extends = "extends",
    Macro = "macro",
    EndMacro = "endmacro",
    Return = "return",
    Call = "call",
    EndCall = "endcall",
    Filter = "filter",
    EndFilter = "endfilter",
    With = "with",
    EndWith = "endwith",
    Set = "set",
    EndSet = "endset",
    Ignore = "ignore",
    Missing = "missing",
    Import = "import",
    From = "from",
    As = "as",
    Without = "without",
    Context = "context",
    Include = "include",
}


/// Returns: `true` if `kw` is one of the keywords listed below
/// (if, set, for, block, extends, macro, call, filter, with, include,
/// import, from).
bool isBeginingKeyword(Keyword kw)
{
    import std.algorithm : among;

    // `among` yields a 1-based index (0 when absent); only membership matters.
    return kw.among(
            Keyword.If,
            Keyword.Set,
            Keyword.For,
            Keyword.Block,
            Keyword.Extends,
            Keyword.Macro,
            Keyword.Call,
            Keyword.Filter,
            Keyword.With,
            Keyword.Include,
            Keyword.Import,
            Keyword.From,
        ) != 0;
}


/// Maps a spelling to its `Keyword`.
/// Returns: the matching member, or `Keyword.Unknown` when `key` is not a keyword.
Keyword toKeyword(string key)
{
    switch (key) with (Keyword)
    {
        // One `case` per enum member, generated at compile time.
        static foreach(member; EnumMembers!Keyword)
        {
            case member:
                return member;
        }
        default :
            return Unknown;
    }
}


/// Returns: `true` if `key` spells a keyword (the empty string does not).
bool isKeyword(string key)
{
    return key.toKeyword != Keyword.Unknown;
}


/// Returns: `true` for the boolean literals `true`, `false`, `True`, `False`.
bool isBoolean(string key)
{
    return key == "true" || key == "false" ||
           key == "True" || key == "False";
}


/// Operators of the template language; the enum value is the spelling.
enum Operator : string
{
    // The first in order is the first in priority.
    // The lexer also tries them in this order, so a longer spelling must
    // come before any operator that is its prefix ("==" before "=").

    Eq = "==",
    NotEq = "!=",
    LessEq = "<=",
    GreaterEq = ">=",
    Less = "<",
    Greater = ">",

    And = "and",
    Or = "or",
    Not = "not",

    In = "in",
    Is = "is",

    Assign = "=",
    Filter = "|",
    Concat = "~",

    Plus = "+",
    Minus = "-",

    DivInt = "//",
    DivFloat = "/",
    Rem = "%",
    Pow = "**",
    Mul = "*",
}


/// Maps a spelling to its `Operator`.
/// Returns: the matching member, or an `Operator` holding the empty string
/// when `key` is not an operator.
Operator toOperator(string key)
{
    switch (key) with (Operator)
    {
        static foreach(member; EnumMembers!Operator)
        {
            case member:
                return member;
        }
        default :
            return cast(Operator)"";
    }
}


/// Returns: `true` if `key` spells any member of `Operator`.
bool isOperator(string key)
{
    switch (key) with (Operator)
    {
        // All generated labels are empty and share the single `return true`.
        static foreach(member; EnumMembers!Operator)
        {
            case member:
        }
            return true;
        default :
            return false;
    }
}


/// Returns: `true` if `op` is one of `==`, `!=`, `<=`, `>=`, `<`, `>`.
bool isCmpOperator(Operator op)
{
    import std.algorithm : among;

    return op.among(
            Operator.Eq,
            Operator.NotEq,
            Operator.LessEq,
            Operator.GreaterEq,
            Operator.Less,
            Operator.Greater
        ) != 0;
}


/// Compile-time predicate: `true` if the spelling of `op` contains at least
/// one alphanumeric character (e.g. `and`, `in`). Such operators are lexed
/// through the identifier path rather than by prefix matching.
bool isIdentOperator(Operator op)()
{
    import std.algorithm : filter;
    import std.uni : isAlphaNum;

    enum bool hasWordChar = !(cast(string)op).filter!isAlphaNum.empty;
    return hasWordChar;
}


/// Location of a token inside a template source.
struct Position
{
    string filename;
    ulong line, column;

    /// Formats the position as `filename(line,column)`.
    string toString() const
    {
        return filename ~ "(" ~ line.to!string ~ "," ~ column.to!string ~ ")";
    }
}


/// A lexical token: its kind, its text and where it started.
struct Token
{
    /// Ready-made end-of-file token with an empty position.
    enum EOF = Token(Type.EOF, Position("", 0, 0));

    Type type;
    string value;
    Position pos;

    /// Creates a token without text.
    this (Type t, Position p)
    {
        type = t;
        pos = p;
    }

    /// Creates a token with text `v`.
    this(Type t, string v, Position p)
    {
        type = t;
        value = v;
        pos = p;
    }

    /// `token == Type.X`: compares the token kind only.
    bool opEquals(Type type) const
    {
        return this.type == type;
    }

    /// `token == Keyword.X`: true for a keyword token with that spelling.
    bool opEquals(Keyword kw) const
    {
        return this.type == Type.Keyword && value == kw;
    }

    /// `token == Operator.X`: true for an operator token with that spelling.
    bool opEquals(Operator op) const
    {
        return this.type == Type.Operator && value == op;
    }
}


/**
  * Template lexer, parameterised by the delimiter spellings.
  *
  * The lexer starts in "raw" mode, where text is collected verbatim until one
  * of the begin/inline delimiters is found, and switches back to raw mode
  * after an expression/statement/comment end delimiter or after an inline
  * statement or inline comment.
  *
  * Params:
  *     exprOpBegin  = expression begin delimiter
  *     exprOpEnd    = expression end delimiter
  *     stmtOpBegin  = statement begin delimiter
  *     stmtOpEnd    = statement end delimiter
  *     cmntOpBegin  = comment begin delimiter
  *     cmntOpEnd    = comment end delimiter
  *     stmtOpInline = inline (line) statement prefix
  *     cmntOpInline = inline (line) comment prefix
  */
struct Lexer(
        string exprOpBegin, string exprOpEnd,
        string stmtOpBegin, string stmtOpEnd,
        string cmntOpBegin, string cmntOpEnd,
        string stmtOpInline, string cmntOpInline)
{
    static assert(exprOpBegin.length, "Expression begin operator can't be empty");
    static assert(exprOpEnd.length, "Expression end operator can't be empty");

    static assert(stmtOpBegin.length, "Statement begin operator can't be empty");
    static assert(stmtOpEnd.length, "Statement end operator can't be empty");

    static assert(cmntOpBegin.length, "Comment begin operator can't be empty");
    static assert(cmntOpEnd.length, "Comment end operator can't be empty");

    static assert(stmtOpInline.length, "Statement inline operator can't be empty");
    static assert(cmntOpInline.length, "Comment inline operator can't be empty");

    // TODO: check that the delimiters are unique


    enum stmtInline = stmtOpInline;

    /// Value returned by `front`/`next`/`pop` when no input is left.
    /// NOTE: 255 is also the code point U+00FF, so end of input is always
    /// detected by testing the buffer itself, never by comparing a character
    /// with this constant. The constant is kept for compatibility.
    enum EOF = 255;

    private
    {
        Position _beginPos;
        bool _isReadingRaw; // State of reading raw data
        bool _isInlineStmt; // State of reading inline statement
        string _str;        // Not yet consumed input
        string _filename;
        ulong _line, _column;
    }

    /// Creates a lexer over `str`; `filename` is only used in positions.
    this(string str, string filename = "")
    {
        _str = str;
        _isReadingRaw = true;
        _isInlineStmt = false;
        _filename = filename;
        _line = 1;
        _column = 1;
    }

    /// Consumes and returns the next token.
    /// Returns: a token of type `Type.EOF` once the input is exhausted.
    Token nextToken()
    {
        _beginPos = position();

        // Try to read raw data
        if (_isReadingRaw)
        {
            auto raw = skipRaw();
            _isReadingRaw = false;
            if (raw.length)
                return Token(Type.Raw, raw, _beginPos);
        }

        skipWhitespaces();
        _beginPos = position();

        // Check inline statement end: a newline (consumed here) or the start
        // of an inline comment (left in place for the next call).
        if (_isInlineStmt &&
                (tryToSkipNewLine() || cmntOpInline == sliceOp!cmntOpInline))
        {
            _isInlineStmt = false;
            _isReadingRaw = true;
            return Token(Type.StmtEnd, "\n", _beginPos);
        }

        // Allow multiline inline statements with '\'
        while (_isInlineStmt && front == '\\')
        {
            pop();
            if (!tryToSkipNewLine())
                return Token(Type.Unknown, "\\", _beginPos);

            skipWhitespaces();
            _beginPos = position();
        }

        // Check begin operators
        if (exprOpBegin == sliceOp!exprOpBegin)
        {
            skipOp!exprOpBegin;
            return Token(Type.ExprBegin, exprOpBegin, _beginPos);
        }
        if (stmtOpBegin == sliceOp!stmtOpBegin)
        {
            skipOp!stmtOpBegin;
            return Token(Type.StmtBegin, stmtOpBegin, _beginPos);
        }
        if (cmntOpBegin == sliceOp!cmntOpBegin)
        {
            skipOp!cmntOpBegin;
            skipComment();
            return Token(Type.CmntBegin, cmntOpBegin, _beginPos);
        }

        // Check end operators
        if (exprOpEnd == sliceOp!exprOpEnd)
        {
            _isReadingRaw = true;
            skipOp!exprOpEnd;
            return Token(Type.ExprEnd, exprOpEnd, _beginPos);
        }
        if (stmtOpEnd == sliceOp!stmtOpEnd)
        {
            _isReadingRaw = true;
            skipOp!stmtOpEnd;
            return Token(Type.StmtEnd, stmtOpEnd, _beginPos);
        }
        if (cmntOpEnd == sliceOp!cmntOpEnd)
        {
            _isReadingRaw = true;
            skipOp!cmntOpEnd;
            return Token(Type.CmntEnd, cmntOpEnd, _beginPos);
        }

        // Check begin inline operators
        if (cmntOpInline == sliceOp!cmntOpInline)
        {
            skipInlineComment();
            _isReadingRaw = true;
            return Token(Type.CmntInline, cmntOpInline, _beginPos);
        }
        if (stmtOpInline == sliceOp!stmtOpInline)
        {
            skipOp!stmtOpInline;
            _isInlineStmt = true;
            return Token(Type.StmtBegin, stmtOpInline, _beginPos);
        }

        // Trying to read non-ident operators (in declaration order)
        static foreach(op; EnumMembers!Operator)
        {
            static if (!isIdentOperator!op)
            {
                if (cast(string)op == sliceOp!op)
                {
                    skipOp!op;
                    return Token(Type.Operator, op, _beginPos);
                }
            }
        }

        // End of file. Tested on the buffer so that a literal U+00FF
        // character is not mistaken for the end of input.
        if (_str.empty)
            return Token(Type.EOF, _beginPos);

        // Check the remaining possibilities
        switch (front)
        {
            // Identifier or keyword
            case 'a': .. case 'z':
            case 'A': .. case 'Z':
            case '_':
                auto ident = popIdent();
                if (ident.toKeyword != Keyword.Unknown)
                    return Token(Type.Keyword, ident, _beginPos);
                else if (ident.isBoolean)
                    return Token(Type.Boolean, ident, _beginPos);
                else if (ident.isOperator)
                    return Token(Type.Operator, ident, _beginPos);
                else
                    return Token(Type.Ident, ident, _beginPos);

            // Integer or float
            case '0': .. case '9':
                return popNumber();

            // String
            case '"':
            case '\'':
                return Token(Type.String, popString(), _beginPos);

            case '(': return Token(Type.LParen, popChar, _beginPos);
            case ')': return Token(Type.RParen, popChar, _beginPos);
            case '[': return Token(Type.LSParen, popChar, _beginPos);
            case ']': return Token(Type.RSParen, popChar, _beginPos);
            case '{': return Token(Type.LBrace, popChar, _beginPos);
            case '}': return Token(Type.RBrace, popChar, _beginPos);
            case '.': return Token(Type.Dot, popChar, _beginPos);
            case ',': return Token(Type.Comma, popChar, _beginPos);
            case ':': return Token(Type.Colon, popChar, _beginPos);

            default:
                return Token(Type.Unknown, popChar, _beginPos);
        }
    }


private:


    /// Current character, or `EOF` when the input is exhausted.
    dchar front()
    {
        if (_str.empty)
            return EOF;
        return _str.front;
    }


    /// Character after the current one, or `EOF` if there is none.
    dchar next()
    {
        // Work on a copy of the slice; the lexer state is not touched.
        auto rest = _str;
        if (rest.empty)
            return EOF;
        rest.popFront();
        if (rest.empty)
            return EOF;
        return rest.front;
    }


    /// Consumes one character and updates line/column.
    /// Returns: the consumed character, or `EOF` when nothing is left.
    dchar pop()
    {
        if (_str.empty)
            return EOF;

        auto ch = _str.front;

        // "\r\n" counts as one line break: the '\r' only advances the
        // column, the following '\n' starts the new line.
        if (ch.isNewLine && !(ch == '\r' && next == '\n'))
        {
            _line++;
            _column = 1;
        }
        else
            _column++;

        _str.popFront();
        return ch;
    }


    /// Consumes one character and returns it as a string.
    string popChar()
    {
        return pop.to!string;
    }


    /// Returns the leading part of the input that has the same size as `op`
    /// (or the whole remaining input if it is shorter), for comparison
    /// against `op`.
    string sliceOp(string op)()
    {
        // `_str` is UTF-8, so the slice bound is the size of `op` in code
        // units, not its number of characters.
        enum size = op.length;

        if (size >= _str.length)
            return _str;
        else
            return _str[0 .. size];
    }


    /// Drops `op` from the head of the input. The caller must have checked
    /// with `sliceOp` that the input really starts with `op`.
    void skipOp(string op)()
    {
        enum size = op.length;       // code units to drop from the buffer
        enum width = op.walkLength;  // characters, for the column counter

        if (size >= _str.length)
            _str = "";
        else
            _str = _str[size .. $];
        _column += width;
    }


    /// Position of the next unread character.
    Position position()
    {
        return Position(_filename, _line, _column);
    }


    /// Skips blanks and, outside inline statements, line breaks as well.
    void skipWhitespaces()
    {
        while (true)
        {
            if (front.isWhiteSpace)
            {
                pop();
                continue;
            }

            if (isFrontNewLine)
            {
                // Return for handling NL as StmtEnd
                if (_isInlineStmt)
                    return;
                tryToSkipNewLine();
                continue;
            }

            return;
        }
    }


    /// Consumes and returns the longest run of `[A-Za-z0-9_]`.
    string popIdent()
    {
        auto start = _str;
        while (true)
        {
            switch (front)
            {
                case 'a': .. case 'z':
                case 'A': .. case 'Z':
                case '0': .. case '9':
                case '_':
                    pop();
                    break;
                default:
                    // Everything consumed since `start` is the identifier.
                    return start[0 .. start.length - _str.length];
            }
        }
    }


    /// Consumes a numeric literal.
    /// Digits are accumulated; the first '.' is kept and turns the token into
    /// a `Type.Float`, a second '.' ends the token; '_' is consumed but not
    /// stored in the token text.
    Token popNumber()
    {
        auto type = Type.Integer;
        string number = "";

        while (true)
        {
            switch (front)
            {
                case '0': .. case '9':
                    number ~= pop();
                    break;
                case '.':
                    if (type != Type.Integer)
                        return Token(type, number, _beginPos);
                    type = Type.Float;
                    number ~= pop();
                    break;
                case '_':
                    pop();
                    break;
                default:
                    return Token(type, number, _beginPos);
            }
        }
    }


    /// Consumes a quoted string and returns its contents without the quotes.
    /// `\n`, `\r` and `\t` are translated; any other escaped character stands
    /// for itself. An unterminated string ends at the end of input.
    string popString()
    {
        auto quote = pop();
        string str = "";

        while (!_str.empty)
        {
            if (front == '\\')
            {
                pop();
                if (!_str.empty)
                {
                    auto escaped = pop();
                    switch (escaped)
                    {
                        case 'n': str ~= '\n'; break;
                        case 'r': str ~= '\r'; break;
                        case 't': str ~= '\t'; break;
                        default : str ~= escaped; break;
                    }
                }
                continue;
            }

            if (front == quote)
            {
                pop();
                return str;
            }

            str ~= pop();
        }

        return str;
    }


    /// Consumes raw template text up to (not including) the next begin or
    /// inline delimiter, or up to the end of input.
    string skipRaw()
    {
        auto start = _str;

        while (!_str.empty)
        {
            if (exprOpBegin == sliceOp!exprOpBegin
                    || stmtOpBegin == sliceOp!stmtOpBegin
                    || cmntOpBegin == sliceOp!cmntOpBegin
                    || stmtOpInline == sliceOp!stmtOpInline
                    || cmntOpInline == sliceOp!cmntOpInline)
                break;

            pop();
        }

        return start[0 .. start.length - _str.length];
    }


    /// Skips input up to (not including) the comment end delimiter.
    void skipComment()
    {
        while (!_str.empty)
        {
            if (cmntOpEnd == sliceOp!cmntOpEnd)
                return;
            pop();
        }
    }


    /// Skips the rest of the current line, including its line terminator.
    void skipInlineComment()
    {
        while (!_str.empty)
        {
            if (tryToSkipNewLine())
                return;
            pop();
        }
    }


    /// true if the current character is a line terminator
    bool isFrontNewLine()
    {
        return front.isNewLine;
    }

    /// true if NL was skipped ("\r\n" is skipped as a single line break)
    bool tryToSkipNewLine()
    {
        switch (front)
        {
            case '\r':
                pop();
                if (front == '\n')
                    pop();
                return true;

            case '\n':
            case 0x2028:
            case 0x2029:
                pop();
                return true;

            default:
                return false;
        }
    }
}


/// Returns: `true` for space, tab, U+00A0, U+2002..U+200B, U+202F, U+205F
/// and U+3000. Line terminators are not included.
bool isWhiteSpace(dchar ch)
{
    return ch == ' ' || ch == '\t' || ch == 0x205F || ch == 0x202F || ch == 0x3000
           || ch == 0x00A0 || (ch >= 0x2002 && ch <= 0x200B);
}

/// Returns: `true` for '\r', '\n', U+2028 and U+2029.
bool isNewLine(dchar ch)
{
    return ch == '\r' || ch == '\n' || ch == 0x2028 || ch == 0x2029;
}