1 /**
2   * DJinja lexer
3   *
4   * Copyright:
5   *     Copyright (c) 2018, Maxim Tyapkin.
6   * Authors:
7   *     Maxim Tyapkin
8   * License:
9   *     This software is licensed under the terms of the BSD 3-clause license.
10   *     The full terms of the license can be found in the LICENSE.md file.
11   */
12 
13 module djinja.lexer;
14 
15 
16 private
17 {
18     import djinja.exception : JinjaException;
19 
20     import std.conv : to;
21     import std.traits : EnumMembers;
22     import std.utf;
23     import std.range;
24 }
25 
26 
/// Category of a token produced by the lexer.
enum Type
{
    Unknown,    /// Anything the lexer could not classify
    Raw,        /// Template text outside of any delimiter
    Keyword,    /// Identifier that matches a `Keyword` member
    Operator,   /// Text that matches an `Operator` member

    // Delimiters
    StmtBegin,  /// Statement opening (block or inline form)
    StmtEnd,    /// Statement closing (or end of line for the inline form)
    ExprBegin,  /// Expression opening
    ExprEnd,    /// Expression closing
    CmntBegin,  /// Block comment opening
    CmntEnd,    /// Block comment closing
    CmntInline, /// Single-line comment

    // Names and literals
    Ident,      /// Plain identifier
    Integer,    /// Number without a decimal point
    Float,      /// Number with a decimal point
    Boolean,    /// `true`, `false`, `True` or `False`
    String,     /// Single- or double-quoted string

    // Brackets
    LParen,     /// `(`
    RParen,     /// `)`
    LSParen,    /// `[`
    RSParen,    /// `]`
    LBrace,     /// `{`
    RBrace,     /// `}`

    // Punctuation
    Dot,        /// `.`
    Comma,      /// `,`
    Colon,      /// `:`

    EOL,        /// End of line
    EOF,        /// End of input
}
62 
63 
/// Reserved words of the template language; each member's value is its
/// spelling in template source.
enum Keyword : string
{
    /// Placeholder returned by `toKeyword` for text that is not a keyword.
    Unknown = "",

    // Loops
    For = "for",
    Recursive = "recursive",
    EndFor = "endfor",

    // Conditionals
    If = "if",
    ElIf = "elif",
    Else = "else",
    EndIf = "endif",

    // Blocks and inheritance
    Block = "block",
    EndBlock = "endblock",
    Extends = "extends",

    // Macros and calls
    Macro = "macro",
    EndMacro = "endmacro",
    Return = "return",
    Call = "call",
    EndCall = "endcall",

    // Filters, scopes and assignments
    Filter = "filter",
    EndFilter = "endfilter",
    With = "with",
    EndWith = "endwith",
    Set = "set",
    EndSet = "endset",

    // Imports and includes
    Ignore = "ignore",
    Missing = "missing",
    Import = "import",
    From = "from",
    As = "as",
    Without = "without",
    Context = "context",
    Include = "include",
}
97 
/// Returns `true` when `kw` is one of the keywords listed below
/// (`if`, `set`, `for`, `block`, `extends`, `macro`, `call`, `filter`,
/// `with`, `include`, `import`, `from`), `false` otherwise.
bool isBeginingKeyword(Keyword kw)
{
    static immutable Keyword[12] openers = [
        Keyword.If,
        Keyword.Set,
        Keyword.For,
        Keyword.Block,
        Keyword.Extends,
        Keyword.Macro,
        Keyword.Call,
        Keyword.Filter,
        Keyword.With,
        Keyword.Include,
        Keyword.Import,
        Keyword.From,
    ];

    foreach (candidate; openers)
    {
        if (kw == candidate)
            return true;
    }
    return false;
}
117 
/// Maps the spelling `key` to its `Keyword` member.
/// Returns `Keyword.Unknown` when `key` is not a keyword.
Keyword toKeyword(string key)
{
    // Unrolled at compile time into one comparison per enum member.
    static foreach (member; EnumMembers!Keyword)
    {
        if (key == member)
            return member;
    }
    return Keyword.Unknown;
}
131 
132 
/// Returns `true` when `key` spells a keyword other than `Keyword.Unknown`.
bool isKeyword(string key)
{
    immutable resolved = toKeyword(key);
    return resolved != Keyword.Unknown;
}
137 
138 
/// Returns `true` when `key` is exactly one of the four accepted boolean
/// spellings: `true`, `false`, `True`, `False`.
bool isBoolean(string key)
{
    switch (key)
    {
        case "true", "false", "True", "False":
            return true;
        default:
            return false;
    }
}
144 
145 
/// Operators of the template language; each member's value is its spelling.
///
/// Declaration order is significant: the lexer tries the symbolic operators
/// in this order, so a longer spelling must come before any operator that is
/// its prefix (`==` before `=`, `<=` before `<`, `//` before `/`,
/// `**` before `*`).
enum Operator : string
{
    // Comparison
    Eq = "==",
    NotEq = "!=",
    LessEq = "<=",
    GreaterEq = ">=",
    Less = "<",
    Greater = ">",

    // Logical
    And = "and",
    Or = "or",
    Not = "not",

    // Membership and tests
    In = "in",
    Is = "is",

    // Assignment, filtering and concatenation
    Assign = "=",
    Filter = "|",
    Concat = "~",

    // Additive
    Plus = "+",
    Minus = "-",

    // Multiplicative
    DivInt = "//",
    DivFloat = "/",
    Rem = "%",
    Pow = "**",
    Mul = "*",
}
177 
178 
/// Maps the spelling `key` to its `Operator` member.
/// Returns an `Operator` holding the empty string when `key` is not an
/// operator.
Operator toOperator(string key)
{
    // Unrolled at compile time into one comparison per enum member.
    static foreach (member; EnumMembers!Operator)
    {
        if (key == member)
            return member;
    }
    return cast(Operator)"";
}
192 
/// Returns `true` when `key` is the exact spelling of some `Operator` member.
bool isOperator(string key)
{
    // Unrolled at compile time into one comparison per enum member.
    static foreach (member; EnumMembers!Operator)
    {
        if (key == member)
            return true;
    }
    return false;
}
206 
/// Returns `true` when `op` is one of the six comparison operators
/// (`==`, `!=`, `<=`, `>=`, `<`, `>`).
bool isCmpOperator(Operator op)
{
    return op == Operator.Eq
        || op == Operator.NotEq
        || op == Operator.LessEq
        || op == Operator.GreaterEq
        || op == Operator.Less
        || op == Operator.Greater;
}
220 
221 
/// Compile-time check: `true` when the spelling of `op` contains at least one
/// alphanumeric character (e.g. `and`, `in`), `false` for purely symbolic
/// spellings.
bool isIdentOperator(Operator op)()
{
    import std.algorithm : filter;
    import std.uni : isAlphaNum;

    // Evaluated once per instantiation, at compile time.
    enum bool hasAlphaNum = !(cast(string)op).filter!isAlphaNum.empty;
    return hasAlphaNum;
}
232 
233 
/// Location inside a template source: file name plus line and column.
struct Position
{
    /// Name of the source; may be empty.
    string filename;
    /// Line and column numbers as tracked by the lexer.
    ulong line, column;

    /// Formats the position as `filename(line,column)`.
    ///
    /// Declared `const` so it can be called on const and immutable
    /// positions as well as mutable ones.
    string toString() const
    {
        import std.conv : text;

        return text(filename, "(", line, ",", column, ")");
    }
}
244 
245 
/// A single lexeme: its category, its text and where it started.
struct Token
{
    /// Ready-made end-of-input token with an empty file name and a
    /// zero line/column position.
    enum EOF = Token(Type.EOF, Position("", 0, 0));

    Type type;      /// Category of the token
    string value;   /// Text of the token; left at its default by the two-argument constructor
    Position pos;   /// Position where the token starts

    /// Creates a token that carries no text.
    this(Type t, Position p)
    {
        this.type = t;
        this.pos = p;
    }

    /// Creates a token with category `t`, text `v` and position `p`.
    this(Type t, string v, Position p)
    {
        this.type = t;
        this.value = v;
        this.pos = p;
    }

    /// `token == Type.X` holds when the token has that category.
    bool opEquals(Type type)
    {
        return type == this.type;
    }

    /// `token == Keyword.X` holds for a keyword token with that spelling.
    bool opEquals(Keyword kw)
    {
        if (this.type != Type.Keyword)
            return false;
        return value == kw;
    }

    /// `token == Operator.X` holds for an operator token with that spelling.
    bool opEquals(Operator op)
    {
        if (this.type != Type.Operator)
            return false;
        return value == op;
    }
}
279 
280 
/**
  * Tokenizer for a template string.
  *
  * All delimiters are compile-time parameters and must be non-empty.
  * The lexer starts in "raw" mode, in which text is collected into
  * `Type.Raw` tokens until one of the begin/inline delimiters is met.
  * After that, input is split into keywords, operators, literals and
  * punctuation until an end delimiter switches the lexer back to raw mode.
  *
  * Params:
  *     exprOpBegin  = expression opening delimiter
  *     exprOpEnd    = expression closing delimiter
  *     stmtOpBegin  = statement opening delimiter
  *     stmtOpEnd    = statement closing delimiter
  *     cmntOpBegin  = block comment opening delimiter
  *     cmntOpEnd    = block comment closing delimiter
  *     stmtOpInline = prefix of an inline statement (ended by a new line)
  *     cmntOpInline = prefix of an inline comment (ended by a new line)
  */
struct Lexer(
        string exprOpBegin, string exprOpEnd,
        string stmtOpBegin, string stmtOpEnd,
        string cmntOpBegin, string cmntOpEnd,
        string stmtOpInline, string cmntOpInline)
{
    static assert(exprOpBegin.length, "Expression begin operator can't be empty");
    static assert(exprOpEnd.length, "Expression end operator can't be empty");

    static assert(stmtOpBegin.length, "Statement begin operator can't be empty");
    static assert(stmtOpEnd.length, "Statement end operator can't be empty");

    static assert(cmntOpBegin.length, "Comment begin operator can't be empty");
    static assert(cmntOpEnd.length, "Comment end operator can't be empty");

    static assert(stmtOpInline.length, "Statement inline operator can't be empty");
    static assert(cmntOpInline.length, "Comment inline operator can't be empty");

    //TODO check uniq


    /// The inline statement prefix this lexer was instantiated with.
    enum stmtInline = stmtOpInline;

    /// Value returned by `front`, `next` and `pop` when no input is left.
    ///
    /// It is numerically equal to the character U+00FF, so it must not be
    /// used to decide whether the input is exhausted; `atEnd` is used for
    /// that instead.
    enum EOF = 255;

    private
    {
        Position _beginPos; // Start position of the token being read
        bool _isReadingRaw; // State of reading raw data
        bool _isInlineStmt; // State of reading inline statement
        string _str;        // Not yet consumed part of the input
        string _filename;
        ulong _line, _column;
    }

    /**
      * Params:
      *     str      = template text to tokenize
      *     filename = name stored in the position of every token
      */
    this(string str, string filename = "")
    {
        _str = str;
        _isReadingRaw = true;
        _isInlineStmt = false;
        _filename = filename;
        _line = 1;
        _column = 1;
    }

    /**
      * Reads and returns the next token.
      *
      * Once the input is exhausted every call returns a `Type.EOF` token.
      * Input that cannot be classified is consumed one character at a time
      * and returned as `Type.Unknown`.
      */
    Token nextToken()
    {
        _beginPos = position();

        // Try to read raw data
        if (_isReadingRaw)
        {
            auto raw = skipRaw();
            _isReadingRaw = false;
            if (raw.length)
                return Token(Type.Raw, raw, _beginPos);
        }

        skipWhitespaces();
        _beginPos = position();

        // Check inline statement end: a new line is consumed here, an inline
        // comment is left in place and picked up by a later call
        if (_isInlineStmt &&
            (tryToSkipNewLine() || cmntOpInline == sliceOp!cmntOpInline))
        {
            _isInlineStmt = false;
            _isReadingRaw = true;
            return Token(Type.StmtEnd, "\n", _beginPos);
        }

        // Allow multiline inline statements with '\'
        while (true)
        {
            if (_isInlineStmt && front == '\\')
            {
                pop();
                if (!tryToSkipNewLine())
                    return Token(Type.Unknown, "\\", _beginPos);
            }
            else
                break;

            skipWhitespaces();
            _beginPos = position();
        }

        // Check begin operators
        if (exprOpBegin == sliceOp!exprOpBegin)
        {
            skipOp!exprOpBegin;
            return Token(Type.ExprBegin, exprOpBegin, _beginPos);
        }
        if (stmtOpBegin == sliceOp!stmtOpBegin)
        {
            skipOp!stmtOpBegin;
            return Token(Type.StmtBegin, stmtOpBegin, _beginPos);
        }
        if (cmntOpBegin == sliceOp!cmntOpBegin)
        {
            skipOp!cmntOpBegin;
            skipComment();
            return Token(Type.CmntBegin, cmntOpBegin, _beginPos);
        }

        // Check end operators
        if (exprOpEnd == sliceOp!exprOpEnd)
        {
            _isReadingRaw = true;
            skipOp!exprOpEnd;
            return Token(Type.ExprEnd, exprOpEnd, _beginPos);
        }
        if (stmtOpEnd == sliceOp!stmtOpEnd)
        {
            _isReadingRaw = true;
            skipOp!stmtOpEnd;
            return Token(Type.StmtEnd, stmtOpEnd, _beginPos);
        }
        if (cmntOpEnd == sliceOp!cmntOpEnd)
        {
            _isReadingRaw = true;
            skipOp!cmntOpEnd;
            return Token(Type.CmntEnd, cmntOpEnd, _beginPos);
        }

        // Check begin inline operators
        if (cmntOpInline == sliceOp!cmntOpInline)
        {
            skipInlineComment();
            _isReadingRaw = true;
            return Token(Type.CmntInline, cmntOpInline, _beginPos);
        }
        if (stmtOpInline == sliceOp!stmtOpInline)
        {
            skipOp!stmtOpInline;
            _isInlineStmt = true;
            return Token(Type.StmtBegin, stmtOpInline, _beginPos);
        }

        // Trying to read non-ident operators, in declaration order
        static foreach(op; EnumMembers!Operator)
        {
            static if (!isIdentOperator!op)
            {
                if (cast(string)op == sliceOp!op)
                {
                    skipOp!op;
                    return Token(Type.Operator, op, _beginPos);
                }
            }
        }

        // End of file. Tested by length, not by comparing `front` with the
        // EOF value, so that a literal U+00FF character is not mistaken for
        // the end of input.
        if (atEnd)
            return Token(Type.EOF, _beginPos);

        // Check remainings
        switch (front)
        {
            // Identifier or keyword
            case 'a': .. case 'z':
            case 'A': .. case 'Z':
            case '_':
                auto ident = popIdent();
                if (ident.toKeyword != Keyword.Unknown)
                    return Token(Type.Keyword, ident, _beginPos);
                else if (ident.isBoolean)
                    return Token(Type.Boolean, ident, _beginPos);
                else if (ident.isOperator)
                    return Token(Type.Operator, ident, _beginPos);
                else
                    return Token(Type.Ident, ident, _beginPos);

            // Integer or float
            case '0': .. case '9':
                return popNumber();

            // String
            case '"':
            case '\'':
                return Token(Type.String, popString(), _beginPos);

            case '(': return Token(Type.LParen, popChar, _beginPos);
            case ')': return Token(Type.RParen, popChar, _beginPos);
            case '[': return Token(Type.LSParen, popChar, _beginPos);
            case ']': return Token(Type.RSParen, popChar, _beginPos);
            case '{': return Token(Type.LBrace, popChar, _beginPos);
            case '}': return Token(Type.RBrace, popChar, _beginPos);
            case '.': return Token(Type.Dot, popChar, _beginPos);
            case ',': return Token(Type.Comma, popChar, _beginPos);
            case ':': return Token(Type.Colon, popChar, _beginPos);

            default:
                return Token(Type.Unknown, popChar, _beginPos);
        }
    }


private:


    /// true if there is no input left
    bool atEnd()
    {
        return _str.length == 0;
    }


    /// Current character without consuming it; `EOF` if no input is left.
    dchar front()
    {
        if (_str.length > 0)
            return _str.front;
        else
            return EOF;
    }


    /// Character after the current one without consuming anything;
    /// `EOF` if there are fewer than two characters left.
    dchar next()
    {
        // Peek on a copy of the slice: no allocation, `_str` is untouched
        auto rest = _str;
        if (rest.length == 0)
            return EOF;
        rest.popFront();
        if (rest.length == 0)
            return EOF;
        return rest.front;
    }

    /// Consumes and returns the current character, updating line/column;
    /// returns `EOF` if no input is left.
    dchar pop()
    {
        if (_str.length > 0)
        {
            auto ch  = _str.front;

            // In a "\r\n" pair only the '\n' starts a new line
            if (ch.isNewLine && !(ch == '\r' && next == '\n'))
            {
                _line++;
                _column = 1;
            }
            else
                _column++;

            _str.popFront();
            return ch;
        }
        else
            return EOF;
    }


    /// Same as `pop`, but returns the character as a string.
    string popChar()
    {
        return pop.to!string;
    }


    /// Returns as many leading code units of the input as `op` has
    /// (or the whole rest if it is shorter), for comparison with `op`.
    string sliceOp(string op)()
    {
        // Slices are indexed by code units, so `op.length` is the right
        // measure here, not the number of characters
        enum length = op.length;

        if (length >= _str.length)
            return _str;
        else
            return _str[0 .. length];
    }


    /// Drops `op` from the beginning of the input. The caller has to check
    /// with `sliceOp` that the input really starts with `op`.
    void skipOp(string op)()
    {
        enum length = op.length;      // code units to drop
        enum columns = op.walkLength; // characters to advance the column by

        if (length >= _str.length)
            _str = "";
        else
            _str = _str[length .. $];
        _column += columns;
    }


    /// Position of the current character.
    Position position()
    {
        return Position(_filename, _line, _column);
    }


    /// Skips whitespaces; new lines are skipped too, except inside
    /// an inline statement.
    void skipWhitespaces()
    {
        while (true)
        {
            if (front.isWhiteSpace)
            {
                pop();
                continue;
            }

            if (isFronNewLine)
            {
                // Return for handling NL as StmtEnd
                if (_isInlineStmt)
                    return;
                tryToSkipNewLine();
                continue;
            }

            return;
        }
    }


    /// Consumes and returns the longest run of ASCII letters, digits
    /// and underscores.
    string popIdent()
    {
        string ident = "";
        while (true)
        {
            switch(front)
            {
                case 'a': .. case 'z':
                case 'A': .. case 'Z':
                case '0': .. case '9':
                case '_':
                    ident ~= pop();
                    break;
                default:
                    return ident;
            }
        }
    }


    /// Consumes a number. The token is `Type.Integer` until the first '.'
    /// is met, then `Type.Float`; a second '.' ends the number.
    /// Underscores are consumed but not stored in the token value.
    Token popNumber()
    {
        auto type = Type.Integer;
        string number = "";

        while (true)
        {
            switch (front)
            {
                case '0': .. case '9':
                    number ~= pop();
                    break;
                case '.':
                    if (type == Type.Integer)
                    {
                        type = Type.Float;
                        number ~= pop();
                    }
                    else
                        return Token(type, number, _beginPos);
                    break;
                case '_':
                    pop();
                    break;
                default:
                    return Token(type, number, _beginPos);
            }
        }
    }


    /// Consumes a quoted string and returns its content without quotes.
    /// `\n`, `\r` and `\t` are translated, any other escaped character is
    /// taken literally. An unterminated string ends at the end of input.
    string popString()
    {
        auto quote = pop();
        string str = "";

        while (true)
        {
            if (atEnd)
                return str;

            if (front == '\\')
            {
                pop();
                if (!atEnd)
                {
                    auto escaped = pop();
                    switch (escaped)
                    {
                        case 'n': str ~= '\n'; break;
                        case 'r': str ~= '\r'; break;
                        case 't': str ~= '\t'; break;
                        default: str ~= escaped; break;
                    }
                }
                continue;
            }

            if (front == quote)
            {
                pop();
                return str;
            }

            str ~= pop();
        }
    }


    /// Consumes and returns raw text up to the nearest begin or inline
    /// delimiter, or up to the end of input.
    string skipRaw()
    {
        string raw = "";

        while (true)
        {
            if (atEnd)
                return raw;

            if (exprOpBegin == sliceOp!exprOpBegin)
                return raw;
            if (stmtOpBegin == sliceOp!stmtOpBegin)
                return raw;
            if (cmntOpBegin == sliceOp!cmntOpBegin)
                return raw;
            if (stmtOpInline == sliceOp!stmtOpInline)
                return raw;
            if (cmntOpInline == sliceOp!cmntOpInline)
                return raw;

            raw ~= pop();
        }
    }


    /// Skips the comment body; the closing delimiter is not consumed.
    void skipComment()
    {
        while (!atEnd)
        {
            if (cmntOpEnd == sliceOp!cmntOpEnd)
                return;
            pop();
        }
    }


    /// Skips everything up to and including the next '\n'.
    void skipInlineComment()
    {
        while (!atEnd)
        {
            if (front == '\n')
            {
                pop();
                return;
            }
            pop();
        }
    }


    /// true if the current character is a new line character
    bool isFronNewLine()
    {
        auto ch = front;
        return ch == '\r' || ch == '\n' || ch == 0x2028 || ch == 0x2029;
    }

    /// true if NL was skiped ("\r\n" is skipped as a whole)
    bool tryToSkipNewLine()
    {
        switch (front)
        {
            case '\r':
                pop();
                if (front == '\n')
                    pop();
                return true;

            case '\n':
            case 0x2028:
            case 0x2029:
                pop();
                return true;

            default:
                return false;
        }
    }
}
748 
749 
/// Returns `true` for the horizontal whitespace characters the lexer skips:
/// space, tab, U+00A0, U+2002..U+200B, U+202F, U+205F and U+3000.
/// New line characters are not included.
bool isWhiteSpace(dchar ch)
{
    switch (ch)
    {
        case ' ':
        case '\t':
        case 0x00A0:
        case 0x2002: .. case 0x200B:
        case 0x202F:
        case 0x205F:
        case 0x3000:
            return true;

        default:
            return false;
    }
}
755 
/// Returns `true` for the line terminators the lexer recognises:
/// `\r`, `\n`, U+2028 and U+2029.
bool isNewLine(dchar ch)
{
    switch (ch)
    {
        case '\r':
        case '\n':
        case 0x2028:
        case 0x2029:
            return true;

        default:
            return false;
    }
}