root/trunk/blade/SyntaxTree.d

Revision 176, 17.8 kB (checked in by Don Clugston, 7 years ago)

SyntaxTree? now creates a tuple of the types. This opens the way to significant simplification of some of the code (eg, I should be able to remove the rank stuff from SyntaxTree?).

  • Property svn:eol-style set to native
Line 
1 // Written in the D programming language 1.0
2 /**
3 * Given an expression as a compile-time constant, create an abstract syntax tree
4 * with semantic information, at compile time.
5 * Part of BLADE : Basic Linear Algebra D Expressions
6 *
7 * Author: Don Clugston
8 * License: Public Domain
9 *
10 * THEORY:
11 * Nested mixins are used. One mixin tricks the compiler into
12 * converting the expression into a standard form with precedence and
13 * associativity information. The inner mixin determines the type and value
14 * of each symbol in the expression, by mixing in '.stringof' expressions.
15 * The '.stringof' property is a very powerful language feature which allows
16 * us to probe the compiler's symbol table. Because it can be applied to any
17 * expression or type, we can get away with a very rudimentary lexer.
18 *
19 * BUGS:
20 *  The following syntax is not supported by mixin_getPrecedence():
21 *       unary operators & !
22 *       && and || operators
23 *       Comparision operators, NCEG floating point operators, is, !is
24 *       ?:
25 *       cast
26 *       new, delete, anonymous classes, functions and delegates
27 *       mixin, assert, and import expressions
28 *       D 2.004 string literals.
29 *
30 * An unmatched ] in the expression generates a garbage error message after the
31 * sensible error message; this is a compiler bug.
32 *
33 * NOTES:
34 *  Functions (including properties) are typed as function pointers in the symbol table.
35 * ERROR HANDLING:
36 *  If the expression refers to a non-existent variable, the type will be an empty string,
37 *  and the value will be the name of the variable.
38 *  Syntax errors in the expression generate normal D errors in the user code, except
39 *  for mismatched parentheses.
40 * QUIRKS:
41 *  The syntaxtreeof() function and the Symbol and AST classes must be visible to the user code.
42 */
43 module blade.SyntaxTree;
44 import blade.BladeUtil : enquote;
45
46 public:
47
48 /**
49  * Returns a string, which, when mixed in, lexes, parses, and semantically
50  * analyses the given expression, then creates a struct literal of type
51  * AbstractSyntaxTree, containing a syntax tree and a symbol table.
52  * The symbol table contains the type, and value for each symbol.
53  * If the symbol is an array, the number of dimensions is also returned.
54  *
55  * Syntax trees are represented as placeholder expressions, with associativity
56  * and precedence indicated by parentheses. The placeholders are indices into
57  * the symbol table.
58  *
59  * Typically, this function will be a code generator for a mixin.
60  */
61 char [] syntaxtreeof(char [] expression)
62 {
63     char [][] symbols = replaceSymbolsWithPlaceholders(expression);
64     char [] result = `AbstractSyntaxTree(` ~ mixin_getPrecedence(expression) ~ `,[`;
65        
66     for(int i=0; i<symbols.length; ++i) {
67         if (i>0) result ~= ",";
68         result ~= `Symbol(` ~ mixin_typeOf(symbols[i])
69                       ~ `,` ~ mixin_valueOf(symbols[i])
70                       ~ `,` ~ mixin_rankOf(symbols[i])
71                       ~ `,` ~ mixin_elementOf(symbols[i]) ~ `)`;
72     }   
73    
74     result ~="])";
75     return `mixin("` ~ enquote(result) ~ `")`;
76 }
77
78 /**
79 *  As for syntaxtreeof(), except that it creates a tuple as well as the abstract syntax tree
80 */
81 char [] mixin_tupleAndSyntaxtreeof(char [] funcname, char [] expression)
82 {
83     char [][] symbols = replaceSymbolsWithPlaceholders(expression);
84     char [] result = `AbstractSyntaxTree(` ~ mixin_getPrecedence(expression) ~ `,[`;
85     char [] tuple;
86        
87     for(int i=0; i<symbols.length; ++i) {
88         if (i>0) {
89             result ~= ",";
90             tuple ~= ",";
91         }
92         result ~= `Symbol(` ~ mixin_typeOf(symbols[i])
93                       ~ `,` ~ mixin_valueOf(symbols[i])
94                       ~ `,` ~ mixin_rankOf(symbols[i])
95                       ~ `,` ~ mixin_elementOf(symbols[i]) ~ `)`;
96         tuple ~= mixin_typeOfOrVoid(symbols[i]);
97     }   
98    
99     result ~="])";
100     return funcname ~ `!(mixin("` ~ enquote(tuple) ~ `"))(mixin("` ~ enquote(result) ~ `"))`;
101 }
102
103 /** The result of semantic analysis of the original expression
104  *
105  */
106 struct AbstractSyntaxTree {
107     char [] expression; /// syntax tree in Placeholder format, eg A+=(B*C)
108     Symbol[] symbolTable; /// Textual form of the types and values of A,B,C,...
109 }
110
111 /** The values of the A, B, C, ... placeholders
112 *
113 * Members:
114 * type    the name of the type, as text
115 * value   The value, as text. This will be either a symbol name, or a literal.
116 *         Note that if it is a numeric or text literal, value[0] will be one of
117 *         the characters 0123456789"'-
118 * rank    The tensor rank. '0' = scalar, '1' = vector, '2' = matrix,
119 *         '3' = tensor of rank 3 or more.
120 * element The base type of each element. (eg, for "double" for double[][5]).
121 *         Not valid for tensors of rank > 3.       
122 */
123 struct Symbol {
124     char [] type;  /// the name of the type, as text
125     char [] value; /// the value, as text.
126     char rank;      /// the tensor rank ('0'=scalar, 1=vector, 2 = matrix, 3=tensor)
127     char [] element; /// the type of each element of the vector or matrix
128 }
129
130 private:
131
132 //  ==== LEXER ====
133 /**
134  * Extract all terminal symbols(identifiers, type names, and literals)
135  * from expression expr.
136  * Replace each identifier with a placeholder A, B, C, D, ...
137  *
138  * Returns: an array of symbols (with no duplicates, though aliasing may exist)
139  *
140  * Supports *all* D syntax, except D2.004 string literals.
141  */
142 char [][] replaceSymbolsWithPlaceholders(inout char [] expr)
143 {
144     char [] code = "";
145     char [][] symbols = [];
146     expr ~= " "; // ensure that it ends.
147     char quote=0; // character which marks end of string literal, 0 if none
148     int startOfSymbol=0;
149     for(int i=0; i<expr.length; ++i) {
150         char c = expr[i];
151         // Deal with string literals
152         if (quote != 0) {
153             if (c == quote && (i>0 || expr[i-1]!='\\')) quote=0;
154             continue;
155         } else if (c == '"' || c == '`' || c == '\'') { // Is a string literal beginning?
156             quote = c;
157             continue;
158         }
159         // Identifiers and numeric literals are a..zA..Z_ or non-ASCII.
160         // + and - are part of the symbol, if it is a floating-point literal   
161         // . is part of a symbol, unless it is a ".." slice
162         if ((c >='a' && c <= 'z') || (c>='A' && c<='Z')
163                 || (c>='0' && c<='9') || (c=='_') || (c>0x7F)
164            || ((c=='+' || c=='-') && i>0 && (expr[startOfSymbol]>='0' && expr[startOfSymbol]<='9') && (expr[i-1]=='e' ||expr[i-1]=='p'|| expr[i-1]=='E' || expr[i-1]=='P'))
165            || (c=='.' && expr[i+1]!='.')
166         ) continue;
167          // The next character is NOT part of the symbol.
168         char [] symbol = expr[startOfSymbol..i];
169         if (symbol=="is" || symbol=="in") {
170             code ~= symbol; // non-type keywords are NOT symbols
171             symbol="";
172         }
173         if (symbol.length>0) {
174             // Add the new symbol to the symbol table
175             code ~= cast(char)('A' + symbols.length);
176             symbols ~= symbol;               
177         }
178         if (c=='.') { // it was opSlice. Skip the next . as well.
179             i++;
180             code ~= ".";
181         }
182         if (c!='/' || i >= expr.length-1) {
183             code ~= c;
184         } else {
185             // Ignore comments.
186             if (expr[i..i+2]=="//") {
187                 for ( ; i<expr.length && expr[i]!='\n' ; ++i) {}
188             } else if (expr[i..i+2]=="/*") {
189                 i+=3;
190                 for ( ; i<expr.length && expr[i-1..i+1] != "*/"; ++i) {}
191                 if ( i >= expr.length-1 ) break;
192             } else if (expr[i..i+2]=="/+") {
193                 i++;
194                 for (int nest = 1; nest>0 && i < expr.length-1; ++i) {
195                     if (expr[i..i+2] == "+/") { --nest; ++i; }
196                     else if (expr[i..i+2] == "/+") { ++nest; ++i; }
197                 }
198                 --i;
199                 if (i >= expr.length-1) break;
200             } else code~=c;
201         }
202         startOfSymbol = i+1;
203     }
204     if (code[$-1]==' ') code=code[0..$-1]; // remove the " " we added.
205     expr = code;
206     return symbols;
207 }
208
209 //  ==== SYNTAX PASS ====
210
211 /** Generate a syntax tree
212
213 We trick the compiler into generating a syntax tree for us. We replace
214 every symbol x with a variable of type AST!("x"), which supports all possible
215 operator overloads. All that the overloaded operators do, is record which
216 operation was performed. This gives the associativity and precedence rules
217 which the compiler used.
218
219 Extension: This code allows opSlice and opIndex to be mixed, even though it is
220 not yet allowed in normal D code.
221 */
222 class AST(char [] expr)
223 {
224     alias expr text;
225     // Binary, pre- and post- operators
226     template binOp(char [] op, T) {
227         alias AST!("(" ~ text ~ op ~ T.text ~ ")") binOp;
228     }
229     template preOp(char [] op) {
230         alias AST!("(" ~ op ~ text ~ ")") preOp;
231     }
232     template postOp(char [] op) {
233         alias AST!("(" ~ text ~ op ~ ")") postOp;
234     }
235    
236     // Simple binary operators
237     binOp!("+", T) opAdd(T)(T x){ return null; }
238     binOp!("-", T) opSub(T)(T x){ return null; }
239     binOp!("*", T) opMul(T)(T x){ return null; }
240     binOp!("/", T) opDiv(T)(T x){ return null; }
241     binOp!("%", T) opMod(T)(T x){ return null; }
242     binOp!("~", T) opCat(T)(T x){ return null; }
243     binOp!("&", T) opAnd(T)(T x){ return null; }
244     binOp!("|", T) opOr(T)(T x) { return null; }
245     binOp!("^", T) opXor(T)(T x){ return null; }
246     binOp!("<<", T)  opShl(T)(T x)  { return null; }
247     binOp!(">>", T)  opShr(T)(T x)  { return null; }
248     binOp!(">>>", T) opUShr(T)(T x) { return null; }
249     binOp!("in", T) opIn(T)(T x)   { return null; }
250     binOp!("=", T)  opAssign(T)(T x){ return null; }
251     binOp!("+=", T) opAddAssign(T)(T x){ return null; }
252     binOp!("-=", T) opSubAssign(T)(T x){ return null; }
253     binOp!("*=", T) opMulAssign(T)(T x){ return null; }
254     binOp!("/=", T) opDivAssign(T)(T x){ return null; }
255     binOp!("%=", T) opModAssign(T)(T x){ return null; }
256     binOp!("~=", T) opCatAssign(T)(T x){ return null; }
257     binOp!("&=", T) opAndAssign(T)(T x){ return null; }
258     binOp!("|=", T) opOrAssign(T)(T x) { return null; }
259     binOp!("^=", T) opXorAssign(T)(T x){ return null; }
260     binOp!("<<=", T)  opShlAssign(T)(T x)  { return null; }
261     binOp!(">>=", T)  opShrAssign(T)(T x)  { return null; }
262     binOp!(">>>=", T) opUShrAssign(T)(T x) { return null; }
263     // Pre-inc operators  are special cases of +=,-=.
264     preOp!("++") opAddAssign(T:int=int)(int x){ return null; }
265     preOp!("--") opSubAssign(T:int=int)(int x){ return null; } 
266     // OpSlice is combined with opIndex.
267 //    AST!("(" ~ text ~ "[" ~ T.text ~ ".." ~ U.text ~ "])") opSlice(T, U)(T x, U y){ return null; }   
268     AST!("(" ~ text ~ "[" ~ AllText!(T) ~ "])") opIndex(T...)(T x){ return null; }
269     AST!("((" ~ text ~ "[" ~ AllText!(T) ~ "])=" ~ U.text ~ ")") opIndexAssign(U, T...)(U,T){ return null; }
270     AST!("(" ~ text ~ "(" ~ AllText!(T) ~ "))") opCall(T...)(T){ return null; }
271
272     // Avoid infinite recursion by templating these functions.
273     preOp!("~") opCom(dummy=void)(){ return null; }
274     preOp!("+") opPos(dummy=void)(){ return null; }
275     preOp!("-") opNeg(dummy=void)(){ return null; }
276     preOp!("*") opStar(dummy=void)(){ return null; }  // D2.0 only
277
278     postOp!("[]") opSlice(dummy=void)(){ return null; }
279     static if (text.length>3 && text[$-3..$]!="++)" && text[$-3..$]!="--)") {
280         postOp!("++") opPostInc(){ return null; }
281         postOp!("--") opPostDec(){ return null; }
282     }
283 // Unfortunately, opCast doesn't work, because the return type must be the same as typeof(this).
284 // Likewise, opEquals and opCmp must return an int.
285 // Other operators are not overloadable
286 }
287
288 // Helper for opCall() etc
289 template AllText(T...)
290 {
291     static if (T.length==0) const char [] AllText = "";
292     else static if (T.length==1) const char [] AllText = T[0].text;
293     else static if (T.length>2 && T[1].text=="..") // Convert back to a slice
294         const char [] AllText = T[0].text ~ ".." ~ AllText!(T[2..$]);
295     else const char [] AllText = T[0].text ~ "," ~ AllText!(T[1..$]);
296 }
297
298 /**
299  * Insert parentheses around each symbol, to enforce normal D precedence rules.
300  * Returns: a string which, when mixed in, generates a parenthesised expression.
301  */
302 char [] mixin_getPrecedence(char [] expr)
303 {    
304     if (expr.length<2) return "`" ~ expr ~ "`";
305     // The scheme doesn't work directly for array literals. Instead, change them
306     // into opIndex of a nameless `` symbol.
307     bool lastWasSymbol=false; // hack for array literals
308     char [] code = "typeof(";
309         for(int i=0; i<expr.length; ++i) {
310             if (expr[i]>='A' && expr[i]<='Z' || expr[i]=='$') {
311                 code ~= "(cast(AST!(`" ~ expr[i] ~"`))(null))";
312                 lastWasSymbol = true;
313             } else {               
314                 if (!lastWasSymbol && expr[i]=='[') {
315                     code ~= "(cast(AST!(``))(null))";
316                 }
317                 if (expr[i]!=' ' && expr[i]!='\t' && expr[i]!='\r'
318                     && expr[i]!= '\n' && expr[i]!=')') lastWasSymbol = false;
319                 if (expr[i]==']') lastWasSymbol=true;
320                 if (i<expr.length-1 && expr[i..i+2]=="..") {
321                     code ~= ",(cast(AST!(`..`))(null)),";
322                     ++i;
323                 }
324                 else code ~= expr[i];
325             }
326         }
327     return code ~ ").text[1..$-1]"; // remove the outer ()
328 }
329
330 //  ==== SEMANTIC PASS ====
331
332 // Returns typeof(sym).stringof.
333 char [] mixin_typeOf(char [] sym)
334 {
335     // If sym is a function, we take its address, since
336     //   typeof(x).stringof doesn't compile if x is a function.
337     // If sym doesn't compile at all, return an empty string.
338     return `mixin(!is(typeof(` ~ sym ~ `))?"\"\"" : is(typeof(` ~ sym ~ `)==function)?"` ~ enquote("typeof(&" ~ sym ~ ").stringof")~`":"` ~ enquote("typeof(" ~ sym ~ ").stringof") ~ `")`;
339 }
340
341 char [] mixin_typeOfOrVoid(char [] sym)
342 {
343     // If sym is a function, we take its address, since
344     //   typeof(x).stringof doesn't compile if x is a function.
345     // If sym doesn't compile at all, return "void".
346     return `mixin(!is(typeof(` ~ sym ~ `))?"\"void\"" : is(typeof(` ~ sym ~ `)==function)?"` ~ enquote("typeof(&" ~ sym ~ ").stringof")~`":"` ~ enquote("typeof(" ~ sym ~ ").stringof") ~ `")`;
347 }
348
349 // Returns sym.stringof, with workarounds for compiler bugs
350 char [] mixin_valueOf(char [] sym)
351 {
352     // This function would just return sym.stringof, except that:
353     // (1) 1.23.stringof fails to compile (compiler bug).
354     //  Fortunately (1.23).stringof works.
355     // (2) x.stringof doesn't compile, if x is a function. Just return x instead.
356     //     We also return x if x doesn't compile (eg, is an undefined variable).
357     if (sym[0]>='0' && sym[0]<='9') { // numeric literal
358         return `(` ~ sym ~ `).stringof`;
359     }
360     return `mixin(is(typeof(` ~ sym ~ `)==function) || !is(typeof(` ~ sym ~ `))?"\"` ~ enquote(sym, `\\\`) ~ `\"":"` ~ enquote(sym ~ ".stringof") ~ `")`;
361 }
362
363 // BLADE-SPECIFIC SEMANTIC PASS
364
365 // When mixed in, creates a const char describing the (tensor) rank of symbol sym.
366 // Possible results are 0 = scalar, 1 = vector, 2 = matrix, 3 = tensor of rank 3 or more.
367 char [] mixin_rankOf(char [] sym)
368 {
369     // Implementation: If sym[0][0] is a valid type, we know it's a matrix.
370     // else if sym[0] is a valid type, we know it's a vector. Etc.
371     return "is(typeof(" ~ sym ~ "[0]))?is(typeof(" ~ sym ~ "[0][0]))?is(typeof(" ~ sym ~ "[0][0][0]))?'3':'2':'1':'0'";
372 }
373
374 char [] mixin_elementOf(char [] sym)
375 {
376     // Implementation: If sym[0][0] is a valid type, we know it's a matrix.
377     // else if sym[0] is a valid type, we know it's a vector. Etc.
378     char [] s = enquote(sym);
379     return mixin_typeOf("mixin(is(typeof(" ~ sym ~ "[0]))?is(typeof(" ~ sym ~ "[0][0]))?is(typeof(" ~ sym ~ `[0][0][0]))?"`
380     ~ s ~ `[0][0][0]":"` ~ s ~ `[0][0]":"` ~ s ~ `[0]":"` ~ s ~ `")`);   
381 }
382
383 //  ==== TESTS ====
384
385 unittest {
386     char [] expr = "xyzzy+ y/+ comment/++ /++//++/+/ +/*2";
387     char [][] symbols = replaceSymbolsWithPlaceholders(expr);
388     assert(expr == "A+ B*C");
389    
390     assert(mixin(mixin_getPrecedence("D -= A+B *C"))=="D-=(A+(B*C))");
391     assert(mixin(mixin_getPrecedence("D -= ++A"))=="D-=(++A)");
392     assert(mixin(mixin_getPrecedence("A[B,$-C/D]=E+F"))=="(A[B,($-(C/D))])=(E+F)");
393     assert(mixin(mixin_getPrecedence("A[B]=E+F"))=="(A[B])=(E+F)");
394     assert(mixin(mixin_getPrecedence("A+=B[E]"))=="A+=(B[E])");
395     assert(mixin(mixin_getPrecedence("A[B][$]=A[E]+F"))=="((A[B])[$])=((A[E])+F)");
396     assert(mixin(mixin_getPrecedence("G-=A[B][C..B^D][D]*E+F"))=="G-=(((((A[B])[C..(B^D)])[D])*E)+F)");
397     assert(mixin(mixin_getPrecedence("E=A[B,C/D]*F"))=="E=((A[B,(C/D)])*F)");
398     assert(mixin(mixin_getPrecedence("(A+B)  in(C^D)"))=="(A+B)in(C^D)");
399     assert(mixin(mixin_getPrecedence("A"))=="A");
400     assert(mixin(mixin_getPrecedence("--A"))=="--A");
401 //   assert(mixin(mixin_getPrecedence("A--"))=="A--"); // BUG: fails
402    
403     assert(mixin(mixin_getPrecedence("A[B,[C,D]]"))=="A[B,([C,D])]");
404     assert(mixin(mixin_getPrecedence("A[B,C..D]"))=="A[B,C..D]");
405     assert(mixin(mixin_getPrecedence("A[B][C,D]"))=="(A[B])[C,D]");
406     assert(mixin(mixin_getPrecedence("A[B,([C,D])]"))=="A[B,([C,D])]");
407     assert(mixin(mixin_getPrecedence("A([B,C])"))=="A(([B,C]))");   
408 }
409 /+   // NO LONGER USED
410 /** Return true if text is a non-value, non-type D keyword
411  */
412 bool isNonTypeKeyword(char [] text) {
413     char [][] keywords = [
414     "abstract", "alias", "align", "asm", "assert", "auto", "body", "break",
415     "case", "cast", "catch", "class", "const", "continue", "dchar", "debug",
416     "default", "delegate", "delete", "deprecated", "do", "else", "enum",
417     "export", "extern", "final", "finally", "for", "foreach", "foreach_reverse",
418     "function", "goto", "if", "import", "in", "inout", "interface", "invariant",
419     "is", "lazy", "macro", "mixin", "module", "new", "out", "override",
420     "package", "pragma", "private", "protected", "public", "ref", "return",
421     "scope", "static", "struct", "switch", "synchronized", "template", "throw",
422     "__traits", "try", "typedef", "typeid", "typeof", "union", "unittest",
423     "version", "volatile", "while", "with" ];
424     
425     foreach(char [] s; keywords) {
426         if (s==text) return true;
427     }
428     return false;
429 }
430 +/
Note: See TracBrowser for help on using the browser.