Changeset 187
- Timestamp:
- 04/30/08 16:05:32 (4 months ago)
- Files:
-
- trunk/blade/Blade.d (modified) (33 diffs)
- trunk/blade/BladeDemo.d (modified) (5 diffs)
- trunk/blade/BladeRank.d (modified) (9 diffs)
- trunk/blade/BladeSimplify.d (modified) (25 diffs)
- trunk/blade/CodegenX86.d (modified) (27 diffs)
- trunk/blade/PostfixX86.d (modified) (11 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/blade/Blade.d
r176 r187 1 1 // Written in the D programming language 1.0 2 2 /** 3 * BLADE 0. 4Alpha -- Basic Linear Algebra D Expressions3 * BLADE 0.6Alpha -- Basic Linear Algebra D Expressions 4 4 * 5 5 * Generate near-optimal x87/SSE2 asm code for BLAS1 basic vector operations at compile time. … … 13 13 * 14 14 * FEATURES: 15 * - Supports any mix of vector addition, subtraction, dot product, unary minus, 16 * multiplication by a scalar, sum(), abs(), and multidimensional slicing. 15 * - Supports any mix of vector addition, subtraction, unary minus, 16 * multiplication by a scalar, 17 * cumulation via dot product, sum() and prod(), and multidimensional slicing. 17 18 * - Generates either x87 asm code, SSE or SSE2 asm code or pure D, depending on 18 19 * the complexity of the expression, and the availability of inline asm. … … 24 25 * 25 26 * SPEED/ACCURACY TRADEOFF: 26 * IEEE floating point multiplication and addition are not associative.27 * Tradeoff arises because IEEE floating point multiplication and addition are not associative. 27 28 * Assuming that overflow and underflow do not occur: 28 29 * (a*b)*c may differ from a*(b*c) in the last bit. … … 34 35 * 35 36 * FUTURE DIRECTIONS (in order of expected implementation): 36 * - trace() 37 * - Loop unrolling for cumulative operations dot, sum, trace. 37 * - nested D expressions 38 * - cumulative operations min, max 39 * - Loop unrolling for cumulative operations dot, sum, prod. 38 40 * - Dense matrix support. 39 41 * - Triangular, banded, symmetric, and sparse matrix support … … 46 48 * which accepts the tuple. 47 49 * 50 * COMPILER BUGS/LIMITATIONS AFFECTING THIS LIBRARY 51 * - Local arrays are not aligned to a 128-bit boundary, so use of aligned SSE is not 52 * always possible. 53 * - Bugzilla #1125 -- structs in a tuple can't be used in asm. 54 * - Bugzilla #1382 -- CTFE strings never get deleted --> SLOOOOOW compilation. KILLER BUG. 55 * - Bugzilla #1768 -- in CTFE, arrays of arrays aren't initialized properly 56 * 48 57 * HISTORY: 49 58 * 0.1 - Used classes to make expression templates. 50 59 * 0.2 - Support for a wider variety of expressions. Dot product, imaginary numbers, etc. 51 60 * 0.3 - Based on string mixins. Most of the new features of 0.2 are gone, but SSE2 is added. 52 * 0.4 - Added D code generator. Nice error messages. Optimal parameter passing. 61 * 0.4 - Added D code generator. Nice error messages. Optimal parameter passing. 53 62 * (passes pointers, not arrays). 54 63 * 0.5 - Expression simplification step. Slicing support. 55 * 0.6 - Dot product, nested expressions, intrinsics: abs, sqrt, sum. 64 * 0.6 - Dot product, nested expressions (asm only), intrinsics: abs, sqrt, sum. 65 * 0.7 - Intrinsics: prod 56 66 */ 57 67 … … 73 83 // FOR MIXIN: Generate code to evaluate the given vector expression. 74 84 char [] vectorize(char [] expr) 75 { 85 { 76 86 debug (BladeFrontEnd) { 77 87 return `pragma(msg, \n ~ "// " __FILE__ ~ "(" ~__LINE__.stringof[0..$-1] ~ ") ` ~ enquote(expr) ~ `" ~ \n ~ ` ~ mixin_tupleAndSyntaxtreeof("makeVectorCode", expr) ~ "~\\n);" … … 82 92 } 83 93 84 // Simplify the expression, categorise it, 94 // Simplify the expression, categorise it, 85 95 // and dispatch to the appropriate code generator. 86 96 char [] makeVectorCode(Types...)(AbstractSyntaxTree tree) … … 89 99 if (revised.errorMessage.length>0) return `static assert(0, "BLADE: ` ~ enquote(revised.errorMessage) ~ `");`; 90 100 VecExpressionType exprType = categorizeExpression(revised); 91 InvocationCode q; 101 InvocationCode q; 92 102 if (exprType == VecExpressionType.SSE2Expression || exprType == VecExpressionType.SSE1Expression) { 93 103 q = invokeSSE((exprType == VecExpressionType.SSE2Expression), revised); … … 105 115 { 106 116 // TODO: 107 return expression; 117 return expression; 108 118 } 109 119 110 120 template X87RetType(char [] expr) { 111 static if (expr[0]!='0' ) alias void X87RetType;121 static if (expr[0]!='0' && expr[0]!='1') alias void X87RetType; 112 122 else alias real X87RetType; 113 123 } … … 130 140 131 141 /** Function to implement BLAS1 operations using X87 assembler. 132 * Every member of the Values tuple must only be real, 142 * Every member of the Values tuple must only be real, 133 143 * float[], double [], or real[], or BladeStrided!(float), !(double), !(real) 134 144 */ 135 145 X87RetType!(expr) X87VECGEN(char [] expr, int numStrides, Values...)(int veclength, Values values) { 136 debug(BladeBackEnd) { 146 debug(BladeBackEnd) { 137 147 pragma(msg, generateCodeForAsmX87!(numStrides, Values)(expr)); 138 148 } … … 146 156 static ulong[2] SSE_SIGNBITpd = [0x8000_0000_0000_0000L, 0x8000_0000_0000_0000L]; 147 157 static uint[4] SSE_SIGNBITps = [0x8000_0000,0x8000_0000,0x8000_0000, 0x8000_0000]; 158 // The value 1.0 for a parallel SSE register 159 static ulong[2] SSE_ONEpd = [0x3FF0_0000_0000_0000L, 0x3FF0_0000_0000_0000L]; 160 static uint[4] SSE_ONEps = [0x3F0_000, 0x3F0_000, 0x3F0_000, 0x3F0_000]; 148 161 149 162 private: … … 156 169 // SSE1 is possible only if all vectors are floats. 157 170 // X87 is possible for any mix of real, double, and float vectors. 158 // BUG: for X87, should also check number of temporaries (don't overflow the FPU stack 171 // BUG: for X87, should also check number of temporaries (don't overflow the FPU stack) 159 172 enum VecExpressionType { SSE1Expression, SSE2Expression, X87Expression, DExpression }; 160 173 … … 164 177 bool SSE1 = true; 165 178 bool X87 = true; 166 bool strided = false; // true if any strided vector or matrix operations exist 179 bool strided = false; // true if any strided vector or matrix operations exist 167 180 version (D_InlineAsm_X86) {} else { 168 181 // Without an assembler, there's no chance! … … 199 212 y = tree.compounds[x-tree.symbolTable.length][0]-'A'; 200 213 // Check for a stride.. 201 if (tree.compounds[x-tree.symbolTable.length][$-1]==']') { 214 if (tree.compounds[x-tree.symbolTable.length][$-1]==']') { 202 215 strided |= isStrided(tree.compounds[x-tree.symbolTable.length]); 203 216 } 204 217 } 205 218 206 219 char [] t = tree.symbolTable[y].element; 207 220 if (t == "double") { … … 218 231 } 219 232 } 220 // It's not worth doing strided operations with SSE. 233 // It's not worth doing strided operations with SSE. 221 234 if (strided) { SSE1=false; SSE2=false; } 222 235 if (numRealScalars > MAX_87_REALSCALARSPLUSTEMPORARIES) X87 = false; … … 224 237 if (numvectors > MAX_SSE_VECTORS) { SSE1=false; SSE2=false; } 225 238 if (SSE1) return VecExpressionType.SSE1Expression; 226 if (SSE2) return VecExpressionType.SSE2Expression; 227 return X87 ? VecExpressionType.X87Expression : VecExpressionType.DExpression; 239 if (SSE2) return VecExpressionType.SSE2Expression; 240 return X87 ? VecExpressionType.X87Expression : VecExpressionType.DExpression; 228 241 } 229 242 … … 256 269 char [] stridelist=""; 257 270 char [] alltypes=""; 258 271 259 272 char [][] typelist; 260 273 261 274 char [] vals; 262 275 int numstrides=0; … … 283 296 } else { // for arrays, the type is the type of the original array 284 297 t = tree.symbolTable[tree.compounds[x-tree.symbolTable.length][0]-'A'].element; 285 // Check for a stride.. 298 // Check for a stride.. 286 299 if (tree.compounds[x-tree.symbolTable.length][$-1]==']') { 287 300 strided = isStrided(tree.compounds[x-tree.symbolTable.length]); 288 301 if (strided) ++numstrides; 289 302 } 290 303 291 304 } 292 305 } … … 296 309 // long, ulong, and real must become real. 297 310 // We convert everything else to double, since that uses less 298 // FPU stack space. 311 // FPU stack space. 299 312 if (t == "real" || t == "double" || t == "float") alltypes ~= t; 300 313 else if (t == "long" || t == "ulong") result ~= "real"; … … 310 323 alltypes ~= t ~ "*"; 311 324 // for vectors, we only need the pointer, not the length 312 vals ~= "&" ~ v ~ "[0]"; 325 //vals ~= "&" ~ v ~ "[0]"; 326 vals ~= v ~ ".ptr"; 313 327 } 314 328 } … … 323 337 result ~= ")("; 324 338 int firstVector = findVectorForLength(tree); 325 return InvocationCode(result ~ getValueForSymbol(tree.mapping[firstVector], tree).invoker ~ ".length" 339 return InvocationCode(result ~ getValueForSymbol(tree.mapping[firstVector], tree).invoker ~ ".length" 326 340 ~ vals ~ stridelist ~ ")", assertions); 327 341 } … … 342 356 char [] postfix = makePostfixForSSE(tree.expression, tree.rank); 343 357 char [] retType = "void"; 344 if (postfix[0]=='0' ) retType = (SSE2? "double" : "float");358 if (postfix[0]=='0' || postfix[0]=='1') retType = (SSE2? "double" : "float"); 345 359 346 360 char [] result = "SSEVECGEN!(" ~ retType ~ `,"` ~ enquote(postfix) ~ `"`; … … 352 366 else result ~= SSE2? ",double*" : ",float*"; 353 367 vals ~= ","; 354 if (rnk=='1') vals ~= "&";368 // if (rnk=='1') vals ~= "&"; 355 369 InvocationCode q = getValueForSymbol(tree.mapping[i], tree); 356 370 vals ~= q.invoker; 357 371 assertions ~= q.assertions; 358 372 // for vectors, we only need the pointer, not the length 359 if (rnk=='1') vals ~= " [0]";360 } 361 373 if (rnk=='1') vals ~= ".ptr"; 374 } 375 362 376 result ~= ")("; 363 377 int firstVector = findVectorForLength(tree); … … 384 398 // result ~= "static "; 385 399 // } 386 result ~= "assert(" 400 result ~= "assert(" 387 401 ~ getDimensionLengthForSymbol(tree.mapping[i], tree, 1) 388 402 ~ "==" ~ getDimensionLengthForSymbol(tree.mapping[firstVector], tree, 1) … … 399 413 for (int i=0; i<tree.mapping.length;++i) { 400 414 if (tree.rank[i]=='1'){ 401 result ~= "assert( (cast(size_t)( &" ~ getValueForSymbol(tree.mapping[i], tree).invoker402 ~ " [0])& 0x0F) == 0, `SSE Vector misalignment: " ~ getValueForSymbol(tree.mapping[i], tree).invoker ~ "`);"\n;415 result ~= "assert( (cast(size_t)(" ~ getValueForSymbol(tree.mapping[i], tree).invoker 416 ~ ".ptr)& 0x0F) == 0, `SSE Vector misalignment: " ~ getValueForSymbol(tree.mapping[i], tree).invoker ~ "`);"\n; 403 417 } 404 418 } … … 438 452 } 439 453 } 440 return dynamic>=0? dynamic : strided; 454 return dynamic>=0? dynamic : strided; 441 455 } 442 456 … … 458 472 } else { // else it's a compound or an indexed array 459 473 char [] comp = tree.compounds[c-'A'-tree.symbolTable.length]; 460 474 461 475 if (comp[$-1]!=']') { // simple compound expression 462 476 foreach(d; comp) { … … 477 491 char [] nextIndex; 478 492 char [] sliceTo; 479 480 for (int k = comp.length-1;k>=1; --k) { 493 494 for (int k = comp.length-1;k>=1; --k) { 481 495 char d = comp[k]; 482 496 if (d == ']') { ++numbracks; } 483 497 if (d == '[') { --numbracks; } 484 498 485 499 if (d == ']' && numbracks == 1) { nextIndex = ""; } 486 500 else if (numbracks == 1 && comp[k-1..k+1]=="..") { 487 501 isSlice = true; 488 502 sliceTo = nextIndex; 489 nextIndex = ""; 503 nextIndex = ""; 490 504 --k; 491 505 } else if ((d == '[' && numbracks==0) || (d==',' && numbracks==1)) { … … 539 553 } 540 554 RevisedExpression revised = remapCompounds(expr, ranks, symbolTable); 541 555 542 556 VecExpressionType exprType = categorizeExpression(revised); 543 557 if (exprType == VecExpressionType.SSE2Expression || exprType == VecExpressionType.SSE1Expression) { … … 560 574 if (c-'A'<tree.symbolTable.length) { 561 575 v = tree.symbolTable[c-'A'].value; 562 } else { // else it's a compound or an indexed array 576 } else { // else it's a compound or an indexed array 563 577 char [] comp = tree.compounds[c-'A'-tree.symbolTable.length]; 564 578 565 579 if (comp[$-1]!=']') { // compound expression (not an indexed array) 566 580 if (comp[0]>='a' && comp[0]<='z') { … … 590 604 bool isSlice = false; 591 605 char [] newSlice; 592 593 for (int k = comp.length-1;k>=1; --k) { 606 607 for (int k = comp.length-1;k>=1; --k) { 594 608 char d = comp[k]; 595 609 if (d == ']') { ++numbracks; } 596 610 if (d == '[') { --numbracks; } 597 611 598 612 if (d==']' && numbracks == 1) { newSlice = ""; } 599 else if (d=='.' && numbracks == 1) { isSlice = true; 600 if(numSlicesRemaining>0){ newSlice = ""; } 613 else if (d=='.' && numbracks == 1) { isSlice = true; 614 if(numSlicesRemaining>0){ newSlice = ""; } 601 615 else newSlice = "." ~ newSlice; 602 616 } … … 645 659 char [] assertions=""; 646 660 int wholerank = exprRank(tree.expression, tree.rank); 647 if (wholerank ==1) { 661 if (wholerank ==1) { 648 662 int lenvec = findVectorForLength(tree); 649 result = "for (int blade_index=0; blade_index<" 663 result = "for (int blade_index=0; blade_index<" 650 664 ~ getDimensionLengthForSymbol(tree.mapping[lenvec], tree, 1) 651 665 ~ "; ++blade_index) {"\n; 652 666 } 653 667 foreach (c; tree.expression) { 654 if (c>='A' && c<'Z') { 668 if (c>='A' && c<'Z') { 655 669 // restore all symbols into the expression 656 670 // If it's a vector, index it … … 667 681 } else result ~= c; 668 682 } 669 if (wholerank==0) return InvocationCode(result, assertions); 683 if (wholerank==0) return InvocationCode(result, assertions); 670 684 return InvocationCode(result ~ "; }", assertions); 671 685 } trunk/blade/BladeDemo.d
r172 r187 1 1 // Written in the D programming language 1.0. 2 2 /** 3 * BLADE 0.3Alpha -- Basic Linear Algebra D Expressions3 * BLADE Alpha -- Basic Linear Algebra D Expressions 4 4 * 5 5 */ … … 9 9 import std.stdio; 10 10 11 // Local arrays in D aren't aligned to 128-bit boundaries.11 // Local arrays in D aren't currently aligned to 128-bit boundaries. 12 12 // In such cases, the library generates an 'SSE misalignment' assert error, 13 13 // to avoid segfaults. 14 14 // Use heap-allocated arrays, or static arrays (DMD 1.023 or later) 15 15 // cdouble[] always remains aligned, even when sliced. 16 16 17 17 void main() 18 { 18 { 19 19 static z = [3.4, 565, 31.3, 0]; 20 20 double [] a = new double[4]; … … 29 29 r[i]= q[i]*2213.3L; 30 30 } 31 double [4][] another = [[33.1, 4543, 43, 878.7], 31 double [4][] another = [[33.1, 4543, 43, 878.7], 32 32 [5.14, 455, 554, 2.43]]; 33 33 real k=3.4; 34 35 34 mixin(vectorize(` a += (d[2..$-1]*2.01*a[2]-another[][1])["abc".length-3..$]`)); 36 mixin(vectorize(" a-= 2.01*( 3.04+k)*r")); 37 35 mixin(vectorize(" a-= 2.01*( 3.04+k)*r")); 36 38 37 mixin(vectorize("q+= q*2.01")); 39 38 mixin(vectorize("another[0]+=r-=another[0]+another[0]")); 40 39 41 40 // All of the next four are equivalent 42 41 mixin(vectorize("a+=6*another[1,0..$]")); 43 42 mixin(vectorize("a+=6*(another[1,0..$]+another[1,0..$])")); 44 43 45 44 46 45 mixin(vectorize("a+=6*another[1][0..$]")); … … 49 48 // I don't think I'll support this syntax long-term. 50 49 mixin(vectorize("a+=6*another[1,[0,$]]")); 51 50 52 51 // Strided vector 53 52 mixin(vectorize("another[0..$,1]=6*a[0..2]")); … … 61 60 mixin(vectorize("a = -a")); 62 61 mixin(vectorize("u = sum(sqrt(abs(p))) + sum(sqrt(abs(q)))")); 62 mixin(vectorize("u = prod(q)")); 63 writefln("a=", a); 63 64 64 writefln("a=", a);65 65 } trunk/blade/BladeRank.d
r172 r187 27 27 else if (s[i]==')') --paren; 28 28 if (paren==0 && s[i]==']') { 29 if (startIndex && hasSliced) return true; 29 if (startIndex && hasSliced) return true; 30 30 numbrack--; 31 if (s[i-1]=='[') { startIndex=false; } 31 if (s[i-1]=='[') { startIndex=false; } 32 32 } 33 33 if (paren==0 && s[i]=='[') { … … 36 36 numbrack++; 37 37 } 38 if (paren==0 && numbrack==1 && s[i]==',') { 38 if (paren==0 && numbrack==1 && s[i]==',') { 39 39 if (hasSliced && startIndex) return true; 40 40 if (maybeSlice) hasSliced = true; … … 83 83 enum RankError : int { 84 84 UnsupportedOperation = -1, 85 RankIncrement = -2, 85 RankIncrement = -2, 86 86 AttemptToIndexAScalar = -3, 87 87 NonScalarIndex = -4, 88 88 NonScalarSlice = -5, 89 89 DotDotExpected = -6, 90 CommaExpected = -7, 90 CommaExpected = -7, 91 91 RankMismatch = -8, 92 92 RankMismatchConcatenation = -9, … … 132 132 auto rrank = doVisit(this_, args[1]); 133 133 if (rrank<0) return rrank; // propagate errors 134 if (lrank!=1 || rrank!=1) return RankError.RankMismatchDotProduct; 134 if (lrank!=1 || rrank!=1) return RankError.RankMismatchDotProduct; 135 135 return 0; 136 136 case "sum": 137 case "prod": 137 138 auto lrank = doVisit(this_,args[0]); 138 139 if (lrank<0) return lrank; // propagate errors … … 146 147 assert(0, "BLADE ICE: Unsupported function:" ~ func); 147 148 return 0; 148 } 149 } 149 150 } 150 151 ReturnType onVisitPrefix(This this_, char [] op, char [] expr) { … … 159 160 return RankError.RankIncrement; 160 161 } 161 // Includes multi-dimensional slicing and indexing. 162 // Includes multi-dimensional slicing and indexing. 162 163 ReturnType onVisitIndex(This this_, char [] base, char [][2][] slices) { 163 164 int totrank = doVisit(this_, base); 164 165 for(int i=0; i<slices.length; ++i) { 165 166 int r = doVisit(this_,slices[i][0]); 166 if (r!=0) return (r<0)? r :RankError.NonScalarIndex; 167 if (r!=0) return (r<0)? r :RankError.NonScalarIndex; 167 168 if (slices[i][1]==""){ 168 169 --totrank; 169 170 } else { 170 171 r = doVisit(this_,slices[i][1]); 171 if (r!=0) return (r<0)?r:RankError.NonScalarSlice; 172 if (r!=0) return (r<0)?r:RankError.NonScalarSlice; 172 173 } 173 174 } … … 186 187 } 187 188 if (op=="~") { // concatentating scalars and vectors, or vectors and matrices, is permitted 188 if (lrank==rrank || lrank==(rrank+1) || rrank==(lrank+1)) 189 if (lrank==rrank || lrank==(rrank+1) || rrank==(lrank+1)) 189 190 return (lrank>rrank)? lrank: rrank; 190 191 else return RankError.RankMismatchConcatenation; … … 211 212 } 212 213 213 unittest { 214 unittest { 214 215 assert(exprRank("(A[B..C])[C]", "300")==2); 215 216 assert(exprRank("A+=(A[B..C])", "300")==3); 216 217 assert(exprRank("A+(B*C)", "000")==0); 217 218 assert(exprRank("A+(B*C)", "000")==0); 218 219 assert(exprRank("A=(B*C)", "202")==2); 219 220 assert(exprRank("B*=(C*A)", "010")==1); … … 221 222 assert(exprRank("D+=((A+C)*B)", "2022")==2); 222 223 assert(exprRank("D+=((A&C)*B)", "0101")==1); 223 224 224 225 assert(exprRank("C~=(((A[B])[B])~C)", "302")==2); 225 226 assert(exprRank("((D[E])[E])+(-((C[B])[B..E]))", "202300")==1); 226 227 227 228 assert(exprRank("A+((((++B)+D)--)*C)", "1010")==1); 228 229 229 230 assert(exprRank("C+=(A[B])", "302")==2); 230 231 assert(exprRank("dot(A)", "1")==RankError.CommaExpected); 231 232 assert(exprRank("dot(A,B)", "10")==RankError.RankMismatchDotProduct); 232 233 assert(exprRank("dot(B,(A*(dot(B,B))))", "11")==0); 234 233 234 assert(exprRank("dot(B,(A*(dot(B,B))))", "11")==0); 235 236 assert(exprRank("prod(A*B)", "10")==0); 237 235 238 assert(exprRank("A[B,B,B]", "60")==3); 236 239 assert(exprRank("A[B,B,C,B]", "600")==2); 237 240 assert(exprRank("A+=(B[C..$])", "110")==1); 238 241 assert(exprRank("A+=(B[C,D..$])", "2300")==2); 239 242 240 243 // bug fixes: 241 244 assert(exprRank("(A[B..$,C])+=D", "2001")==1); trunk/blade/BladeSimplify.d
r175 r187 14 14 * be moved to every vector inside A. 15 15 * - Use associativity of *: A*(B*C[]) == (A*B)*C[] (Not strictly true for 16 * floating point; results may differ by 1ulp, 16 * floating point; results may differ by 1ulp, 17 17 * eg (1.3L*3.1L)*4.7L < 1.3L*(3.1L*4.7L) 18 18 * Note that floating point addition is not associative at all). 19 19 * - Remove unary minus where possible, eg A-(-B) => A+B, abs(-A) => abs(A). 20 * - Use associativity of * in intrinsics: 20 * - Use associativity of * in intrinsics: 21 21 * sum(A*V) => A*sum(V), abs(A*B) => abs(A)*abs(B) 22 * (D) Expression standardisation 22 * (D) Expression standardisation 23 23 * - Move multiplies to left: Convert A[]*B into B*A[] (assumes * is commutative, 24 24 * not valid for quaternions). … … 54 54 { 55 55 return str=="dot" || str=="sum" || str=="max" || str=="min" 56 || str=="abs" || str=="sqrt" ;56 || str=="abs" || str=="sqrt" || str=="prod"; 57 57 } 58 58 … … 68 68 } 69 69 // Check for undefined symbols 70 if (err.length > 0) 70 if (err.length > 0) 71 71 return RevisedExpression(tree.expression, "", tree.symbolTable, [""], "","", "Undefined symbols:" ~ err); 72 72 else { … … 119 119 } else e~=c; 120 120 } 121 return e; 121 return e; 122 122 } 123 123 … … 171 171 } 172 172 --k; 173 char [] newexpr = expr[i+1..k]; // strip off the {} 173 char [] newexpr = expr[i+1..k]; // strip off the {} 174 174 int newi = k; 175 175 if (i>0 && k<expr.length-1 && expr[i-1]=='(' && expr[k+1]==')') { … … 184 184 ++next; 185 185 comp ~= expr[i+1..k]; // strip off the {} 186 if (expr[k-1]==']') { 186 if (expr[k-1]==']') { 187 187 // it's a vector/matrix of some kind, with rank reduced 188 188 // by indices. Can't just use exprRank, because the [] … … 192 192 // it's a scalar expression. Note that it could involve 193 193 // a vector expression. 194 r~='0'; 195 } 194 r~='0'; 195 } 196 196 } else e ~= cast(char)('A'+z+rank.length); 197 197 i = newi; … … 202 202 } 203 203 // Create a mapping from old to new variable names 204 204 205 205 char [] old_ranks = ""; 206 206 char [] mapping=""; … … 235 235 } 236 236 237 unittest { 237 unittest { 238 238 RevisedExpression e = simplifyVectorExpression("A+=(((D[B])*C)[B])", "2004",[]); 239 239 assert(e.rank=="202"); … … 281 281 assert(sym!="$" && this_.rank[sym[0]-'A']>'0', "Rank error " ~ sym); 282 282 // Note: Later, we'll want this to be a new terminal. 283 return sym ~ createMultiSlice(this_.slicing); 283 return sym ~ createMultiSlice(this_.slicing); 284 284 } 285 285 } … … 301 301 return wrapInParens(doVisit(this_, expr)) ~ op; 302 302 } 303 // Includes multi-dimensional slicing and indexing. 303 // Includes multi-dimensional slicing and indexing. 304 304 ReturnType onVisitIndex(This this_, char [] base, char [][2][] slices) { 305 305 if (slices.length==0) { // [] -- has no effect. … … 311 311 // with the earliest existing dimension. 312 312 // * If the existing dimension is an index, 313 // it might contain a dollar, which we need to replace. 313 // it might contain a dollar, which we need to replace. 314 314 // * If the existing dimension is a slice, the two slices will combine. 315 315 // … … 331 331 newslice ~= [a ~ "+" ~ c, ""]; 332 332 } 333 if (slices.length>1) { 333 if (slices.length>1) { 334 334 // append other slices, if any. 335 335 return doVisit(IndexFoldingVisitor(this_.rank, "$", slices[0..$-1] ~ newslice ~ this_.slicing[1..$]), base); … … 360 360 assert(lrank>0 && rrank>0 && lrank<=2 && rrank<=2, "BLADE ICE: Tensor*tensor is unsupported"); 361 361 bool isDotProduct = false; // was it reduced to a dot product? 362 362 363 363 // In the case of chained matrix multiplies, we can end up with an empty slice. 364 364 if (this_.slicing.length>0 && this_.slicing[$-1][0]=="") { … … 368 368 // First dimension applies to rows of the left operand 369 369 // If it's a slice, it will be a strided slice -- unless 370 // it comes from another matrix multiply, in which case the 370 // it comes from another matrix multiply, in which case the 371 371 // stride will drop out. (A[x]*B is strided). 372 372 char [][2][] newslice=[]; … … 390 390 second = wrapInParens(doVisit(IndexFoldingVisitor(this_.rank, this_.dollar,[]), right)); 391 391 } 392 } else if (lrank==1 && rrank==2) { 392 } else if (lrank==1 && rrank==2) { 393 393 // vector * matrix, Matrix uses all the slicing 394 394 second = wrapInParens(doVisit(this_, right)); … … 403 403 } 404 404 } 405 } else { 406 // in DMD1.024, nasty compiler bug if you save the first & second results into a local variable 405 } else { // not a multiplication 407 406 return wrapInParens(doVisit(this_, left)) ~ op ~ wrapInParens(doVisit(this_, right)); 408 407 } … … 416 415 } 417 416 418 unittest { 417 unittest { 419 418 assert(foldIndices("((A[C..D])+B)[($-E)]", "21000")=="(A[C+((D-C)-E)])+(B[($-E)])"); 420 419 assert(foldIndices("(A[C])[D]", "3100")=="A[C,D]"); … … 427 426 assert(foldIndices("A[,B..$,C]", "300")=="A[,B..$,C]"); 428 427 // Multidimensional slicing 429 assert(foldIndices("(C*((A*B)[C]))[D]", "2200")=="C*dot((A[C,]),(B[D]))"); 428 assert(foldIndices("(C*((A*B)[C]))[D]", "2200")=="C*dot((A[C,]),(B[D]))"); 430 429 assert(foldIndices("(A*B)[C..D,D]", "2200")=="(A[C..D,])*(B[D])"); 431 430 assert(foldIndices("(A*B)[C..D]", "2200")=="(A[C..D,])*B"); … … 433 432 assert(foldIndices("(A*B)[C..D]", "1200")=="A*(B[C..D])"); 434 433 assert(foldIndices("(A*B)[C]", "120")=="dot(A,(B[C]))"); 435 434 436 435 assert(foldIndices("((A*B)*C)[D]", "2220")=="((A[D,])*B)*C"); 437 436 assert(foldIndices("((A+B)*C)[D]", "2220")=="((A[D,])+(B[D,]))*C"); 438 437 assert(foldIndices("((D*A)*B)[C]", "2100")=="dot((D*(A[C,])),B)"); 439 assert(foldIndices("(((A*B)*C)[D..E])[D]", "12200")=="dot((A*B),(C[D+D]))"); 438 assert(foldIndices("(((A*B)*C)[D..E])[D]", "12200")=="dot((A*B),(C[D+D]))"); 440 439 assert(foldIndices("A+=(((D[B])*C)[B])", "2004")=="A+=((D[B,B])*C)"); 441 440 assert(foldIndices("dot(A,(A*dot(A,A)))","1")=="dot(A,(A*dot(A,A)))"); … … 466 465 ScalarFold right = doVisit(this_, args[1]); 467 466 return ScalarFold("", combineMul(combineMul(left.multiplier, right.multiplier), "{" ~ func ~ "(" ~ wrapInParens(left.expr) ~ "," ~ wrapInParens(right.expr) ~ ")}")); 468 case "sum": 467 case "sum": 468 case "prod": 469 469 // sum(A*V) = A*sum(V) is a scalar. 470 // prod(A*V) = A*prod(V) is a scalar. 470 471 return ScalarFold("", combineMul(left.multiplier, "{" ~ func ~ "(" ~ wrapInParens(left.expr) ~ ")}")); 471 472 case "abs": … … 483 484 case "max": 484 485 case "min": // max(A*B) can't be simplified unless we know that they are not negative. 485 return ScalarFold("", "{" ~ func ~ "(" ~ combineMulWithCompound(left.expr, left.multiplier) ~ ")}"); 486 // return ScalarFold("", "{" ~ func ~ "(@>" ~ left.expr ~ "@" ~ left.multiplier ~ "<@)}"); 486 return ScalarFold("", "{" ~ func ~ "(" ~ combineMulWithCompound(left.expr, left.multiplier) ~ ")}"); 487 // return ScalarFold("", "{" ~ func ~ "(@>" ~ left.expr ~ "@" ~ left.multiplier ~ "<@)}"); 487 488 default: 488 489 assert(0, "BLADE: Unsupported function"); 489 490 return ScalarFold("",""); 490 491 } 491 } 492 } 492 493 ReturnType onVisitPrefix(This this_, char [] op, char [] expr) { 493 494 if (op=="-") { … … 498 499 else return ScalarFold(left.expr, "-"); 499 500 } else if (op=="+") { // just ignore unary plus 500 return doVisit(this_, expr); 501 return doVisit(this_, expr); 501 502 } else { 502 503 ScalarFold f = doVisit(this_, expr); … … 530 531 assert(first.multiplier=="" && second.expr=="", "BLADE ICE"); 531 532 assert(second.multiplier!="-", "BLADE ICE"); // this would be a*=-b, where b is a vector 532 if (second.multiplier.length>1) return ScalarFold(wrapInParens(first.expr) ~ op ~ "{" ~ wrapInParens(second.multiplier) ~ "}",""); 533 else return ScalarFold(wrapInParens(first.expr) ~ op ~ wrapInParens(second.multiplier),""); 533 if (second.multiplier.length>1) return ScalarFold(wrapInParens(first.expr) ~ op ~ "{" ~ wrapInParens(second.multiplier) ~ "}",""); 534 else return ScalarFold(wrapInParens(first.expr) ~ op ~ wrapInParens(second.multiplier),""); 534 535 } 535 536 if (op=="*") { … … 588 589 assert(left.length>0); 589 590 if (right.length==0) return left; 590 if (right=="-") return "-" ~ wrapInParens(left); 591 if (right=="-") return "-" ~ wrapInParens(left); 591 592 if (right.length==1) return wrapInParens(left) ~ "*" ~ right; 592 return wrapInParens(left) ~ "*{" ~ wrapInParens(right) ~ "}"; 593 return wrapInParens(left) ~ "*{" ~ wrapInParens(right) ~ "}"; 593 594 } 594 595 trunk/blade/CodegenX86.d
r172 r187 1 1 // Written in the D programming language 1.0 2 2 /** 3 * BLADE 0.4Alpha -- Basic Linear Algebra D Expressions3 * BLADE Alpha -- Basic Linear Algebra D Expressions 4 4 * 5 5 * Generate near-optimal x87/SSE/SSE2 asm code for BLAS1 basic vector operations … … 21 21 * 22 22 * BUGS/ FUTURE DIRECTIONS: 23 * None of these support dot product, ormatrix operations.23 * None of these support matrix operations. 24 24 * X87: 25 25 * - Not optimal for the case of multiple real vectors (they could share a counter). … … 30 30 * (to do this, need naked asm with no stack frame). 31 31 * SSE/SSE2: 32 * - SSE functions don't support unaligned data. Need to generate seperate code 32 * - SSE functions don't support unaligned data. Need to generate seperate code 33 33 * for that case (NOTE: probably only worth doing for small expressions). 34 34 * … … 191 191 // (max # temporaries + max # real scalars) must be <=8, otherwise FPU stack 192 192 // will overflow). 193 const int MAX_87_REALSCALARSPLUSTEMPORARIES = 8; 193 const int MAX_87_REALSCALARSPLUSTEMPORARIES = 8; 194 194 195 195 private: … … 213 213 // indexed by i. 214 214 char [] indexedVector(char [][] typelist, char [] ranklist, char [] stridelist, char var) 215 { 215 { 216 216 if (typelist[var-'A']=="real") return " real ptr [" ~ vectorRegister[vectorNum(ranklist, var)] ~ "]"; 217 217 else if (stridelist[var-'A']=='1') return operandSize(typelist[var-'A']) ~ "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ "]"; … … 264 264 (Pentium, PMMX, PII, PIII). It is also optimal for recent x86 CPUs 265 265 where vector sizes are mixed. 266 266 267 267 There are two cases: 268 268 (A) DAXPY-style loops, where every element is independent of the other indices; … … 271 271 For cumulative loops, best performance is achieved with loop unrolling and 272 272 multiple accumulators, in order to break dependency chains. 273 273 274 274 The key optimisation rules for DAXPY loops are: 275 275 1. keep the loop overhead to one clock cycle if possible. … … 304 304 ranklist~="1"; 305 305 typelist ~= typeof(T[0]).stringof; 306 } else static if (is(typeof(T.data))) { 306 } else static if (is(typeof(T.data))) { 307 307 stridelist~="1"; 308 308 ranklist~="1"; … … 323 323 char [] result=""; 324 324 char [] incrementRealVectors=""; 325 325 326 326 result ~= "// Operation : " ~ operations ~ \n; 327 327 328 328 // Create local variables for pointers to vectors (avoid bug #1125) 329 329 … … 361 361 ~ " add " ~ vectorRegister[numvecs] ~ ", values[" ~ itoa(i) ~ "];"; 362 362 } 363 result ~= " //" ~ cast(char)('A'+i) ~ \n; 363 result ~= " //" ~ cast(char)('A'+i) ~ \n; 364 364 ++numvecs; 365 365 } else if (typelist[i]=="real") { … … 367 367 ++numconsts; 368 368 ++numScalarsOnStack; 369 result ~= " //" ~ cast(char)('A'+i) ~ \n; 369 result ~= " //" ~ cast(char)('A'+i) ~ \n; 370 370 } 371 371 } … … 376 376 int numOnStack = 0; // How much of the FP stack is being used? 377 377 378 bool isCumulative = (operations[0]=='0' );378 bool isCumulative = (operations[0]=='0' || operations[0]=='1'); 379 379 if (operations[0]=='0') { 380 380 result ~= " fldz;"\n; // dot product 381 381 ++numOnStack; 382 382 done = 1; 383 } else if (operations[0]=='1') { 384 result ~= " fld1;"\n; // prod 385 ++numOnStack; 386 done = 1; 383 387 } 384 388 result ~= " xor EAX, EAX; "\n … … 389 393 // the final storage instruction, because of the FST latency). 390 394 char [] mainbody = ""; 391 395 392 396 while(done<operations.length) { 393 397 char [] next; … … 420 424 } else if (operations[done]==',') { 421 425 mainbody ~= " " ~ opToX87[operations[done+1]] ~ " ST, ST(0); // dup " ~ operations[done+1] ~ \n; 422 done+=2; 426 done+=2; 423 427 } else if (ranklist[operations[done]-'A']=='1') { 424 428 // An operation will be performed between the stack top and a vector. … … 430 434 // it chains. 431 435 next = ((done+2 == operations.length)? " fstp " : " fst ") 432 ~ indexedVector(typelist, ranklist, stridelist, operations[$-2] ) ~ comment; 436 ~ indexedVector(typelist, ranklist, stridelist, operations[$-2] ) ~ comment; 433 437 } else if (typelist[operations[done]-'A']=="real") { 434 438 // 80-bit vectors must be loaded onto the FPU stack first … … 445 449 // Multiply by real scalar, which is already on the stack. 446 450 next = " fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - realScalarNum(typelist, ranklist, operations[done]-'A')-1) ~ "); // * " ~ operations[done] ~ \n; 447 mainbody ~= next; 451 mainbody ~= next; 448 452 } else { 449 453 // For scalar float or double values, we can multiply directly, saving one slot on the FP stack. … … 452 456 } 453 457 done +=2; 454 } 455 } 456 457 result ~= \n 458 ~ " align 4;\n" 458 } 459 } 460 461 result ~= \n 462 ~ " align 4;\n" 459 463 ~ "L1:\n" ~ mainbody; 460 464 461 465 // if (cumulatingOp) result ~= " " ~ opToX87[cumulatingOp] ~ "p ST(2), ST;"\n; 462 466 … … 472 476 473 477 result~= "L3:" \n ~ popRegisters(vecnum) ~ "}\r\n"; 474
