Changeset 168
- Timestamp:
- 01/04/08 07:56:21 (8 months ago)
- Files:
-
- trunk/blade/Blade.d (modified) (7 diffs)
- trunk/blade/BladeDemo.d (modified) (3 diffs)
- trunk/blade/BladeSimplify.d (modified) (1 diff)
- trunk/blade/CodegenX86.d (modified) (12 diffs)
- trunk/blade/PostfixX86.d (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/blade/Blade.d
r167 r168 23 23 * - A 'const folding' (actually vector/scalar folding) step is performed. 24 24 * 25 * SPEED/ACCURACY TRADEOFF: 26 * IEEE floating point multiplication and addition are not associative. 27 * Assuming that overflow and underflow do not occur: 28 * (a*b)*c may differ from a*(b*c) in the last bit. 29 * (a+b)+c may differ from a+(b+c) by a factor of a million or more. 30 * 31 * - Multiplication is assumed to be associative. 32 * - Addition and subtraction are not treated as associative. 33 * Except: Addition inside a dot product or vector sum is treated as associative. 34 * 35 * BUGS: 36 * Need to create asserts for nested expressions as well as for the primary one. 37 * 25 38 * FUTURE DIRECTIONS (in order of expected implementation): 26 * - Dot product (which was present in BLADE 0.2). 39 * - sum(), trace() 40 * - Loop unrolling for cumulative operations dot, sum, trace. 27 41 * - Dense matrix support. 28 42 * - Triangular, banded, symmetric, and sparse matrix support … … 39 53 * 0.2 - Support for a wider variety of expressions. Dot product, imaginary numbers, etc. 40 54 * 0.3 - Based on string mixins. Most of the new features of 0.2 are gone, but SSE2 is added. 41 * 0.4 - Added D code generator. Nice error messages. Optimal parameter passing 55 * 0.4 - Added D code generator. Nice error messages. Optimal parameter passing. 42 56 * (passes pointers, not arrays). 43 * 0.5 - Expression simplification step. 57 * 0.5 - Expression simplification step. Slicing support. 58 * 0.6 - Dot product, nested expressions. 
44 59 */ 45 60 … … 55 70 private import blade.BladeVisitor: expressionContainsAssignment; 56 71 57 private import blade.PostfixX86 : makePostfixForX87 ;72 private import blade.PostfixX86 : makePostfixForX87, makePostfixForSSE; 58 73 59 74 public: … … 98 113 } 99 114 115 template SSERetType(int SSEVersion, char [] expr) { 116 static if (expr[0]!='0') alias void SSERetType; 117 else static if (SSEVersion==1) alias float SSERetType; 118 else alias double SSERetType; 119 } 120 template X87RetType(char [] expr) { 121 static if (expr[0]!='0') alias void X87RetType; 122 else alias real X87RetType; 123 } 124 100 125 // These functions have the complete expression encoded in the template type. 101 126 // One of these functions is instantiated for each expression. … … 107 132 * Every member of the Values tuple must only be double or double *. 108 133 */ 109 voidSSEVECGEN(int SSEVersion, char [] expr, Values...)(int veclength, Values values) {134 SSERetType!(SSEVersion, expr) SSEVECGEN(int SSEVersion, char [] expr, Values...)(int veclength, Values values) { 110 135 debug(BladeBackEnd) { 111 136 pragma(msg, generateCodeForSSE!(Values)(SSEVersion, expr)); … … 117 142 * Every member of the Values tuple must only be real, float[], double [], or real[]. 118 143 */ 119 voidX87VECGEN(char [] expr, int numStrides, Values...)(int veclength, Values values) {144 X87RetType!(expr) X87VECGEN(char [] expr, int numStrides, Values...)(int veclength, Values values) { 120 145 debug(BladeBackEnd) { 121 146 pragma(msg, generateCodeForAsmX87!(numStrides, Values)(expr)); … … 307 332 char [] invokeSSE(bool SSE2, RevisedExpression tree) 308 333 { 309 char [] result = "SSEVECGEN!(" ~ (SSE2?"2":"1") ~ `,"` ~ enquote(tree.expression) ~ `"`;334 char [] result = "SSEVECGEN!(" ~ (SSE2?"2":"1") ~ `,"` ~ enquote(makePostfixForSSE(tree.expression, tree.rank)) ~ `"`; 310 335 // For SSE2, everything must be implicitly convertible to double. 311 336 char [] vals; trunk/blade/BladeDemo.d
r167 r168 14 14 // Use heap-allocated arrays, or static arrays (DMD 1.023 or later) 15 15 // cdouble[] always remains aligned, even when sliced. 16 17 float dot_product(float[] a, float[] b)18 {19 return 0;20 }21 16 22 17 void main() … … 30 25 q[0..$]= [17.0f, 28.25, 1, 0]; 31 26 float [4] r; 32 idouble [] p = [2.3i, 254i, 0.1i, 1.2i];27 real [] p = [2.3, 254, 0.1, 1.2]; 33 28 for(int i=0; i<r.length;++i) { 34 29 r[i]= q[i]*2213.3L; … … 57 52 mixin(vectorize("another[0..$,1]=6*a[0..2]")); 58 53 59 // Parses, and simplifies to A*A, where A = dot(q,q). No asm codegen yet. 54 // Simplifies to q*= 2*dot(q,q)*dot(q,q). 55 mixin(vectorize("q *=dot(q,q*dot(2*q,q))")); 60 56 double u; 61 //mixin(vectorize("u = dot(q,q*dot(q,q))"));62 // mixin(vectorize("q *=dot(q,q*dot(q+q,q))"));57 mixin(vectorize("u = dot(q,q*dot(q,q))")); 58 mixin(vectorize("u = dot(a, q)")); 63 59 64 60 writefln("a=", a); trunk/blade/BladeSimplify.d
r167 r168 455 455 ScalarFold left = doVisit(this_,args[0]); 456 456 ScalarFold right = doVisit(this_, args[1]); 457 return ScalarFold("", combineMul(combineMul(left.multiplier, right.multiplier), "{" ~ func ~ "(" ~ left.expr ~ "," ~ right.expr~ ")}"));457 return ScalarFold("", combineMul(combineMul(left.multiplier, right.multiplier), "{" ~ func ~ "(" ~ wrapInParens(left.expr) ~ "," ~ wrapInParens(right.expr) ~ ")}")); 458 458 } else { 459 459 assert(0, "BLADE: Unsupported function"); trunk/blade/CodegenX86.d
r167 r168 297 297 static if (is(typeof(T[0]))) { 298 298 stridelist~="0"; 299 ranklist~="1"; 299 ranklist~="1"; 300 300 typelist ~= typeof(T[0]).stringof; 301 301 } else static if (is(typeof(T.data))) { … … 314 314 private: 315 315 // This is split off from the template to make code coverage easier. 316 char [] generateCodeForAsmX87Impl(char [] ranklist, char [][] typelist, char [] stridelist, char [] operations , char cumulatingOp=0)316 char [] generateCodeForAsmX87Impl(char [] ranklist, char [][] typelist, char [] stridelist, char [] operations) 317 317 { 318 318 char [] result=""; … … 365 365 } 366 366 } 367 if (cumulatingOp=='+') { 367 int done=0; 368 369 // We need to keep track of how many things are on the FPU stack. 370 // Every time something is pushed, the indices of our variables change! 371 int numOnStack = 0; // How much of the FP stack is being used? 372 373 bool isDotProduct = (operations[0]=='0'); 374 if (operations[0]=='0') { 368 375 result ~= " fldz;"\n; // dot product 369 } else if (cumulatingOp=='*') { // trace370 result ~= "fld1;"\n;376 ++numOnStack; 377 done = 1; 371 378 } 372 379 result ~= " xor EAX, EAX; "\n 373 380 " sub EAX, veclength; // counter=-length"\n 374 381 " jz short L3; // test for length==0"\n; 375 int done=0;376 382 377 383 // Construct the main body of the loop (the main body does not include 378 384 // the final storage instruction, because of the FST latency). 379 385 char [] mainbody = ""; 380 381 // We need to keep track of how many things are on the FPU stack. 382 // Every time something is pushed, the indices of our variables change! 383 int numOnStack = 0; // How much of the FP stack is being used? 
384 386 385 387 while(done<operations.length) { 386 388 char [] next; … … 443 445 ~ "L1:\n" ~ mainbody; 444 446 445 if (cumulatingOp) result ~= " " ~ opToX87[cumulatingOp] ~ "p ST(2), ST;"\n;447 // if (cumulatingOp) result ~= " " ~ opToX87[cumulatingOp] ~ "p ST(2), ST;"\n; 446 448 447 449 result ~= incrementRealVectors // Update the counters … … 449 451 450 452 // Discard any scalars that are left on the stack 451 if ( cumulatingOp!=0&& numScalarsOnStack>0) {453 if (isDotProduct && numScalarsOnStack>0) { 452 454 // Preserve the result of the dot product 453 455 result ~= " fxch ST(" ~ itoa(numScalarsOnStack) ~ "), ST;"\n; … … 470 472 * At entry, all vector parameters are aligned. 471 473 */ 472 char [] generateCodeForSSE(Values...)(int SSEVer, char [] infixOperations)474 char [] generateCodeForSSE(Values...)(int SSEVer, char [] operations) 473 475 { 474 476 char [] ranklist; … … 476 478 static if (is(typeof(T[0]))) ranklist~="1"; else ranklist~="0"; 477 479 } 478 return generateCodeForSSEImpl(SSEVer, ranklist, makePostfixForSSE(infixOperations, ranklist)); 480 return generateCodeForSSEImpl(SSEVer, ranklist, operations); 481 // makePostfixForSSE(infixOperations, ranklist)); 479 482 } 480 483 … … 489 492 int numvecs = countVectors(ranklist); 490 493 int numScalarsOnStack=0; 494 bool isDotProduct = (operations[0]=='0'); 495 if (isDotProduct) result ~= ((SSEVer == 2)? " double" : " float") ~" sum;"\n; 491 496 492 497 result~= \n"asm {"\n ~ pushRegisters(numvecs); … … 516 521 } 517 522 } 518 result ~= " xor EAX, EAX; "\n519 " sub EAX, veclength; // counter=-length"\n520 " jz short L2; // test for length==0"\n;521 int done=0;522 523 523 524 char [] mainbody = ""; … … 530 531 531 532 int numOnStack = numScalarsOnStack; // How much of the FP stack is being used? 
533 int done=0; 534 if (operations[0]=='0') { 535 result ~= " pxor " ~ XMM(numOnStack) ~ "," ~ XMM(numOnStack) ~ "; // 0\n"; 536 ++numOnStack; 537 ++done; 538 } 539 result ~= " xor EAX, EAX; "\n 540 " sub EAX, veclength; // counter=-length"\n 541 " jz short L2; // test for length==0"\n; 532 542 while(done<operations.length) { 533 543 char [] comment; … … 552 562 mainbody ~= " movap" ~ suffix ~ indexedSSEVector(ranklist, operations[$-2], vectorsize) ~ ", XMM" ~ itoa(numOnStack-1) ~ comment; 553 563 extra ~= " movs" ~ suffix ~ indexedSSENext(ranklist, operations[$-2], vectorsize) ~ ", XMM" ~ itoa(numOnStack-1) ~ comment; 564 } else 565 if (operations[done-1]==operations[done]) { 566 // operation on self, eg XX+ --> don't need to load it again. 567 int cumvector = (operations[done-1]=='0')? numScalarsOnStack : numOnStack-1; 568 mainbody ~= " " ~ opToSSE[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " 569 ~ XMM(numOnStack-1) ~ comment; 570 extra ~= " " ~ opToSSESingle[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " 571 ~ XMM(numOnStack-1) ~ comment; 554 572 } else { 555 573 mainbody ~= " " ~ opToSSE[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " … … 579 597 result ~= " add EAX,4;\n" ~ " js L1;\n" 580 598 ~ "L2:\n sub EAX, 4;\n jns L4;\n" 581 // Now the extra calculations for the 0-3 float , or 0-1 doubles599 // Now the extra calculations for the 0-3 floats 582 600 ~ "L3:"\n ~ extra 583 601 ~ " add EAX,1;\n js L3;\n"; 584 602 } 585 result~= "L4:" \n ~ popRegisters(numvecs) ~ "}\n"; 603 result ~= "L4:" \n; 604 if (isDotProduct) { 605 // Result is now in XMM(numScalarsOnStack). We need to do a horizontal 606 // add to get the final sum. 607 if (SSEVer==2) { 608 // For SSE3, use haddpd XMM(numScalarsOnStack). 
609 result ~= " movhlps " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ";"\n 610 ~ " addsd " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n"; 611 } else { // floats 612 result ~= " movhlps " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ";"\n 613 ~ " addps " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n" 614 ~ " pshufd " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ",1;"\n 615 ~ " addss " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n"; 616 } 617 result ~= " movs" ~ suffix ~ " sum," ~ XMM(numScalarsOnStack) ~ ";"\n; 618 //result ~= "// Move to ST(0)\n"; 619 } 620 result ~= popRegisters(numvecs) ~ "}\n"; 621 if (isDotProduct) result ~= " return sum;"\n; 586 622 587 623 return result; trunk/blade/PostfixX86.d
r166 r168 42 42 } 43 43 ReturnType onVisitFunction(This this_, char [] func, char [][] args) { 44 if (func=="d") { 45 return "0" ~ doVisit(this_,args[0]) ~ doVisit(this_, args[1]) ~ "*+"; 46 } 44 47 assert(0, "BLADE ICE: Unsupported"); 45 48 } … … 114 117 return sym; 115 118 } 116 ReturnType onVisitFunction(This this_, char [] func, char [][] args) { 119 ReturnType onVisitFunction(This this_, char [] func, char [][] args) { 120 if (func=="d") { 121 return "0" ~ doVisit(this_,args[0]) ~ doVisit(this_, args[1]) ~ "*+"; 122 } 117 123 assert(0, "BLADE ICE: Unsupported"); 118 124 }
