Changeset 125
- Timestamp: 11/01/07 09:01:25 (10 months ago)
- Files:
- trunk/blade/Blade.d (modified) (8 diffs)
- trunk/blade/BladeDemo.d (modified) (1 diff)
- trunk/blade/BladeRank.d (modified) (1 diff)
- trunk/blade/CodegenX86.d (modified) (9 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/blade/Blade.d
r124 r125 1 1 // Written in the D programming language 1.0 2 2 /** 3 * BLADE 0. 3Alpha -- Basic Linear Algebra D Expressions3 * BLADE 0.4Alpha -- Basic Linear Algebra D Expressions 4 4 * 5 5 * Generate near-optimal x87/SSE2 asm code for BLAS1 basic vector operations at compile time. … … 17 17 * - Generates either x87 asm code, SSE2 asm code or pure D, depending on the complexity of 18 18 * the expression, and the availability of inline asm. 19 * - 80-bit precision is used whenever possible.20 * - Supports mixed-length operations (eg, real[] + double[] + float[]).21 *22 * TWO STEPS FORWARD, ONE STEP BACK:23 * The previous version supported additional functionality. This has not yet been24 * added after refactoring.25 * - Supports real, complex and imaginary vectors, and detects type mismatches between them.26 19 * - When static arrays are used, mismatches in array length are detected 27 20 * at compile time. 28 * 29 * BUGS/ FUTURE DIRECTIONS:30 * The x87 code generation targets early Pentiums, which are now irrelevant.31 * It needs to be updated to PM/Core2 (this will significantly simplify it).32 * - Not optimal for the case of multiple real vectors (they could share a counter).33 * - Not optimal for the case where all vectors are 80-bit (two counters are used, but only is required).34 * - Doesn't take full advantage of length being known at compile time (loop unrolling35 * is possible).36 * - Doesn't use EBP register -- this would allow an extra vector in expressions.37 * (to do this, need naked asm with no stack frame).21 * - Error messages refer to the line of user code which generated the error. 22 * The library never produces a torrent of undecipherable template error messages. 23 * 24 * FUTURE DIRECTIONS (in order of expected implementation): 25 * - SSE1 code could be used for float operations. 26 * - Dot product (which was present in BLADE 0.2). 27 * - The x87 code generation targets early Pentiums, which are now irrelevant. 
28 * - A 'const folding' (actually vector/scalar folding) step needs to be performed. 29 * - Support for strided vectors. 30 * - Matrix support. 38 31 * 39 32 * THEORY: … … 48 41 * 0.2 - Support for a wider variety of expressions. Dot product, imaginary numbers, etc. 49 42 * 0.3 - Based on string mixins. Most of the new features of 0.2 are gone, but SSE2 is added. 43 * 0.4 - Added D code generator. Nice error messages. Optimal parameter passing. 50 44 */ 51 45 … … 55 49 private import blade.BladeUtil : wrapInQuotes, startsWith; 56 50 private import blade.BladeRank; 57 public import blade.CodegenX86 : generateCodeForAsmX87, generateCodeForSSE 2, MAX_X87_VECTORS, MAX_SSE_VECTORS;51 public import blade.CodegenX86 : generateCodeForAsmX87, generateCodeForSSE, MAX_X87_VECTORS, MAX_SSE_VECTORS; 58 52 59 53 public: … … 64 58 // we must re-assemble the type information, and use this to generate the asm code. 65 59 66 /** Function to implement BLAS1 operations using SSE 2 assembler.67 * Every member of the Values tuple must only be double or double [].60 /** Function to implement BLAS1 operations using SSE/SSE2 assembler. 61 * Every member of the Values tuple must only be double or double *. 68 62 */ 69 void SSEVECGEN( char [] expr, Values...)(Values values) {70 const ranklist = TupleRank!(Values); 71 pragma(msg, generateCodeForSSE 2(ranklist, expr));72 mixin(generateCodeForSSE 2(ranklist, expr));63 void SSEVECGEN(int SSEVersion, char [] expr, Values...)(int veclength, Values values) { 64 const ranklist = TupleRank!(Values); 65 pragma(msg, generateCodeForSSE(SSEVersion, ranklist, expr)); 66 mixin(generateCodeForSSE(SSEVersion, ranklist, expr)); 73 67 } 74 68 … … 76 70 * Every member of the Values tuple must only be real, float[], double [], or real[]. 
77 71 */ 78 void X87VECGEN(char [] rawexpr, Values...)( Values values) {72 void X87VECGEN(char [] rawexpr, Values...)(int veclength, Values values) { 79 73 const typelist = elementTupleToString!(Values); 80 74 const ranklist = TupleRank!(Values); … … 161 155 else result ~= "double"; // Convert all other scalars into doubles. 162 156 } 163 else if (startsWith(t, "real[")) result ~= ",real []";164 else if (startsWith(t, "float[")) result ~= ",float []";165 else if (startsWith(t, "double[")) result~= ",double []";157 else if (startsWith(t, "real[")) result ~= ",real*"; 158 else if (startsWith(t, "float[")) result ~= ",float*"; 159 else if (startsWith(t, "double[")) result~= ",double*"; 166 160 // else error. 167 161 } 168 162 result ~= ")("; 169 int knt=0; 170 for (int i=0; i<tree.symbolTable.length;++i) { 171 if (knt>0) result ~=","; 172 result ~= tree.symbolTable[i].value; 173 ++knt; 163 int firstVector = findVectorForLength(tree); 164 result ~= tree.symbolTable[firstVector].value ~ ".length"; 165 for (int i=0; i<tree.symbolTable.length;++i) { 166 result ~="," ~ tree.symbolTable[i].value; 167 // for vectors, we only need the pointer, not the length 168 if (tree.symbolTable[i].rank==1) result ~= ".ptr"; 174 169 } 175 170 return result~ ");"; … … 181 176 char [] result = assertAllVectorLengthsEqual(tree); 182 177 183 result ~= "SSEVECGEN!( " ~ wrapInQuotes(tree.expression);178 result ~= "SSEVECGEN!(2," ~ wrapInQuotes(tree.expression); 184 179 // For SSE2, everything must be implicitly convertible to double. 
185 180 for (int i=0; i<tree.symbolTable.length;++i) { 186 181 if (tree.symbolTable[i].rank==0) result ~= ",double"; 187 else result ~= ",double []";182 else result ~= ",double*"; 188 183 } 189 184 result ~= ")("; 190 int knt=0; 191 for (int i=0; i<tree.symbolTable.length;++i) { 192 if (knt>0) result ~=","; 193 result ~= tree.symbolTable[i].value; 194 ++knt; 185 int firstVector = findVectorForLength(tree); 186 result ~= tree.symbolTable[firstVector].value ~ ".length"; 187 188 for (int i=0; i<tree.symbolTable.length;++i) { 189 result ~= "," ~ tree.symbolTable[i].value; 190 // for vectors, we only need the pointer, not the length 191 if (tree.symbolTable[i].rank==1) result ~= ".ptr"; 195 192 } 196 193 return result ~ ");"; trunk/blade/BladeDemo.d
r123 r125 34 34 35 35 writefln("a=", a); 36 double c = 1.0i * 2.0i; 37 idouble w = cast(idouble)c; 38 pragma(msg, typeof(c).stringof); 36 39 } trunk/blade/BladeRank.d
r119 r125 72 72 // -------------- 73 73 // Ranklist functions 74 75 int findFirstVector(int [] ranklist)76 {77 for (int i=0; i< ranklist.length;++i) {78 if (ranklist[i]==1) return i;79 }80 return 0;81 }82 83 74 84 75 // Count the number of vectors trunk/blade/CodegenX86.d
r123 r125 1 1 // Written in the D programming language 1.0 2 2 /** 3 * BLADE 0. 3Alpha -- Basic Linear Algebra D Expressions3 * BLADE 0.4Alpha -- Basic Linear Algebra D Expressions 4 4 * 5 5 * Generate near-optimal x87/SSE2 asm code for BLAS1 basic vector operations at compile time. … … 17 17 * - Generates either x87 asm code, SSE2 asm code or pure D, depending on the complexity of 18 18 * the expression, and the availability of inline asm. 19 * - 80-bit precision is used whenever possible.19 * - If x87 code is generated, 80-bit precision is used whenever possible. 20 20 * - Supports mixed-length operations (eg, real[] + double[] + float[]). 21 21 * 22 * TWO STEPS FORWARD, ONE STEP BACK:23 * The previous version supported additional functionality. This has not yet been24 * added after refactoring.25 * - Supports real, complex and imaginary vectors, and detects type mismatches between them.26 * - When static arrays are used, mismatches in array length are detected27 * at compile time.28 22 * 29 23 * BUGS/ FUTURE DIRECTIONS: … … 43 37 * a string containing x87 asm, which is then mixed into a function which accepts the tuple. 44 38 * 45 * HISTORY:46 * 0.1 - Used classes to make expression templates.47 * 0.2 - Support for a wider variety of expressions. Dot product, imaginary numbers, etc.48 * 0.3 - Based on string mixins. 
Most of the new features of 0.2 are gone, but SSE2 is added.49 39 */ 50 40 … … 380 370 for (int i=0; i< ranklist.length;++i) { 381 371 if (ranklist[i]==1){ 382 result~= " auto vec" ~ itoa(i) ~ " = values[" ~itoa(i) ~"].ptr; // " ~ cast(char)('A'+i)~ \n;383 372 if (typelist[i]=="real") { 384 373 incrementRealVectors ~= " add " ~ vectorRegister[vecnum] ~ ", " ~ REALSIZE ~ ";\n"; 385 374 } 386 375 ++vecnum; 387 } else result~= " alias values["~itoa(i)~"] val" ~ itoa(i) ~ "; // " ~ cast(char)('A'+i)~ \n; 388 } 389 390 result ~= " int veclength = values[" ~itoa(findFirstVector(ranklist)) ~"].length;\n"; 391 376 } 377 } 378 392 379 int numScalarsOnStack=0; 393 380 … … 403 390 if (ranklist[i]==1) { 404 391 if (typelist[i]=="real") { 405 result ~= " mov " ~ vectorRegister[numvecs] ~ ", v ec" ~ itoa(i) ~ ";";392 result ~= " mov " ~ vectorRegister[numvecs] ~ ", values[" ~ itoa(i) ~ "];"; 406 393 } else { 407 394 result ~= " lea " ~ vectorRegister[numvecs] 408 395 ~ ", [" ~ vectorSize(typelist[i]) ~ "*EAX]; " 409 ~ " add " ~ vectorRegister[numvecs] ~ ", v ec" ~ itoa(i) ~ ";";396 ~ " add " ~ vectorRegister[numvecs] ~ ", values[" ~ itoa(i) ~ "];"; 410 397 } 411 398 result ~= " //" ~ cast(char)('A'+i) ~ \n; … … 479 466 } else { 480 467 // For scalar float or double values, we can multiply directly, saving one slot on the FP stack. 481 next = " fmul " ~ operandSize(typelist[operations[done]-'A']) ~ "val " ~ itoa(operations[done]-'A') ~"; // * " ~ operations[done..done+1] ~ "\n";468 next = " fmul " ~ operandSize(typelist[operations[done]-'A']) ~ "values[" ~ itoa(operations[done]-'A') ~"]; // * " ~ operations[done..done+1] ~ "\n"; 482 469 mainbody ~= next; firstbody ~= next; 483 470 } … … 522 509 public: 523 510 524 // We don't need types for SSE2, everything is a double. 525 526 char [] generateCodeForSSE2(int [] ranklist, char [] infixOperations, char cumulatingOp=0) 511 /** Generate BLAS1 asm code which is optimal for CPUs with SSE2. 512 * We don't need types for SSE2. 
Everything is a double; vectors are double* 513 */ 514 char [] generateCodeForSSE(int SSEVer, int [] ranklist, char [] infixOperations, char cumulatingOp=0) 527 515 { 528 516 char [] operations = makePostfixForSSE(infixOperations, ranklist); … … 531 519 result ~= "// Operation : " ~ operations ~ \n; 532 520 533 // Create local variables for pointers to vectors (avoid bug #1125) 534 // Bad code is also generated for loading scalars. 535 int vecnum = 0; 536 for (int i=0; i< ranklist.length;++i) { 537 if (ranklist[i]==1){ 538 result~= " auto vec" ~ itoa(i) ~ " = values[" ~itoa(i) ~"].ptr; // " ~ cast(char)('A'+i)~ \n; 539 ++vecnum; 540 } 541 } 542 result ~= " int veclength = values[" ~itoa(findFirstVector(ranklist)) ~"].length;\n"; 543 521 int numvecs = countVectors(ranklist); 544 522 int numScalarsOnStack=0; 545 523 546 result~= \n"asm {"\n ~ pushRegisters( vecnum);524 result~= \n"asm {"\n ~ pushRegisters(numvecs); 547 525 // EAX will be the counter 548 526 result ~= " mov EAX, veclength;"\n; 549 527 // Load all the vector pointers into registers 550 528 551 const char [] vectorsize = "8"; // size of a double 552 int numvecs=0; 529 char [] vectorsize = (SSEVer == 2) ? "8" :"4"; // size of a double 530 531 int vecregnum = 0; 553 532 int numconsts=0; 554 533 for (int i=0; i<ranklist.length; ++i) { 555 534 if (ranklist[i]==1) { 556 result ~= " lea " ~ vectorRegister[ numvecs]535 result ~= " lea " ~ vectorRegister[vecregnum] 557 536 ~ ", [" ~ vectorsize ~ "*EAX]; " 558 ~ " add " ~ vectorRegister[ numvecs] ~ ", vec" ~ itoa(i) ~ ";";537 ~ " add " ~ vectorRegister[vecregnum] ~ ", values[" ~ itoa(i) ~ "];"; 559 538 result ~= " //" ~ cast(char)('A'+i) ~ \n; 560 ++ numvecs;539 ++vecregnum; 561 540 } else if (ranklist[i]==0) { 541 // load scalar into an XMM register, and then duplicate it into both 542 // halves of the register using shufpd. 
562 543 result ~= " movsd " ~ XMM(numconsts) ~ ", values["~ itoa(i) ~"]; " 563 // result ~= " movsd " ~ XMM(numconsts) ~ ", double ptr val"~ itoa(i) ~"; "564 544 " shufpd " ~ XMM(numconsts) ~", " ~ XMM(numconsts) ~ ",0; //" ~ cast(char)('A'+i) ~ \n; 565 545 ++numconsts; … … 614 594 ~ "L1:\n" ~ mainbody; 615 595 result ~= " add EAX,2;\n" ~ " js L1;\n"; 616 result~= "L3:" \n ~ popRegisters( vecnum) ~ "}\r\n";596 result~= "L3:" \n ~ popRegisters(numvecs) ~ "}\r\n"; 617 597 618 598 return result;
