Changeset 126
- Timestamp:
- 11/02/07 03:08:28 (10 months ago)
- Files:
-
- trunk/blade/Blade.d (modified) (5 diffs)
- trunk/blade/BladeDemo.d (modified) (3 diffs)
- trunk/blade/CodegenX86.d (modified) (11 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/blade/Blade.d
r125 r126 15 15 * - Supports any mix of vector addition, subtraction, dot product, and multiplication 16 16 * by a scalar. 17 * - Generates either x87 asm code, SSE 2 asm code or pure D, depending on the complexity of18 * the expression, and the availability of inline asm.17 * - Generates either x87 asm code, SSE or SSE2 asm code or pure D, depending on 18 * the complexity of the expression, and the availability of inline asm. 19 19 * - When static arrays are used, mismatches in array length are detected 20 20 * at compile time. … … 23 23 * 24 24 * FUTURE DIRECTIONS (in order of expected implementation): 25 * - SSE1 code could be used for float operations.26 25 * - Dot product (which was present in BLADE 0.2). 27 26 * - The x87 code generation targets early Pentiums, which are now irrelevant. … … 171 170 } 172 171 173 /// Generate code which will call the SSE 2function174 char [] invokeSSE 2(AbstractSyntaxTree tree)172 /// Generate code which will call the SSE/SSE2 code generation function 173 char [] invokeSSE(bool SSE2, AbstractSyntaxTree tree) 175 174 { 176 175 char [] result = assertAllVectorLengthsEqual(tree); 176 result ~= assertAllVectorsAlign128(tree); 177 177 178 result ~= "SSEVECGEN!( 2," ~ wrapInQuotes(tree.expression);178 result ~= "SSEVECGEN!(" ~ (SSE2?"2":"1") ~ "," ~ wrapInQuotes(tree.expression); 179 179 // For SSE2, everything must be implicitly convertible to double. 
180 180 for (int i=0; i<tree.symbolTable.length;++i) { 181 if (tree.symbolTable[i].rank==0) result ~= ",double"; 182 else result ~= ",double*"; 181 if (SSE2) { 182 if (tree.symbolTable[i].rank==0) result ~= ",double"; 183 else result ~= ",double*"; 184 } else { 185 if (tree.symbolTable[i].rank==0) result ~= ",float"; 186 else result ~= ",float*"; 187 } 183 188 } 184 189 result ~= ")("; … … 219 224 } 220 225 226 char [] assertAllVectorsAlign128(AbstractSyntaxTree tree) 227 { 228 char [] result =""; 229 for (int i=0; i<tree.symbolTable.length;++i) { 230 if (tree.symbolTable[i].rank==1){ 231 result ~= "assert( (cast(size_t)(" ~ tree.symbolTable[i].value 232 ~ ".ptr)& 0x1F) == 0, `SSE Vector misalignment: " ~ tree.symbolTable[i].value ~ "`);"\n; 233 } 234 } 235 return result; 236 } 237 221 238 // Return true if the type has a length which is known at compile time 222 239 bool arrayLengthIsStatic(char [] type) … … 263 280 { 264 281 VecExpressionType exprType = categorizeExpression(tree); 265 if (exprType == VecExpressionType.SSE2Expression ) {266 return invokeSSE 2(tree);282 if (exprType == VecExpressionType.SSE2Expression || exprType == VecExpressionType.SSE1Expression) { 283 return invokeSSE((exprType == VecExpressionType.SSE2Expression), tree); 267 284 } else if (exprType == VecExpressionType.X87Expression) { 268 285 return invokeX87(tree); trunk/blade/BladeDemo.d
r125 r126 10 10 11 11 // Unfortunately static arrays in D aren't aligned to 128-bit boundaries yet. 12 // This causes segfaults if trying to use SSE. For now, only use heap arrays 13 // with SSE/SSE2. 12 // In such cases, the library generates an 'SSE misalignment' assert error, 13 // to avoid segfaults. 14 // For now, only use heap arrays with SSE/SSE2. 14 15 15 16 void main() … … 22 23 d[0..$] = [17.0, 28.25, 1, 56.2];; 23 24 auto w2=z.ptr; 24 float[] q = [17.0f, 28.25, 1, 0]; 25 float[] q = new float[4]; 26 q[0..$]= [17.0f, 28.25, 1, 0]; 25 27 float [4] r; 26 28 idouble [] p = [2.3i, 254i, 0.1i, 1.2i]; … … 32 34 mixin(vectorize(" a += d*2.01")); 33 35 mixin(vectorize(" a += r*2.01")); 36 mixin(vectorize(" q += q*2.01")); 34 37 35 38 writefln("a=", a); 36 double c = 1.0i * 2.0i;37 idouble w = cast(idouble)c;38 pragma(msg, typeof(c).stringof);39 39 } trunk/blade/CodegenX86.d
r125 r126 3 3 * BLADE 0.4Alpha -- Basic Linear Algebra D Expressions 4 4 * 5 * Generate near-optimal x87/SSE2 asm code for BLAS1 basic vector operations at compile time. 5 * Generate near-optimal x87/SSE/SSE2 asm code for BLAS1 basic vector operations 6 * at compile time. 6 7 * 32, 64 and 80 bit vectors are all supported. 7 8 * Uses techniques described in Agner Fog's superb Pentium optimisation manual (www.agner.org). … … 15 16 * - Supports any mix of vector addition, subtraction, dot product, and multiplication 16 17 * by a scalar. 17 * - Generates either x87 asm code, SSE2 asm code or pure D, depending on the complexity of 18 * the expression, and the availability of inline asm. 18 * - Generates either x87, SSE, or SSE2 asm code. 19 19 * - If x87 code is generated, 80-bit precision is used whenever possible. 20 20 * - Supports mixed-length operations (eg, real[] + double[] + float[]). 21 21 * 22 *23 22 * BUGS/ FUTURE DIRECTIONS: 23 * None of these support dot product, or matrix operations. 24 * X87: 24 25 * The x87 code generation targets early Pentiums, which are now irrelevant. 25 26 * It needs to be updated to PM/Core2 (this will significantly simplify it). … … 30 31 * - Doesn't use EBP register -- this would allow an extra vector in expressions. 31 32 * (to do this, need naked asm with no stack frame). 33 * SSE/SSE2: 34 * - SSE functions don't support unaligned data. Need to generate separate code 35 * for that case (NOTE: probably only worth doing for small expressions). 
32 36 * 33 37 * THEORY: … … 230 234 return ['*':"fmul"[], '+': "fadd", '-': "fsub", '_': "fsubr"]; } 231 235 232 char [][char] opToSSE2() {233 return ['*':"mulpd"[], '+': "addpd", '-': "subpd", '/': "divpd"]; }234 235 236 char [][char] opToSSE() { 236 return ['*':"mulps"[], '+': "addps", '-': "subps", '/': "divps"]; } 237 237 return ['*':"mulp"[], '+': "addp", '-': "subp", '/': "divp"]; } 238 239 char [][char] opToSSESingle() { 240 return ['*':"muls"[], '+': "adds", '-': "subs", '/': "divs"]; } 238 241 239 242 static if (real.sizeof==10) const char [] REALSIZE = "10"; … … 298 301 } 299 302 300 char [] indexedSSEVector(int [] ranklist, char var) 301 { 302 return "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ " + 8*EAX]"; 303 char [] indexedSSEVector(int [] ranklist, char var, char [] vecsize) 304 { 305 return "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ " + " ~ vecsize ~"*EAX]"; 306 } 307 308 char [] indexedSSENext(int [] ranklist, char var, char [] vecsize) 309 { 310 return "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ " + " ~ vecsize ~"*EAX+16]"; 303 311 } 304 312 … … 509 517 public: 510 518 511 /** Generate BLAS1 asm code which is optimal for CPUs with SSE2. 512 * We don't need types for SSE2. Everything is a double; vectors are double* 519 /** Generate BLAS1 asm code using SSE or SSE2. 520 * For SSE2, all scalars are double, vectors are double*; for SSE1, all are float. 521 * At entry, all vector parameters are aligned. 513 522 */ 514 523 char [] generateCodeForSSE(int SSEVer, int [] ranklist, char [] infixOperations, char cumulatingOp=0) … … 528 537 529 538 char [] vectorsize = (SSEVer == 2) ? "8" :"4"; // size of a double 539 char [] suffix = (SSEVer == 2) ? 
"d " :"s "; 530 540 531 541 int vecregnum = 0; … … 540 550 } else if (ranklist[i]==0) { 541 551 // load scalar into an XMM register, and then duplicate it into both 542 // halves of the register using shufpd.543 result ~= " movs d " ~ XMM(numconsts) ~ ", values["~ itoa(i) ~"]; "544 " shufp d "~ XMM(numconsts) ~", " ~ XMM(numconsts) ~ ",0; //" ~ cast(char)('A'+i) ~ \n;552 // halves (or quarters) of the register using shufpd. 553 result ~= " movs" ~ suffix ~ XMM(numconsts) ~ ", values[" ~ itoa(i) ~ "]; " 554 " shufp" ~ suffix ~ XMM(numconsts) ~", " ~ XMM(numconsts) ~ ",0; //" ~ cast(char)('A'+i) ~ \n; 545 555 ++numconsts; 546 556 ++numScalarsOnStack; … … 549 559 result ~= " xor EAX, EAX; "\n 550 560 " sub EAX, veclength; // counter=-length"\n 551 " jz short L 3; // test for length==0"\n;561 " jz short L2; // test for length==0"\n; 552 562 int done=0; 553 563 554 564 char [] mainbody = ""; 565 char [] extra = ""; // for the extra length%4 calculations 555 566 556 567 // The SSE implementation mimics the x87 version. Instead of keeping track of … … 561 572 int numOnStack = numScalarsOnStack; // How much of the FP stack is being used? 562 573 while(done<operations.length) { 563 char [] next;574 char [] comment; 564 575 if (isInstruction(operations[done])) { 565 576 // Perform an arithmetic operation on the top two items. 566 next = " " ~ opToSSE2[operations[done]] ~ XMM(numOnStack-1) ~ ", " ~ XMM(numOnStack) ~ "; //" ~ operations[done] ~ \n; 567 mainbody ~= next; 577 comment = "; //" ~ operations[done] ~ \n; 578 mainbody ~= " " ~ opToSSE[operations[done]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(numOnStack) ~ comment; 579 extra ~= " " ~ opToSSESingle[operations[done]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(numOnStack) ~ comment; 568 580 ++done; 569 581 numOnStack--; 570 582 } else if (!isInstruction(operations[done+1])){ 571 583 // load a vector onto the FPU stack, to begin a new subexpression. 
572 int u = operations[done]-'A';573 next = " movapd " ~ XMM(numOnStack) ~ ", " ~ indexedSSEVector(ranklist, operations[done] ) ~ "; // " ~ operations[done..done+1] ~ \n;574 mainbody ~= next;584 comment = "; // " ~ operations[done] ~ \n; 585 mainbody ~= " movap" ~ suffix ~ XMM(numOnStack) ~ ", " ~ indexedSSEVector(ranklist, operations[done], vectorsize) ~ comment; 586 extra ~= " movs" ~ suffix ~ XMM(numOnStack) ~ ", " ~ indexedSSENext(ranklist, operations[done], vectorsize) ~ comment; 575 587 ++done; 576 588 numOnStack++; 577 589 } else if (ranklist[operations[done]-'A']==1) { 578 590 // An operation will be performed between the stack top and a vector. 579 // If it's a float or double, we can combine the load+arithmetic op 580 // into a single instruction. 581 if (operations[done+1]=='=') mainbody ~= " movapd " ~ indexedSSEVector(ranklist, operations[$-2] ) ~ ", XMM" ~ itoa(numOnStack-1) ~"; // " ~ operations[$-2..$] ~ \n; 582 else mainbody ~= " " ~ opToSSE2[operations[done+1]] ~ " " ~ XMM(numOnStack-1) ~ ", " 583 ~ indexedSSEVector(ranklist, operations[done] ) ~ "; // " ~ operations[done..done+2] ~ \n; 591 comment = "; // " ~ operations[done..done+2] ~ \n; 592 if (operations[done+1]=='=') { 593 mainbody ~= " movap" ~ suffix ~ indexedSSEVector(ranklist, operations[$-2], vectorsize) ~ ", XMM" ~ itoa(numOnStack-1) ~ comment; 594 extra ~= " movs" ~ suffix ~ indexedSSENext(ranklist, operations[$-2], vectorsize) ~ ", XMM" ~ itoa(numOnStack-1) ~ comment; 595 } else { 596 mainbody ~= " " ~ opToSSE[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " 597 ~ indexedSSEVector(ranklist, operations[done], vectorsize) ~ comment; 598 extra ~= " " ~ opToSSESingle[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " 599 ~ indexedSSENext(ranklist, operations[done], vectorsize) ~ comment; 600 } 584 601 done+=2; 585 602 } else { // multiply by scalar. 
586 next = " " ~ opToSSE2[operations[done+1]] ~ " " ~ XMM(numOnStack-1) ~ ", " ~ XMM(scalarNum(ranklist, operations[done]-'A')) ~"; // " ~operations[done..done+2] ~ \n; 587 mainbody ~= next; 588 done +=2; 603 comment = "; // " ~operations[done..done+2] ~ \n; 604 int u = scalarNum(ranklist, operations[done]-'A'); 605 mainbody ~= " " ~ opToSSE[operations[done+1]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(u) ~ comment; 606 extra ~= " " ~ opToSSESingle[operations[done+1]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(u) ~ comment; 607 done +=2; 589 608 } 590 609 } … … 593 612 ~ " align 16;\n" 594 613 ~ "L1:\n" ~ mainbody; 595 result ~= " add EAX,2;\n" ~ " js L1;\n"; 596 result~= "L3:" \n ~ popRegisters(numvecs) ~ "}\r\n"; 614 if (SSEVer == 2) { 615 result ~= " add EAX,2;\n js L1;\n" 616 ~ "L2:\n sub EAX, 2;\n jns L4;\n" 617 // Now the calculations for the final double, if any. 618 ~ extra; 619 } else { 620 result ~= " add EAX,4;\n" ~ " js L1;\n" 621 ~ "L2:\n sub EAX, 4;\n jns L4;\n" 622 // Now the extra calculations for the 0-3 float, or 0-1 doubles 623 ~ "L3:"\n ~ extra 624 ~ " add EAX,1;\n js L3;\n"; 625 } 626 result~= "L4:" \n ~ popRegisters(numvecs) ~ "}\n"; 597 627 598 628 return result;
