Changeset 187 for trunk/blade/CodegenX86.d
- Timestamp:
- 04/30/08 16:05:32 (5 months ago)
- Files:
-
- trunk/blade/CodegenX86.d (modified) (27 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/blade/CodegenX86.d
r172 r187 1 1 // Written in the D programming language 1.0 2 2 /** 3 * BLADE 0.4Alpha -- Basic Linear Algebra D Expressions3 * BLADE Alpha -- Basic Linear Algebra D Expressions 4 4 * 5 5 * Generate near-optimal x87/SSE/SSE2 asm code for BLAS1 basic vector operations … … 21 21 * 22 22 * BUGS/ FUTURE DIRECTIONS: 23 * None of these support dot product, ormatrix operations.23 * None of these support matrix operations. 24 24 * X87: 25 25 * - Not optimal for the case of multiple real vectors (they could share a counter). … … 30 30 * (to do this, need naked asm with no stack frame). 31 31 * SSE/SSE2: 32 * - SSE functions don't support unaligned data. Need to generate seperate code 32 * - SSE functions don't support unaligned data. Need to generate seperate code 33 33 * for that case (NOTE: probably only worth doing for small expressions). 34 34 * … … 191 191 // (max # temporaries + max # real scalars) must be <=8, otherwise FPU stack 192 192 // will overflow). 193 const int MAX_87_REALSCALARSPLUSTEMPORARIES = 8; 193 const int MAX_87_REALSCALARSPLUSTEMPORARIES = 8; 194 194 195 195 private: … … 213 213 // indexed by i. 214 214 char [] indexedVector(char [][] typelist, char [] ranklist, char [] stridelist, char var) 215 { 215 { 216 216 if (typelist[var-'A']=="real") return " real ptr [" ~ vectorRegister[vectorNum(ranklist, var)] ~ "]"; 217 217 else if (stridelist[var-'A']=='1') return operandSize(typelist[var-'A']) ~ "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ "]"; … … 264 264 (Pentium, PMMX, PII, PIII). It is also optimal for recent x86 CPUs 265 265 where vector sizes are mixed. 266 266 267 267 There are two cases: 268 268 (A) DAXPY-style loops, where every element is independent of the other indices; … … 271 271 For cumulative loops, best performance is achieved with loop unrolling and 272 272 multiple accumulators, in order to break dependency chains. 273 273 274 274 The key optimisation rules for DAXPY loops are: 275 275 1. keep the loop overhead to one clock cycle if possible. … … 304 304 ranklist~="1"; 305 305 typelist ~= typeof(T[0]).stringof; 306 } else static if (is(typeof(T.data))) { 306 } else static if (is(typeof(T.data))) { 307 307 stridelist~="1"; 308 308 ranklist~="1"; … … 323 323 char [] result=""; 324 324 char [] incrementRealVectors=""; 325 325 326 326 result ~= "// Operation : " ~ operations ~ \n; 327 327 328 328 // Create local variables for pointers to vectors (avoid bug #1125) 329 329 … … 361 361 ~ " add " ~ vectorRegister[numvecs] ~ ", values[" ~ itoa(i) ~ "];"; 362 362 } 363 result ~= " //" ~ cast(char)('A'+i) ~ \n; 363 result ~= " //" ~ cast(char)('A'+i) ~ \n; 364 364 ++numvecs; 365 365 } else if (typelist[i]=="real") { … … 367 367 ++numconsts; 368 368 ++numScalarsOnStack; 369 result ~= " //" ~ cast(char)('A'+i) ~ \n; 369 result ~= " //" ~ cast(char)('A'+i) ~ \n; 370 370 } 371 371 } … … 376 376 int numOnStack = 0; // How much of the FP stack is being used? 377 377 378 bool isCumulative = (operations[0]=='0' );378 bool isCumulative = (operations[0]=='0' || operations[0]=='1'); 379 379 if (operations[0]=='0') { 380 380 result ~= " fldz;"\n; // dot product 381 381 ++numOnStack; 382 382 done = 1; 383 } else if (operations[0]=='1') { 384 result ~= " fld1;"\n; // prod 385 ++numOnStack; 386 done = 1; 383 387 } 384 388 result ~= " xor EAX, EAX; "\n … … 389 393 // the final storage instruction, because of the FST latency). 390 394 char [] mainbody = ""; 391 395 392 396 while(done<operations.length) { 393 397 char [] next; … … 420 424 } else if (operations[done]==',') { 421 425 mainbody ~= " " ~ opToX87[operations[done+1]] ~ " ST, ST(0); // dup " ~ operations[done+1] ~ \n; 422 done+=2; 426 done+=2; 423 427 } else if (ranklist[operations[done]-'A']=='1') { 424 428 // An operation will be performed between the stack top and a vector. … … 430 434 // it chains. 431 435 next = ((done+2 == operations.length)? " fstp " : " fst ") 432 ~ indexedVector(typelist, ranklist, stridelist, operations[$-2] ) ~ comment; 436 ~ indexedVector(typelist, ranklist, stridelist, operations[$-2] ) ~ comment; 433 437 } else if (typelist[operations[done]-'A']=="real") { 434 438 // 80-bit vectors must be loaded onto the FPU stack first … … 445 449 // Multiply by real scalar, which is already on the stack. 446 450 next = " fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - realScalarNum(typelist, ranklist, operations[done]-'A')-1) ~ "); // * " ~ operations[done] ~ \n; 447 mainbody ~= next; 451 mainbody ~= next; 448 452 } else { 449 453 // For scalar float or double values, we can multiply directly, saving one slot on the FP stack. … … 452 456 } 453 457 done +=2; 454 } 455 } 456 457 result ~= \n 458 ~ " align 4;\n" 458 } 459 } 460 461 result ~= \n 462 ~ " align 4;\n" 459 463 ~ "L1:\n" ~ mainbody; 460 464 461 465 // if (cumulatingOp) result ~= " " ~ opToX87[cumulatingOp] ~ "p ST(2), ST;"\n; 462 466 … … 472 476 473 477 result~= "L3:" \n ~ popRegisters(vecnum) ~ "}\r\n"; 474 478 475 479 return result; 476 480 } … … 502 506 // Note: If SSE4 is available, the dppd and dpps instructions could be 503 507 // used to replace the final *+ in dot-product operations. This would allow 504 // an in-order dot product to be performed, but doesn't give much speed508 // an in-order dot product to be performed, but otherwise doesn't give much speed 505 509 // improvement. 506 510 … … 511 515 { 512 516 char [] result=""; 513 517 514 518 result ~= "// Operation : " ~ operations ~ \n; 515 519 516 520 int numvecs = countVectors(ranklist); 517 521 int numScalarsOnStack=0; 518 bool isCumulative = (operations[0]=='0') ;522 bool isCumulative = (operations[0]=='0') || (operations[0]=='1'); 519 523 if (isCumulative) result ~= (usingDoubles? " double" : " float") ~" sum;"\n; 520 524 … … 526 530 char [] vectorsize = usingDoubles? "8" :"4"; // size of a double 527 531 char [] suffix = usingDoubles? "d " :"s "; 528 532 529 533 int vecregnum = 0; 530 534 int numconsts=0; … … 534 538 ~ ", [" ~ vectorsize ~ "*EAX]; " 535 539 ~ " add " ~ vectorRegister[vecregnum] ~ ", values[" ~ itoa(i) ~ "];"; 536 result ~= " //" ~ cast(char)('A'+i) ~ \n; 540 result ~= " //" ~ cast(char)('A'+i) ~ \n; 537 541 ++vecregnum; 538 542 } else if (ranklist[i]==0) { 539 543 // load scalar into an XMM register, and then duplicate it into both 540 // halves (or quarters) of the register using shufpd. 544 // halves (or quarters) of the register using shufpd. 541 545 result ~= " movs" ~ suffix ~ XMM(numconsts) ~ ", values[" ~ itoa(i) ~ "]; " 542 546 " shufp" ~ suffix ~ XMM(numconsts) ~", " ~ XMM(numconsts) ~ ",0; //" ~ cast(char)('A'+i) ~ \n; … … 553 557 // reusing the same registers, because the CPUs which support SSE 554 558 // also have extensive support for register renaming. 555 559 556 560 int numOnStack = numScalarsOnStack; // How much of the FP stack is being used? 557 561 int done=0; 558 562 if (operations[0]=='0') { 559 563 result ~= " pxor " ~ XMM(numOnStack) ~ "," ~ XMM(numOnStack) ~ "; // 0\n"; 564 ++numOnStack; 565 ++done; 566 } else if (operations[0]=='1') { 567 result ~= " movap" ~ suffix ~ XMM(numOnStack) ~ ", SSE_ONEp" ~ suffix ~ "; // 1\n"; 560 568 ++numOnStack; 561 569 ++done; … … 599 607 extra ~= " " ~ opToSSESingle[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " 600 608 ~ XMM(numOnStack-1) ~ comment; 601 done +=2; 609 done +=2; 602 610 } else if (ranklist[operations[done]-'A']=='1') { 603 611 // An operation will be performed between the stack top and a vector. … … 619 627 extra ~= " " ~ opToSSESingle[operations[done+1]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(u) ~ comment; 620 628 done +=2; 621 } 622 } 623 624 result ~= \n 625 ~ " align 16;\n" 629 } 630 } 631 632 result ~= \n 633 ~ " align 16;\n" 626 634 ~ "L1:\n" ~ mainbody; 627 635 if (usingDoubles) { … … 640 648 if (isCumulative) { 641 649 // Result is now in XMM(numScalarsOnStack). We need to do a horizontal 642 // add to get the final sum. 650 // add or multiply to get the final sum. 651 char [] cumInstr = operations[0]=='0' ? " add" : " mul"; 643 652 if (usingDoubles) { 644 653 // For SSE3, use haddpd XMM(numScalarsOnStack). 645 654 result ~= " movhlps " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ";"\n 646 ~ " addsd " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n"; 655 ~ cumInstr ~ "sd " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n"; 656 657 647 658 } else { // floats 648 659 result ~= " movhlps " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ";"\n 649 ~ " addps " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n"660 ~ cumInstr ~ "ps " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n" 650 661 ~ " pshufd " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ",1;"\n 651 ~ " addss " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n";662 ~ cumInstr ~ "ss " ~ XMM(numScalarsOnStack) ~ "," ~ XMM(numScalarsOnStack+1) ~ ";\n"; 652 663 } 653 664 result ~= " movs" ~ suffix ~ " sum," ~ XMM(numScalarsOnStack) ~ ";"\n; … … 656 667 result ~= popRegisters(numvecs) ~ "}\n"; 657 668 if (isCumulative) result ~= " return sum;"\n; 658 669 659 670 return result; 660 671 }
