Changeset 92
- Timestamp:
- 04/03/07 05:18:59 (2 years ago)
- Files:
-
- trunk/mathextra/Blade.d (modified) (20 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/mathextra/Blade.d
r91 r92 20 20 * - When static arrays are used, mismatches in array length are detected at compile time. 21 21 * 22 * The code is optimal for early Pentium CPUs, or very nearly so. 22 * BUGS/ FUTURE DIRECTIONS: 23 * - Doesn't avoid AGI stalls. This can cost one cycle per iteration. 24 * Should be able to say: 25 * 'The code is optimal for early Pentium CPUs, or very nearly so. 23 26 * For example, the code generated for the crucial DAXPY operation is functionally 24 27 * identical to that described in Agner Fog's optimisation manual (www.agner.org), 25 * and is faster than the unrolled solution originally published by Intel. 26 * 27 * BUGS/ FUTURE DIRECTIONS: 28 * and is faster than the unrolled solution originally published by Intel.' 28 29 * - Not well tested. No unit tests are included! 29 30 * - Not optimal for the case of multiple real vectors (they could share a counter). … … 202 203 } 203 204 205 // Dot product of two vectors. 204 206 // Returns ireal if one of A or B is real, and the other is imaginary. 205 207 typeof(A.BaseType*B.BaseType) dot(A, B)(A a, B b) … … 234 236 // ------------------------------------------------ 235 237 238 // return the length of a sub-expression 236 239 int exprLength(char [] s) 237 240 { … … 244 247 } 245 248 249 // Converts an infix string into postfix 246 250 char [] makePostfix(char [] operations) 247 251 { … … 260 264 } 261 265 262 // Apply x87-specific optimisations while converting to postfix 266 // Converts an infix string into postfix. 267 // Apply x87-specific optimisations during the conversion. 263 268 char [] makePostfixForX87(char [] operations, char [] typelist) 264 269 { … … 385 390 } 386 391 387 388 392 static if (real.sizeof==10) const char [] REALSIZE = "10"; 389 393 else static if (real.sizeof==12) const char [] REALSIZE = "12"; … … 403 407 const char [][5] vectorRegister = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 404 408 409 // Create code to push all used vector registors. 405 410 char [] pushRegisters(int numVectors) 406 411 { … … 410 415 } 411 416 417 // Create code to pop all used vector registors. 412 418 char [] popRegisters(int numVectors) 413 419 { … … 424 430 } 425 431 426 char [] storeVector(char type, int vecnum, char [] stride="") 427 { 432 char [] storeVector(char type, int vecnum) 433 { 434 char [] stride = " - " ~ vectorSize(type); 428 435 if (type=='R') { 429 436 return " fstp real ptr [" ~ vectorRegister[vecnum] ~ stride ~ "];"\n; … … 433 440 } 434 441 435 /** Generate asm code which is optimal for x87 CPUs without SSE2 442 /** Generate asm code which is optimal for x87 CPUs without SSE2. It is also 443 optimal for recent x86 CPUs where vector sizes are mixed. 436 444 (Pentium, PMMX, PII, PIII). 437 445 The key optimisation rules are: … … 439 447 2. (FMUL latency) don't use the result of a multiply immediately 440 448 3. (FST latency) don't save a value to memory immediately after it's calculated. 449 4. (AGI stall) don't use the counter variable immediately after it's modified. 441 450 Techniques to address these are: 442 451 1. Use ESI as a counter and index variable, which begins negative and counts UP to zero. … … 444 453 3. The latency of fstp is avoided by calculating a result in one iteration, 445 454 but not storing it to memory until the subsequent iteration. 455 4. (NOT YET IMPLEMENTED): first operation in the loop should be loading a scalar (for a multiply), 456 if possible, otherwise load an 80-bit vector, if possible. 446 457 447 458 The generated code is of the form: … … 537 548 char [] next; 538 549 if (isInstruction(operations[done])) { 550 // Perform an arithemetic operation on the top two FPU stack items. 539 551 next = " " ~ opToX87(operations[done]) ~ "p ST(1), ST;"\n; 540 552 mainbody ~= next; firstbody ~= next; … … 542 554 numOnStack--; 543 555 } else if (!isInstruction(operations[done+1])){ 556 // load a vector onto the FPU stack, to begin a new subexpression. 544 557 int u = operations[done]-'a'; 545 558 next = " fld " ~ indexedVector(typelist, operations[done] ) ~ ";\n"; … … 548 561 numOnStack++; 549 562 } else if (isVector(typelist[operations[done]-'a'])) { 563 // An operation will be performed between the stack top and a vector. 564 // If it's a float or double, we can combine the load+arithmetic op 565 // into a single instruction. 550 566 if (typelist[operations[done]-'a']=='R') { 551 567 // 80-bit vectors must be loaded onto the FPU stack first … … 576 592 ++numOnStack; 577 593 } 594 // the last operation is special, because it may involve the 595 // destination vector (+=, -=, *=). 578 596 if (!isDotProduct && finaloperation.length>1) { 579 597 if (finaloperation[0]=='*') { … … 584 602 char [] finalop = "fadd"; 585 603 if (finaloperation[0]=='-') finalop="fsubr"; 604 char [] next; 586 605 if (typelist[$-1]=='R') { 587 606 // 80-bit vectors must be loaded onto the FPU stack first 588 mainbody ~= " fld real ptr [" ~ vectorRegister[numvecs-1] ~ "];"\n;589 mainbody~= " " ~ finalop ~ "p ST(1), ST;\n";607 next = " fld real ptr [" ~ vectorRegister[numvecs-1] ~ "];"\n; 608 next ~= " " ~ finalop ~ "p ST(1), ST;\n"; 590 609 } else { 591 mainbody ~= " " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister[numvecs-1] ~ " + "610 next = " " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister[numvecs-1] ~ " + " 592 611 ~ vectorSize(typelist[$-1]) ~ "*ESI];"\n; 593 612 } 613 mainbody ~=next; firstbody~=next; 594 614 } 595 615 } … … 600 620 if (isDotProduct) result ~= " faddp ST(2), ST;"\n; 601 621 else { 602 result ~= storeVector(typelist[$-1], numvecs-1 , " - " ~ vectorSize(typelist[$-1]));622 result ~= storeVector(typelist[$-1], numvecs-1); 603 623 } 604 624 605 625 result ~= "L2: \n"; 606 626 627 // Update the counters 607 628 result~= incrementRealVectors ~ " inc ESI;\n jnz L1;\n"; 608 629 630 // Store the result from the final iteration 609 631 if (isDotProduct) result ~= " faddp ST(1), ST;"\n; 610 632 else result ~= storeVector(typelist[$-1], numvecs-1); 611 633 612 // Discard any scalars that are left on the stack.634 // Discard any scalars that are left on the stack 613 635 if (isDotProduct && numScalarsOnStack>0) { 614 636 // Preserve the result of the dot product … … 616 638 } 617 639 while (numScalarsOnStack>1) { 618 result~= " fcompp ST(0), ST;"\n; 640 result~= " fcompp ST(0), ST;"\n; // pop two values at once 619 641 numScalarsOnStack-=2; 620 642 } … … 634 656 auto p = Vec([1.0L, 2, 18]); 635 657 auto q = Vec([3.5L, 1.1, 3.8]); 636 auto r = Vec([17.0f, 28. 1, 1]);658 auto r = Vec([17.0f, 28.25, 1]); 637 659 auto z = Vec([17.0i, 28.1i, 1i]); 638 q -= ((r+p)*18.0L*314.1L - (p-r))* 35;639 660 real d = dot(r, p+r+r); 661 assert(d==2267.625); 640 662 ireal e = dot(r, z); 641 663 writefln(d, " ", e); 664 665 q -= ((r+p)*18.0L*314.1L - (p-r))* 35; 666 d = dot(r, p+r+r); 667 writefln(d, " ", e); 668 assert(d==2267.625); 669 /* 642 670 p = r - q*2.0; 643 671 p*=5.6L; 644 672 z = (r*3.1i + r*5.0i)*7.1; 645 673 z*=3.1; 646 } 674 */ 675 }
