Changeset 92

Show
Ignore:
Timestamp:
04/03/07 05:18:59 (2 years ago)
Author:
Don Clugston
Message:

Improved comments.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/mathextra/Blade.d

    r91 r92  
    2020*  - When static arrays are used, mismatches in array length are detected at compile time. 
    2121* 
    22 * The code is optimal for early Pentium CPUs, or very nearly so. 
     22* BUGS/ FUTURE DIRECTIONS: 
     23*  - Doesn't avoid AGI stalls. This can cost one cycle per iteration. 
     24* Should be able to say: 
     25* 'The code is optimal for early Pentium CPUs, or very nearly so. 
    2326* For example, the code generated for the crucial DAXPY operation is functionally 
    2427* identical to that described in Agner Fog's optimisation manual (www.agner.org), 
    25 * and is faster than the unrolled solution originally published by Intel. 
    26 
    27 * BUGS/ FUTURE DIRECTIONS: 
     28* and is faster than the unrolled solution originally published by Intel.' 
    2829*  - Not well tested. No unit tests are included! 
    2930*  - Not optimal for the case of multiple real vectors (they could share a counter). 
     
    202203} 
    203204 
     205// Dot product of two vectors. 
    204206// Returns ireal if one of A or B is real, and the other is imaginary. 
    205207typeof(A.BaseType*B.BaseType) dot(A, B)(A a, B b) 
     
    234236// ------------------------------------------------ 
    235237 
     238// return the length of a sub-expression 
    236239int exprLength(char [] s) 
    237240{ 
     
    244247} 
    245248 
     249// Converts an infix string into postfix 
    246250char [] makePostfix(char [] operations) 
    247251{ 
     
    260264} 
    261265 
    262 // Apply x87-specific optimisations while converting to postfix 
     266// Converts an infix string into postfix. 
     267// Apply x87-specific optimisations during the conversion. 
    263268char [] makePostfixForX87(char [] operations, char [] typelist) 
    264269{ 
     
    385390} 
    386391 
    387  
    388392static if (real.sizeof==10)      const char [] REALSIZE = "10"; 
    389393else static if (real.sizeof==12) const char [] REALSIZE = "12"; 
     
    403407const char [][5] vectorRegister = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 
    404408 
     409// Create code to push all used vector registers. 
    405410char [] pushRegisters(int numVectors) 
    406411{ 
     
    410415} 
    411416 
     417// Create code to pop all used vector registers. 
    412418char [] popRegisters(int numVectors) 
    413419{ 
     
    424430} 
    425431 
    426 char [] storeVector(char type, int vecnum, char [] stride="") 
    427 
     432char [] storeVector(char type, int vecnum) 
     433
     434    char [] stride = " - " ~ vectorSize(type); 
    428435    if (type=='R') { 
    429436        return "  fstp real ptr [" ~ vectorRegister[vecnum] ~ stride ~ "];"\n; 
     
    433440} 
    434441 
    435 /** Generate asm code which is optimal for x87 CPUs without SSE2 
     442/** Generate asm code which is optimal for x87 CPUs without SSE2. It is also 
     443 optimal for recent x86 CPUs where vector sizes are mixed. 
    436444 (Pentium, PMMX, PII, PIII). 
    437445The key optimisation rules are: 
     
    439447 2. (FMUL latency) don't use the result of a multiply immediately 
    440448 3. (FST latency) don't save a value to memory immediately after it's calculated. 
     449 4. (AGI stall) don't use the counter variable immediately after it's modified. 
    441450Techniques to address these are: 
    442451 1. Use ESI as a counter and index variable, which begins negative and counts UP to zero. 
     
    444453 3. The latency of fstp is avoided by calculating a result in one iteration, 
    445454     but not storing it to memory until the subsequent iteration. 
     455 4. (NOT YET IMPLEMENTED): first operation in the loop should be loading a scalar (for a multiply), 
     456    if possible, otherwise load an 80-bit vector, if possible. 
    446457 
    447458The generated code is of the form: 
     
    537548        char [] next; 
    538549      if (isInstruction(operations[done])) { 
     550            // Perform an arithmetic operation on the top two FPU stack items. 
    539551            next = "  " ~ opToX87(operations[done]) ~ "p ST(1), ST;"\n; 
    540552            mainbody ~= next; firstbody ~= next; 
     
    542554            numOnStack--; 
    543555      } else if (!isInstruction(operations[done+1])){ 
     556            // load a vector onto the FPU stack, to begin a new subexpression. 
    544557            int u  = operations[done]-'a'; 
    545558            next = "  fld "  ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 
     
    548561            numOnStack++; 
    549562      } else if (isVector(typelist[operations[done]-'a'])) { 
     563         // An operation will be performed between the stack top and a vector. 
     564         // If it's a float or double, we can combine the load+arithmetic op 
     565         // into a single instruction. 
    550566         if (typelist[operations[done]-'a']=='R') { 
    551567             // 80-bit vectors must be loaded onto the FPU stack first 
     
    576592    ++numOnStack; 
    577593} 
     594    // the last operation is special, because it may involve the 
     595    // destination vector (+=, -=, *=). 
    578596    if (!isDotProduct && finaloperation.length>1) { 
    579597        if (finaloperation[0]=='*') { 
     
    584602            char [] finalop = "fadd"; 
    585603            if (finaloperation[0]=='-') finalop="fsubr"; 
     604            char [] next; 
    586605             if (typelist[$-1]=='R') { 
    587606                 // 80-bit vectors must be loaded onto the FPU stack first 
    588                 mainbody ~= "  fld real ptr ["  ~ vectorRegister[numvecs-1] ~ "];"\n; 
    589                 mainbody ~= "  " ~ finalop ~ "p ST(1), ST;\n"; 
     607                next = "  fld real ptr ["  ~ vectorRegister[numvecs-1] ~ "];"\n; 
     608                next ~= "  " ~ finalop ~ "p ST(1), ST;\n"; 
    590609             } else { 
    591                 mainbody ~= "  " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister[numvecs-1] ~ " + " 
     610                next = "  " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister[numvecs-1] ~ " + " 
    592611                ~ vectorSize(typelist[$-1]) ~ "*ESI];"\n; 
    593612            } 
     613            mainbody ~=next; firstbody~=next; 
    594614        } 
    595615    } 
     
    600620    if (isDotProduct) result ~= "  faddp ST(2), ST;"\n; 
    601621    else { 
    602         result ~= storeVector(typelist[$-1], numvecs-1, " - " ~ vectorSize(typelist[$-1])); 
     622        result ~= storeVector(typelist[$-1], numvecs-1); 
    603623    } 
    604624 
    605625    result ~= "L2: \n"; 
    606626 
     627    // Update the counters 
    607628    result~= incrementRealVectors ~ "  inc ESI;\n  jnz L1;\n"; 
    608629 
     630    // Store the result from the final iteration 
    609631    if (isDotProduct) result ~= "  faddp ST(1), ST;"\n; 
    610632    else result ~= storeVector(typelist[$-1], numvecs-1); 
    611633 
    612         // Discard any scalars that are left on the stack. 
     634    // Discard any scalars that are left on the stack 
    613635    if (isDotProduct && numScalarsOnStack>0) { 
    614636        // Preserve the result of the dot product 
     
    616638    } 
    617639    while (numScalarsOnStack>1) { 
    618         result~= "  fcompp ST(0), ST;"\n; 
     640        result~= "  fcompp ST(0), ST;"\n; // pop two values at once 
    619641        numScalarsOnStack-=2; 
    620642    } 
     
    634656    auto p = Vec([1.0L, 2, 18]); 
    635657    auto q = Vec([3.5L, 1.1, 3.8]); 
    636     auto r = Vec([17.0f, 28.1, 1]); 
     658    auto r = Vec([17.0f, 28.25, 1]); 
    637659    auto z = Vec([17.0i, 28.1i, 1i]); 
    638     q -= ((r+p)*18.0L*314.1L - (p-r))* 35; 
    639660    real d = dot(r, p+r+r); 
     661    assert(d==2267.625); 
    640662    ireal e = dot(r, z); 
    641663    writefln(d, " ", e); 
     664 
     665    q -= ((r+p)*18.0L*314.1L - (p-r))* 35; 
     666    d = dot(r, p+r+r); 
     667    writefln(d, " ", e); 
     668    assert(d==2267.625); 
     669/* 
    642670    p = r - q*2.0; 
    643671    p*=5.6L; 
    644672    z = (r*3.1i + r*5.0i)*7.1; 
    645673    z*=3.1; 
    646 
     674*/ 
     675