Changeset 91

Show
Ignore:
Timestamp:
03/26/07 04:48:51 (2 years ago)
Author:
Don Clugston
Message:

Improved comments.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/mathextra/Blade.d

    r90 r91  
    190190} 
    191191 
    192 VectorExpr!(X, "a", Q, X[]) Vec(X, int Q)(X[Q] vals) { VectorExpr!(X, "a", Q, X[]) a; a.values[0]=vals; return a; } 
    193 VectorExpr!(X, "a", 0, X[]) Vec(X)(X[] vals) { VectorExpr!(X, "a", 0, X[]) a; a.values[0]=vals; return a; } 
     192// Convert static arrays to dynamic, but remember the length as a compile-time parameter. 
     193VectorExpr!(X, "a", Q, X[]) Vec(X, int Q)(X[Q] vals) { 
     194    VectorExpr!(X, "a", Q, X[]) a; 
     195    a.values[0]=vals; 
     196    return a; 
     197}
     198 
     199VectorExpr!(X, "a", 0, X[]) Vec(X)(X[] vals) { 
     200    VectorExpr!(X, "a", 0, X[]) a; 
     201    a.values[0]=vals; return a; 
     202}
    194203 
    195204// Returns ireal if one of A or B is real, and the other is imaginary. 
     
    205214} 
    206215 
    207 // Note that this uses only built-in types. 
    208216real performOperation(char [] operations, char [] finaloperation, int knownlength, X...)(X expr) 
    209217{ 
     
    296304template singleType(A) 
    297305{ 
    298          static if (is(A == real[])|| is (A==ireal[]))   const char [] singleType = "R"; 
    299     else static if (is(A == double[])||is(A == idouble[])) const char [] singleType = "D"; 
    300     else static if (is(A == float[])|| is(A==ifloat[]))  const char [] singleType = "F"; 
     306         static if (is(A == real[])  || is(A==ireal[]))    const char [] singleType = "R"; 
     307    else static if (is(A == double[])|| is(A == idouble[]))const char [] singleType = "D"; 
     308    else static if (is(A == float[]) || is(A==ifloat[]))   const char [] singleType = "F"; 
    301309    else static if (is(A == real))   const char [] singleType = "S"; 
    302310    else const char [] singleType = "?"; 
    303311} 
    304312 
     313// A CTFE function can't randomly index a tuple, so convert the type information 
     314// into a char[]. 
    305315template vectorTupleToString(X...) 
    306316{ 
     
    332342} 
    333343 
     344 
     345int vectorNum(char [] typelist, char var) 
     346{ 
     347    int numVecs=0; 
     348    for (int i=0; i<var-'a'; ++i) { 
     349        if (typelist[i]=='R' || typelist[i]=='D' || typelist[i]=='F') ++numVecs; 
     350    } 
     351    return numVecs; 
     352} 
     353 
     354int scalarNum(char [] typelist, char var) 
     355{ 
     356    int k=0; 
     357    for (int i=0; i<var-'a'; ++i) { 
     358        if (typelist[i]=='S') ++k; 
     359    } 
     360    return k; 
     361} 
     362 
    334363// ------------------------------- 
    335364//   PART 4 -- Generate x87 ASM code 
     
    338367char [] operandSize(char var) 
    339368{ 
    340          if (var == 'R') return "real ptr "; 
    341     else if (var == 'D') return "double ptr "; 
    342     else if (var == 'F') return "float ptr "; 
    343     else if (var == 'S') return "(scalar)"; 
     369    switch(var) { 
     370        case 'R': return "real ptr "; 
     371        case 'D': return "double ptr "; 
     372        case 'F': return "float ptr "; 
     373    } 
    344374} 
    345375 
    346376char [] opToX87(char op) 
    347377{ 
    348     if (op=='*' || op=='.') return "fmul"; 
    349     else if (op=='+') return "fadd"; 
    350     else if (op=='-') return "fsub"; 
    351     else if (op=='~') return "fsubr"; 
    352 }
     378    switch (op) { 
     379        case '*': 
     380        case '.': return "fmul"; 
     381        case '+': return "fadd"; 
     382        case '-': return "fsub"; 
     383        case '~': return "fsubr"; 
     384    } 
     385}
     386 
     387 
     388static if (real.sizeof==10)      const char [] REALSIZE = "10"; 
     389else static if (real.sizeof==12) const char [] REALSIZE = "12"; 
     390else static if (real.sizeof==16) const char [] REALSIZE = "16"; 
    353391 
    354392char [] vectorSize(char vartype) 
    355393{ 
    356     if (vartype=='D') return "8"; 
    357     else if (vartype=='F') return "4"; 
    358     else if (vartype=='R') return REALSIZE; 
    359 }
    360  
    361 int vectorNum(char [] typelist, char var) 
    362 {
    363     int numVecs=0; 
    364     for (int i=0; i<var-'a'; ++i) { 
    365         if (typelist[i]=='R' || typelist[i]=='D' || typelist[i]=='F') ++numVecs; 
    366     } 
    367     return numVecs; 
    368 }
    369  
    370 int scalarNum(char [] typelist, char var) 
    371 {
    372     int k=0; 
    373     for (int i=0; i<var-'a'; ++i) { 
    374         if (typelist[i]=='S') ++k; 
    375     } 
    376     return k; 
     394    switch (vartype) { 
     395        case 'D': return "8"; 
     396        case 'F': return "4"; 
     397        case 'R': return REALSIZE; 
     398    } 
    377399} 
    378400 
     
    401423            vectorRegister[vectorNum(typelist, var)] ~ " + " ~ vectorSize(typelist[var-'a']) ~ "*ESI]"; 
    402424} 
    403  
    404 static if (real.sizeof==10) const char [] REALSIZE="10"; 
    405 else static if (real.sizeof==12) const char [] REALSIZE="12"; 
    406 else static if (real.sizeof==16) const char [] REALSIZE="16"; 
    407425 
    408426char [] storeVector(char type, int vecnum, char [] stride="") 
     
    415433} 
    416434 
    417 // Generate asm code which is optimal for x87 CPUs without SSE2 
    418 // (Pentium, PMMX, PII, PIII). 
    419 // Uses ESI as a counter and index variable, which begins negative and counts UP to zero. 
    420 // The latency of fstp is avoided by storing it in the subsequent iteration. 
    421 // The latency of fmul is avoided by swapping fadd/fsub with fmul whenever possible. 
     435/** Generate asm code which is optimal for x87 CPUs without SSE2 
     436 (Pentium, PMMX, PII, PIII). 
     437The key optimisation rules are: 
     438 1. keep the loop overhead to one clock cycle if possible. 
     439 2. (FMUL latency) don't use the result of a multiply immediately 
     440 3. (FST latency) don't save a value to memory immediately after it's calculated. 
     441Techniques to address these are: 
     442 1. Use ESI as a counter and index variable, which begins negative and counts UP to zero. 
     443 2. The latency of fmul is avoided by swapping fadd/fsub with fmul whenever possible. 
     444 3. The latency of fstp is avoided by calculating a result in one iteration, 
     445     but not storing it to memory until the subsequent iteration. 
     446 
     447The generated code is of the form: 
     448---- 
     449 load scalars onto FPU stack 
     450 load vector pointers into EAX, EBX, ... 
     451 calculate result[0] into ST(0) 
     452 goto L2 
     453L1: 
     454 calculate result[i+1] into ST(0) 
     455 swap so that result[i] is in ST(0) 
     456L2: 
     457 store result[i] 
     458 increment pointers, goto L1 if i<n-1 
     459 store result[n-1] 
     460 pop scalars off FPU stack 
     461---- 
     462 
     463*/ 
    422464char [] makeAsmX87(char [] typelist, char [] operations, char [] finaloperation) 
    423465{ 
     
    453495        "  mov ESI, veclength;"\n; // ESI will be the counter 
    454496 
    455         // Load all the vector pointers into registers, and the scalars onto the stack 
     497    // Load all the vector pointers into registers, and push all the scalars onto the stack 
     498 
    456499    int numvecs=0; 
    457500    int numconsts=0; 
     
    473516    } 
    474517 
    475         if (isDotProduct) result ~= "  fldz;"\n; 
     518    if (isDotProduct) result ~= "  fldz;"\n; 
    476519    result ~= "  xor ESI, ESI; "\n 
    477520        "  sub ESI, veclength; // counter=-length"\n 
     
    484527    int done=0; 
    485528 
     529    // Construct the main body of the loop (the main body does not include 
     530    // the final storage instruction, because of the FST latency). 
    486531    char [] mainbody = ""; 
    487532    char [] firstbody = ""; 
     
    518563        // NOTE: For scalar float or double values, we can multiply directly, saving one slot on the FP stack. 
    519564           // next = "  " ~ opToX87(operations[done+1]) ~ " double ptr var" ~ itoa(operations[done]-'a') ~";\n"; 
    520 //            mainbody ~= next; firstbody ~= next; 
     565           // mainbody ~= next; firstbody ~= next; 
    521566            done +=2; 
    522567      }