Changeset 88

Show
Ignore:
Timestamp:
03/21/07 07:01:22 (2 years ago)
Author:
Don Clugston
Message:

Support *=. Support real scalars. Tracks FP stack usage. Identifies vector lengths, when known.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/mathextra/Blade.d

    r87 r88  
    88*  - Supports any mix of vector addition, subtraction, dot product, and multiplication 
    99*    by a scalar. 
     10*  - 80-bit precision is used wherever possible (and it does not reduce execution speed). 
    1011*  - Supports mixed-length operations (eg, real[] + double[] + float[]). 
    1112*  - When static arrays are used, mismatches in array length are detected at compile time. 
     
    1718* 
    1819* BUGS: 
    19 *  - Not well tested. 
    20 *  - Not optimal for 80-bit vectors (can save an instruction by changing float + real into real+float). 
     20*  - Not well tested. No unit tests are included! 
     21*  - Not always optimal for 80-bit vectors (can save an instruction by changing float[] + real[] into real[]+float[]). 
    2122*  - Not optimal for the case of multiple real vectors (they could share a counter). 
    22 *  - Doesn't support multiply by scalar real (need to keep track of the FP stack to do this). 
    23 *  - Doesn't yet support all combinations (eg vector*=scalar). 
     23*  - Not optimal for the case where all vectors are 80-bit (two counters are used, but only one is required). 
    2424*  - Doesn't take advantage of length being known at compile time (loop unrolling 
    2525*     is possible). 
     
    9292    // ... and add it to all variables in the second expression, adding 
    9393    // parentheses if required. 
    94     if (second.length==1) return ret ~ op ~ cast(char)(nextarg+1); 
     94    if (second=="a") return ret ~ op ~ cast(char)(nextarg+1); 
    9595    ret ~= op ~ "("; 
    9696    for (int i=0; i<second.length; ++i) { 
     
    124124    static if (operations[$-2]=='*') { 
    125125        // Already a scalar multiply, so constant fold it. 
    126         VectorExpr!(operations, knownlength, B) opMul(double x) { 
     126        VectorExpr!(operations, knownlength, B) opMul(real x) { 
    127127            return VectorExpr!(operations, knownlength, B)(values[0..$-1], x*values[$-1]); 
    128128        } 
    129129    } else { 
    130         JoinResult!("*", "a", double) opMul(double x) { 
    131             return JoinResult!("*", "a", double)(values, x); 
     130        JoinResult!("*", "a", real) opMul(real x) { 
     131            return JoinResult!("*", "a", real)(values, x); 
    132132        } 
    133133    } 
     
    168168        performOperation!(expr.ops, "-=", len==0? expr.len : len, expr.ValueTuple, X[])(expr.values, values); 
    169169    } 
    170     void opMulAssign(A)(A w) { // We don't want to generate this code unless it's actually used. 
    171         static assert(is(A: double)); 
    172         performOperation!("a", "*=", knownlength, double, X[])(w, values); 
    173     } 
    174     VectorExpr!("a*b", knownlength, X[], double) opMul(double w) { 
    175         return VectorExpr!("a*b", knownlength, X[], double)(values, w); 
     170    void opMulAssign(A)(A w) { // Use a template, because we don't want to generate this code 
     171                               // unless it's actually used. 
     172        static assert(is(A: real)); 
     173        performOperation!("a", "*=", knownlength, real, X[])(w, values); 
     174    } 
     175    VectorExpr!("a*b", knownlength, X[], real) opMul(real w) { 
     176        return VectorExpr!("a*b", knownlength, X[], real)(values, w); 
    176177    } 
    177178    VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[]) opAdd(int T, Q)(Q[T] w) { 
     
    253254    else static if (is(A == double[])) const char [] singleType = "D"; 
    254255    else static if (is(A == float[]))  const char [] singleType = "F"; 
    255     else static if (is(A == double))   const char [] singleType = "S"; 
     256    else static if (is(A == real))   const char [] singleType = "S"; 
    256257    else const char [] singleType = "?"; 
    257258} 
     
    274275 
    275276 
    276 bool isVector(char [] typelist, char var) 
    277 
    278     int i = var-'a'; 
    279     return (typelist[i]=='R' || typelist[i]=='D' || typelist[i]=='F'); 
     277bool isVector(char var) 
     278
     279    return (var=='R' || var=='D' || var=='F'); 
    280280} 
    281281 
     
    338338} 
    339339 
    340     // First, use the registers that don't need to be preserved. 
     340int scalarNum(char [] typelist, char var) 
     341
     342    int k=0; 
     343    for (int i=0; i<var-'a'; ++i) { 
     344        if (typelist[i]=='S') ++k; 
     345    } 
     346    return k; 
     347
     348 
     349// First, use the scratch registers (EAX, ECX, EDX). If that's not enough, 
     350// use EBX, ESI, and EDI. Finally, use the frame register EBP. 
    341351const char [][5] VectorRegisters = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 
    342352 
     
    370380else static if (real.sizeof==12) const char [] REALSIZE="12"; 
    371381else static if (real.sizeof==16) const char [] REALSIZE="16"; 
     382 
     383char [] storeVector(char type, int vecnum, char [] stride="") 
     384{ 
     385    if (type=='R') { 
     386        return "  fstp real ptr [" ~ vectorRegister(vecnum) ~ stride ~ "];"\n; 
     387    } else { 
     388        return "  fstp " ~ operandSize(type) ~ " [" ~ vectorRegister(vecnum) ~ " + " ~ vectorSize_LEA(type) ~ stride ~ "];"\n; 
     389    } 
     390} 
    372391 
    373392// Generate asm code which is optimal for x87 CPUs without SSE2 
     
    385404    bool firstvec=true; 
    386405    int vecnum = 0; 
     406    int scalarnum = 0; 
    387407    for (int i=0; i< typelist.length;++i) { 
    388408        if (typelist[i]=='S'){ 
    389             result~= "  auto var" ~ itoa(i) ~ " = expr[" ~ itoa(i) ~ "];\n"; 
     409            result~= "  real var" ~ itoa(i) ~ " = expr[" ~ itoa(i) ~ "];\n"; 
     410            ++scalarnum; 
    390411        } else { 
    391             result~= "  auto vec" ~ itoa(vecnum) ~ " = expr[" ~itoa(i) ~"].ptr;\n"; 
     412            result~= "  auto vec" ~ itoa(i) ~ " = expr[" ~itoa(i) ~"].ptr;\n"; 
    392413            if (typelist[i]=='R') { 
    393414                incrementRealVectors ~= "  add " ~ vectorRegister(vecnum) ~ ", " ~ REALSIZE ~ ";\n"; 
     
    401422    } 
    402423    assert(vecnum-1 < VectorRegisters.length, "Too many vectors!"); 
    403  
    404424    bool isDotProduct = (operations[$-1]=='.'); 
     425    int numScalarsOnStack=0; 
    405426 
    406427    result~= \n"asm {"\n ~ pushRegisters(vecnum) ~ 
    407428        "  mov ESI, veclength;"\n; // ESI will be the counter 
    408429 
    409         // Load all the vector pointers into registers. 
     430 
     431        // Load all the vector pointers into registers, and the scalars onto the stack 
    410432    int numvecs=0; 
     433    int numconsts=0; 
    411434    for (int i=0; i<typelist.length; ++i) { 
    412       if (isVector(typelist, i+'a')) { 
    413           if (isRealVector(typelist, i+'a')) { 
    414               result ~= "  mov " ~ vectorRegister(numvecs) ~ ", vec" ~ itoa(numvecs) ~ ";"\n; 
     435      if (isVector(typelist[i])) { 
     436          if (typelist[i]=='R') { 
     437              result ~= "  mov " ~ vectorRegister(numvecs) ~ ", dword ptr vec" ~ itoa(i) ~ ";"\n; 
    415438          } else  { 
    416439            result ~= "  lea " ~ vectorRegister(numvecs) ~ ", [" 
    417               ~ vectorSize_LEA(typelist[i]) ~ "];"\n 
    418               ~ "  add " ~ vectorRegister(numvecs) ~ ", vec" ~ itoa(numvecs) ~ ";"\n; 
     440              ~ vectorSize_LEA(typelist[i]) ~ "];   " 
     441              ~ "  add " ~ vectorRegister(numvecs) ~ ", vec" ~ itoa(i) ~ ";"\n; 
    419442         } 
    420443        ++numvecs; 
     444      } else { 
     445          result ~= "  fld real ptr var"~ itoa(i) ~";\n"; 
     446          ++numconsts; 
     447          ++numScalarsOnStack; 
    421448      } 
    422449    } 
    423     if (isDotProduct) result ~= "  fldz;"\n; 
     450 
     451        if (isDotProduct) result ~= "  fldz;"\n; 
    424452    result ~= "  xor ESI, ESI; "\n 
    425453        "  sub ESI, veclength; // counter=-length"\n 
    426454        "  jz short L3; // test for length==0"\n; 
    427  
     455    if (!isDotProduct && operations.length==1 && finaloperation[0]=='*') { 
     456            ++numScalarsOnStack; 
     457            // load multiplier for *= 
     458            result ~= "  fld double ptr var0;\n"; 
     459    } 
    428460    int done=0; 
    429461 
    430462    char [] mainbody = ""; 
     463    char [] firstbody = ""; 
     464    int numOnStack = 0; // How much of the FP stack is being used? 
    431465 
    432466if (operations.length>1) { 
    433467    while(done<operations.length) { 
     468        char [] next; 
    434469      if (isInstruction(operations[done])) { 
    435             mainbody ~= "  " ~ opToX87(operations[done]) ~ "p ST(1), ST;"\n; 
     470            next = "  " ~ opToX87(operations[done]) ~ "p ST(1), ST;"\n; 
     471            mainbody ~= next; firstbody ~= next; 
    436472            ++done; 
     473            numOnStack--; 
    437474      } else if (!isInstruction(operations[done+1])){ 
    438475            int u  = operations[done]-'a'; 
    439             mainbody ~= "  fld "  ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 
     476            next = "  fld "  ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 
     477            mainbody ~= next; firstbody ~= next; 
    440478            ++done; 
    441       } else if (isVector(typelist, operations[done])) { 
    442          if (isRealVector(typelist, operations[done])) { 
     479            numOnStack++; 
     480      } else if (isVector(typelist[operations[done]-'a'])) { 
     481         if (typelist[operations[done]-'a']=='R') { 
    443482             // 80-bit vectors must be loaded onto the FPU stack first 
    444             mainbody ~= "  fld real ptr ["  ~ vectorRegister(vectorNum(typelist, operations[done])) ~ "];\n"; 
    445             mainbody ~= "  " ~ opToX87(operations[done+1]) ~ "p ST(1), ST;\n"; 
     483            next = "  fld real ptr ["  ~ vectorRegister(vectorNum(typelist, operations[done])) ~ "];\n" 
     484                ~ "  " ~ opToX87(operations[done+1]) ~ "p ST(1), ST;\n"; 
    446485         } else { 
    447             mainbody ~= "  " ~ opToX87(operations[done+1]) ~ " " 
     486            next = "  " ~ opToX87(operations[done+1]) ~ " " 
    448487              ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 
    449488        } 
     489        mainbody ~= next; firstbody ~= next; 
    450490        done +=2; 
    451491      } else { // multiply by scalar 
    452             mainbody ~= "  " ~ opToX87(operations[done+1]) ~ " double ptr var" ~ itoa(operations[done]-'a') ~";\n"; 
     492        firstbody ~= "  fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - scalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 
     493        mainbody ~= "  fmul ST, ST(" ~ itoa(1 + numOnStack + numScalarsOnStack - scalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 
     494        // NOTE: For scalar float or double values, we can multiply directly, saving one slot on the FP stack. 
     495           // next = "  " ~ opToX87(operations[done+1]) ~ " double ptr var" ~ itoa(operations[done]-'a') ~";\n"; 
     496//            mainbody ~= next; firstbody ~= next; 
    453497            done +=2; 
    454498      } 
    455499    } 
     500} else { 
     501    char [] next; 
     502    if (typelist[$-1]=='R') 
     503        next = "  fld real ptr ["  ~ vectorRegister(0) ~ "];\n"; 
     504    else next = "  fld "  ~ indexedVector(typelist, operations[0]) ~ ";\n"; 
     505    mainbody ~=next; firstbody~=next; 
     506    ++numOnStack; 
    456507} 
    457508    if (!isDotProduct && finaloperation.length>1) { 
    458         if (finaloperation[0]=='*') mainbody ="  fmul ST, ST(2);"\n; 
    459         else { 
     509        if (finaloperation[0]=='*') { 
     510            firstbody ~= "  fmul ST, ST(" ~ itoa(numOnStack) ~ ");"\n; 
     511            // +1 because previous result is also on stack 
     512            mainbody ~= "  fmul ST, ST(" ~ itoa(numOnStack+1) ~ ");"\n; 
     513        } else { 
    460514            char [] finalop = "fadd"; 
    461515            if (finaloperation[0]=='-') finalop="fsubr"; 
    462516             if (typelist[$-1]=='R') { 
    463517                 // 80-bit vectors must be loaded onto the FPU stack first 
    464                 mainbody ~= "  fld real ptr ["  ~ vectorRegister(numvecs-1) ~ " + " 
    465                 ~ vectorSize_LEA(typelist[$-1]) ~ "];"\n; 
     518                mainbody ~= "  fld real ptr ["  ~ vectorRegister(numvecs-1) ~ "];"\n; 
    466519                mainbody ~= "  " ~ finalop ~ "p ST(1), ST;\n"; 
    467520             } else { 
     
    471524        } 
    472525    } 
    473     result ~= \n ~  mainbody  ~ "  jmp short L2;\n" 
    474         ~ "L1:\n" ~ mainbody; 
     526    result ~= \n ~  firstbody  ~ "  jmp short L2;\n" 
     527        ~ "  align 4;\n" ~ "L1:\n" ~ mainbody; 
    475528 
    476529    result ~= "  fxch ST(1), ST;\n"; // get previous result 
    477530    if (isDotProduct) result ~= "  faddp ST(2), ST;"\n; 
    478     else result ~= "  fstp " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister(numvecs-1) ~ " + " ~ vectorSize_LEA(typelist[$-1]) ~ " - " ~ vectorSize(typelist[$-1]) ~ "];"\n; 
     531    else { 
     532        result ~= storeVector(typelist[$-1], numvecs-1, " - " ~ vectorSize(typelist[$-1])); 
     533    } 
    479534 
    480535    result ~= "L2: \n"; 
    481536 
    482537    result~= incrementRealVectors ~ "  inc ESI;\n  jnz L1;\n"; 
     538 
    483539    if (isDotProduct) result ~= "  faddp ST(1), ST;"\n; 
    484     else result ~= "  fstp " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister(numvecs-1) ~ " + " ~ vectorSize_LEA(typelist[$-1]) ~ "];"\n; 
     540    else result ~= storeVector(typelist[$-1], numvecs-1); 
     541 
     542        // Discard any scalars that are left on the stack. 
     543    if (isDotProduct && numScalarsOnStack>0) { 
     544        // Preserve the result of the dot product 
     545        result ~= "  fxch ST(" ~ itoa(numScalarsOnStack) ~ "), ST;"\n; 
     546    } 
     547    while (numScalarsOnStack>1) { 
     548        result~= "  fcompp ST(0), ST;"\n; 
     549        numScalarsOnStack-=2; 
     550    } 
     551    if (numScalarsOnStack==1) result~= "  fstp ST(0), ST;"\n; 
     552 
     553 
    485554    result~= "L3:" \n ~ popRegisters(vecnum) ~ "}\r\n"; 
    486555    return result;