Changeset 89

Show
Ignore:
Timestamp:
03/22/07 04:50:33 (2 years ago)
Author:
Don Clugston
Message:

Blade now supports temporary-vector. Radical simplification -- combined VectorProxy? and VectorExpr?.
Added X87 optimisations to avoid fmul latency and perform real[] operations efficiently.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/mathextra/Blade.d

    r88 r89  
    44* Generate near-optimal x87 asm code for BLAS1 basic vector operations at compile time. 
    55* 32, 64 and 80 bit vectors are all supported. 
     6* Uses techniques described in Agner Fog's superb optimisation manual (www.agner.org). 
     7* 
     8* Author: 
     9*   Don Clugston. 
     10* License: 
     11*   Public domain. 
    612* 
    713* FEATURES: 
    814*  - Supports any mix of vector addition, subtraction, dot product, and multiplication 
    9 *    by a scalar. 
    10 *  - 80-bit precision is used whereever possible (and it does not reduce execturion speed). 
     15*    by a real scalar. 
     16*  - 80-bit precision is used whenever possible (and does not reduce execution speed). 
    1117*  - Supports mixed-length operations (eg, real[] + double[] + float[]). 
    1218*  - When static arrays are used, mismatches in array length are detected at compile time. 
     
    1723* and is faster than the unrolled solution originally published by Intel. 
    1824* 
    19 * BUGS
     25* BUGS/ FUTURE DIRECTIONS
    2026*  - Not well tested. No unit tests are included! 
    21 *  - Not always optimal for 80-bit vectors (can save an instruction by changing float[] + real[] into real[]+float[]). 
    2227*  - Not optimal for the case of multiple real vectors (they could share a counter). 
    2328*  - Not optimal for the case where all vectors are 80-bit (two counters are used, but only is required). 
     
    103108 
    104109/** 
    105  * A proxy for a temporary result of vector type. 
     110 * A proxy for a vector operation returning a vector type. 
    106111 * Stores all the arguments as a tuple, and the operations 
    107112 * as a character string. 
     113 * Note that if operations=="a", it's a single vector; otherwise, it's a temporary. 
     114 * 'knownlength' is the length of vector, when known at compile time. 
     115 * If 'knownlength' == 0, it is unknown at compile time. 
    108116 */ 
    109117struct VectorExpr(char[] operations, int knownlength, B...) { 
     
    122130        return q; 
    123131    } 
    124     static if (operations[$-2]=='*') { 
     132    static if (operations.length>1 && operations[$-2]=='*') { 
    125133        // Already a scalar multiply, so constant fold it. 
    126134        VectorExpr!(operations, knownlength, B) opMul(real x) { 
     
    138146        return JoinResult!("-", C.ops, C.ValueTuple)(values, x.values); 
    139147    } 
    140 
    141  
    142 private const char [] VectorProxyA = "a"; 
    143  
    144 /** 
    145  * A simple wrapper for a built-in array of type X. 
    146  * Allows vector addition and subtraction of *any* array type; we'll catch 
    147  * incompatibilities at the code generation step. 
    148  * if 'knownlength' == 0, it is unknown at compile time. 
    149  */ 
    150 struct VectorProxy(X, int knownlength) { 
    151     alias VectorProxyA ops; 
    152     alias knownlength len; 
    153     alias X[] ValueTuple; 
    154     X [] values; 
    155  
    156     static VectorProxy!(X, 0) opCall(dummy=void)(X [] vals) { VectorProxy!(X, 0) a; a.values=vals; return a; } 
    157     static VectorProxy!(X, Q) opCall(int Q)(X [Q] vals) { VectorProxy!(X, Q) a; a.values=vals; return a; } 
     148 
     149    // The opAssign operations are only valid for single vectors, not for temporaries 
     150  static if (operations=="a") { 
    158151    void opAssign(A)(A expr) { 
    159152        static assert(len==0 || expr.len == 0 || len == expr.len, "Vector lengths must match"); 
    160         performOperation!(expr.ops, "=", len==0? expr.len : len, expr.ValueTuple, X[])(expr.values, values); 
     153        performOperation!(expr.ops, "=", len==0? expr.len : len, expr.ValueTuple, B[0])(expr.values, values); 
    161154    } 
    162155    void opAddAssign(A)(A expr) { 
    163156        static assert(len==0 || expr.len == 0 || len == expr.len, "Vector lengths must match"); 
    164         performOperation!(expr.ops, "+=", len==0? expr.len : len, expr.ValueTuple, X[])(expr.values, values); 
     157        performOperation!(expr.ops, "+=", len==0? expr.len : len, expr.ValueTuple, B[0])(expr.values, values); 
    165158    } 
    166159    void opSubAssign(A)(A expr) { 
    167160        static assert(len==0 || expr.len == 0 || len == expr.len, "Vector lengths must match"); 
    168         performOperation!(expr.ops, "-=", len==0? expr.len : len, expr.ValueTuple, X[])(expr.values, values); 
     161        performOperation!(expr.ops, "-=", len==0? expr.len : len, expr.ValueTuple, B[0])(expr.values, values); 
    169162    } 
    170163    void opMulAssign(A)(A w) { // Use a template, because we don't want to generate this code 
    171164                               // unless it's actually used. 
    172165        static assert(is(A: real)); 
    173         performOperation!("a", "*=", knownlength, real, X[])(w, values); 
    174     } 
    175     VectorExpr!("a*b", knownlength, X[], real) opMul(real w) { 
    176         return VectorExpr!("a*b", knownlength, X[], real)(values, w); 
    177     } 
    178     VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[]) opAdd(int T, Q)(Q[T] w) { 
    179         static assert(T==0 || knownlength==0 || T==knownlength); 
    180         return VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[])(values, w); 
    181     } 
    182     VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[]) opAdd(Q, int T)(VectorProxy!(Q, T) w) { 
    183         static assert(T==0 || knownlength==0 || T==knownlength); 
    184         return VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[])(values, w.values); 
    185     } 
    186     VectorExpr!("a-b", knownlength==0 ? T:knownlength, X[], Q[]) opSub(int T, Q)(Q[T] w) { 
    187         static assert(T==0 || knownlength==0 || T==knownlength); 
    188         return VectorExpr!("a-b", knownlength==0 ? T:knownlength, X[], Q[])(values, w); 
    189     } 
    190     VectorExpr!("a-b", knownlength==0 ? T:knownlength, X[], Q[]) opSub(Q, int T)(VectorProxy!(Q, T) w) { 
    191         static assert(T==0 || knownlength==0 || T==knownlength); 
    192         return VectorExpr!("a-b", knownlength==0 ? T:knownlength, X[], Q[])(values, w.values); 
    193     } 
    194 
    195  
    196 VectorProxy!(X, 0) Vec(X)(X[] vals) { return VectorProxy!(X, 0)(vals); } 
    197 VectorProxy!(X, Q) Vec(X, int Q)(X[Q] vals) { return VectorProxy!(X, Q)(vals); } 
     166        performOperation!("a", "*=", knownlength, real, B[0])(w, values); 
     167    } 
     168  } 
     169
     170 
     171 
     172VectorExpr!("a", Q, X[]) Vec(X, int Q)(X[Q] vals) { VectorExpr!("a", Q, X[]) a; a.values[0]=vals; return a; } 
     173VectorExpr!("a", 0, X[]) Vec(X)(X[] vals) { VectorExpr!("a", 0, X[]) a; a.values[0]=vals; return a; } 
    198174 
    199175real dot(A, B)(A a, B b) 
     
    206182real performOperation(char [] operations, char [] finaloperation, int knownlength, X...)(X expr) 
    207183{ 
    208      pragma(msg, finaloperation ~ operations); 
    209      const char [] post = makePostfix(operations); 
     184     pragma(msg, finaloperation ~ " " ~ operations); 
    210185     const char [] tupstr = vectorTupleToString!(X); 
     186     const char [] post = makePostfixForX87(operations, tupstr); 
    211187     pragma(msg, post); 
    212188     pragma(msg, tupstr); 
    213189    static if (knownlength!=0) pragma(msg, "Length is known!"); 
    214      const char [] qqq = makeAsmX87(vectorTupleToString!(X), makePostfix(operations), finaloperation); 
     190     const char [] qqq = makeAsmX87(tupstr, makePostfixForX87(operations, tupstr), finaloperation); 
     191     const char [] qqq2 = makeAsmX87(tupstr, makePostfix(operations), finaloperation); 
    215192     pragma(msg, qqq); 
    216193 
    217     mixin(makeAsmX87(vectorTupleToString!(X), makePostfix(operations), finaloperation)); 
     194    mixin(makeAsmX87(vectorTupleToString!(X), makePostfixForX87(operations, tupstr), finaloperation)); 
    218195} 
    219196 
     
    249226} 
    250227 
     228// Apply x87-specific optimisations while converting to postfix 
     229char [] makePostfixForX87(char [] operations, char [] typelist) 
     230{ 
     231    if (operations.length==1) return operations; 
     232 
     233    int x = exprLength(operations); 
     234    char [] first = operations[0..x+1]; 
     235    char [] second = operations[x+2..$]; 
     236    if (first[0]=='(') { 
     237        first = makePostfixForX87(first[1..$-1], typelist); 
     238    } 
     239    if (second[0]=='(') { 
     240        second = makePostfixForX87(second[1..$-1], typelist); 
     241    } 
     242    // x87 OPTIMISATION #1 
     243    // On x87, fmul has a long latency, so we want to delay using the 
     244    // result of a multiply. Since + is commutative, we can achieve this 
     245    // by calculating the value with the multiply, before the other one. 
     246    if (operations[x+1]=='+') { 
     247        if (second[$-1]=='*' && first[$-1]!='*') { 
     248           return second ~ first ~ operations[x+1..x+2]; 
     249        } 
     250    } 
     251    // We can also do the same thing with -, but we need to use fsubr 
     252    // instead of fsub. 
     253    if (operations[x+1]=='-') { 
     254        if (second[$-1]=='*' && first[$-1]!='*') { 
     255           return second ~ first ~ "~"; 
     256        } 
     257    } 
     258    // x87 OPTIMISATION #2 
     259    // When an operation is performed between a real[] and a non-real[], 
     260    // we want to have the real[] being the one which is loaded first. 
     261    if (second.length==1 && typelist[second[0]-'a']=='R' && operations[x+1]=='+') { 
     262           return second ~ first ~ "+"; 
     263    } 
     264    if (second.length==1 && typelist[second[0]-'a']=='R' && operations[x+1]=='-') { 
     265           return second ~ first ~ "~"; 
     266    } 
     267    return first ~ second ~ operations[x+1..x+2]; 
     268} 
     269 
    251270template singleType(A) 
    252271{ 
     
    268287// ------------------------------- 
    269288 
    270  
    271289bool isInstruction(char op) 
    272290{ 
    273     return (op=='+' || op=='*' || op=='-'|| op=='.'); 
    274 
    275  
     291    return (op=='+' || op=='*' || op=='-'|| op=='.'|| op=='~'); 
     292
    276293 
    277294bool isVector(char var) 
    278295{ 
    279296    return (var=='R' || var=='D' || var=='F'); 
    280 } 
    281  
    282 bool isRealVector(char [] typelist, char var) 
    283 { 
    284     return typelist[var-'a']=='R'; 
    285297} 
    286298 
     
    304316    else if (var == 'F') return "float ptr "; 
    305317    else if (var == 'S') return "(scalar)"; 
    306     else return "(???)"; 
    307318} 
    308319 
     
    312323    else if (op=='+') return "fadd"; 
    313324    else if (op=='-') return "fsub"; 
    314 
    315  
    316 char [] vectorSize_LEA(char var) 
    317 
    318     if (var=='D') return "8*ESI"; 
    319     else if (var=='F') return "4*ESI"; 
    320     else return "ESI"; 
     325    else if (op=='~') return "fsubr"; 
    321326} 
    322327 
     
    326331    else if (vartype=='F') return "4"; 
    327332    else if (vartype=='R') return REALSIZE; 
    328     assert(0); 
    329333} 
    330334 
     
    349353// First, use the scratch registers (EAX, ECX, EDX). If that's not enough, 
    350354// use EBX, ESI, and EDI. Finally, use the frame register EBP. 
    351 const char [][5] VectorRegisters = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 
    352  
    353 char [] vectorRegister(int vecnum) 
    354 
    355     return VectorRegisters[vecnum]; 
    356 
     355const char [][5] vectorRegister = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 
    357356 
    358357char [] pushRegisters(int numVectors) 
    359358{ 
    360359    char [] result = "  push ESI;"; 
    361     for (int i=3; i<numVectors; ++i) result~= " push " ~ VectorRegisters[i] ~ ";"; 
     360    for (int i=3; i<numVectors; ++i) result~= " push " ~ vectorRegister[i] ~ ";"; 
    362361    return result ~ "\n"; 
    363362} 
     
    366365{ 
    367366    char [] result = "  "; 
    368     for (int i=numVectors-1; i>=3; --i) result~= "pop " ~ VectorRegisters[i] ~ "; "; 
     367    for (int i=numVectors-1; i>=3; --i) result~= "pop " ~ vectorRegister[i] ~ "; "; 
    369368    return result ~ "pop ESI;\n"; 
    370369} 
     
    372371char [] indexedVector(char [] typelist, char var) 
    373372{ 
    374     if (typelist[var-'a']=='R') return " real ptr [" ~ vectorRegister(vectorNum(typelist, var)) ~ "]"; 
     373    if (typelist[var-'a']=='R') return " real ptr [" ~ vectorRegister[vectorNum(typelist, var)] ~ "]"; 
    375374    return operandSize(typelist[var-'a']) ~ "[" ~ 
    376             vectorRegister(vectorNum(typelist, var)) ~ " + " ~ vectorSize(typelist[var-'a']) ~ "*ESI]"; 
     375            vectorRegister[vectorNum(typelist, var)] ~ " + " ~ vectorSize(typelist[var-'a']) ~ "*ESI]"; 
    377376} 
    378377 
     
    384383{ 
    385384    if (type=='R') { 
    386         return "  fstp real ptr [" ~ vectorRegister(vecnum) ~ stride ~ "];"\n; 
     385        return "  fstp real ptr [" ~ vectorRegister[vecnum] ~ stride ~ "];"\n; 
    387386    } else { 
    388         return "  fstp " ~ operandSize(type) ~ " [" ~ vectorRegister(vecnum) ~ " + " ~ vectorSize_LEA(type) ~ stride ~ "];"\n; 
     387        return "  fstp " ~ operandSize(type) ~ " [" ~ vectorRegister[vecnum] ~ " + " ~ vectorSize(type)~ "*ESI" ~ stride ~ "];"\n; 
    389388    } 
    390389} 
     
    412411            result~= "  auto vec" ~ itoa(i) ~ " = expr[" ~itoa(i) ~"].ptr;\n"; 
    413412            if (typelist[i]=='R') { 
    414                 incrementRealVectors ~= "  add " ~ vectorRegister(vecnum) ~ ", " ~ REALSIZE ~ ";\n"; 
     413                incrementRealVectors ~= "  add " ~ vectorRegister[vecnum] ~ ", " ~ REALSIZE ~ ";\n"; 
    415414            } 
    416415            if (firstvec) { 
     
    421420        } 
    422421    } 
    423     assert(vecnum-1 < VectorRegisters.length, "Too many vectors!"); 
     422    assert(vecnum-1 < vectorRegister.length, "Too many vectors!"); 
    424423    bool isDotProduct = (operations[$-1]=='.'); 
    425424    int numScalarsOnStack=0; 
     
    427426    result~= \n"asm {"\n ~ pushRegisters(vecnum) ~ 
    428427        "  mov ESI, veclength;"\n; // ESI will be the counter 
    429  
    430428 
    431429        // Load all the vector pointers into registers, and the scalars onto the stack 
     
    435433      if (isVector(typelist[i])) { 
    436434          if (typelist[i]=='R') { 
    437               result ~= "  mov " ~ vectorRegister(numvecs) ~ ", dword ptr vec" ~ itoa(i) ~ ";"\n; 
     435              result ~= "  mov " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"\n; 
    438436          } else  { 
    439             result ~= "  lea " ~ vectorRegister(numvecs) ~ ", [" 
    440               ~ vectorSize_LEA(typelist[i]) ~ "];   " 
    441               ~ "  add " ~ vectorRegister(numvecs) ~ ", vec" ~ itoa(i) ~ ";"\n; 
     437            result ~= "  lea " ~ vectorRegister[numvecs] 
     438              ~ ", [" ~ vectorSize(typelist[i]) ~ "*ESI];   " 
     439              ~ "  add " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"\n; 
    442440         } 
    443441        ++numvecs; 
     
    481479         if (typelist[operations[done]-'a']=='R') { 
    482480             // 80-bit vectors must be loaded onto the FPU stack first 
    483             next = "  fld real ptr ["  ~ vectorRegister(vectorNum(typelist, operations[done])) ~ "];\n" 
     481            next = "  fld real ptr ["  ~ vectorRegister[vectorNum(typelist, operations[done])] ~ "];\n" 
    484482                ~ "  " ~ opToX87(operations[done+1]) ~ "p ST(1), ST;\n"; 
    485483         } else { 
     
    489487        mainbody ~= next; firstbody ~= next; 
    490488        done +=2; 
    491       } else { // multiply by scalar 
     489      } else { // multiply by scalar. Note that there's an extra item on the stack when we're in the body of the loop. 
    492490        firstbody ~= "  fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - scalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 
    493491        mainbody ~= "  fmul ST, ST(" ~ itoa(1 + numOnStack + numScalarsOnStack - scalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 
     
    501499    char [] next; 
    502500    if (typelist[$-1]=='R') 
    503         next = "  fld real ptr ["  ~ vectorRegister(0) ~ "];\n"; 
     501        next = "  fld real ptr ["  ~ vectorRegister[0] ~ "];\n"; 
    504502    else next = "  fld "  ~ indexedVector(typelist, operations[0]) ~ ";\n"; 
    505503    mainbody ~=next; firstbody~=next; 
     
    516514             if (typelist[$-1]=='R') { 
    517515                 // 80-bit vectors must be loaded onto the FPU stack first 
    518                 mainbody ~= "  fld real ptr ["  ~ vectorRegister(numvecs-1) ~ "];"\n; 
     516                mainbody ~= "  fld real ptr ["  ~ vectorRegister[numvecs-1] ~ "];"\n; 
    519517                mainbody ~= "  " ~ finalop ~ "p ST(1), ST;\n"; 
    520518             } else { 
    521                 mainbody ~= "  " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister(numvecs-1) ~ " + " 
    522                 ~ vectorSize_LEA(typelist[$-1]) ~ "];"\n; 
     519                mainbody ~= "  " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister[numvecs-1] ~ " + " 
     520                ~ vectorSize(typelist[$-1]) ~ "*ESI];"\n; 
    523521            } 
    524522        } 
     
    565563    auto q = Vec([3.5L, 1.1, 3.8]); 
    566564    auto r = Vec([17.0f, 28.1, 1]); 
    567     q -= ((p+r)*18.0L*314.1L - (p-r))* 35; 
     565    q -= ((r+p)*18.0L*314.1L - (p-r))* 35; 
    568566    real d = dot(r, p+r+r); 
    569567    writefln(d); 
    570     p*=2; 
    571 
     568    p = r - q*2.0; 
     569    p*=5.6L; // currently fails 
     570