Changeset 126

Show
Ignore:
Timestamp:
11/02/07 03:08:28 (10 months ago)
Author:
Don Clugston
Message:

Added SSE1 support.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/blade/Blade.d

    r125 r126  
    1515*  - Supports any mix of vector addition, subtraction, dot product, and multiplication 
    1616*    by a scalar. 
    17 *  - Generates either x87 asm code, SSE2 asm code or pure D, depending on the complexity of 
    18 *    the expression, and the availability of inline asm. 
     17*  - Generates either x87 asm code, SSE or SSE2 asm code or pure D, depending on 
     18*    the complexity of the expression, and the availability of inline asm. 
    1919*  - When static arrays are used, mismatches in array length are detected 
    2020*    at compile time. 
     
    2323* 
    2424* FUTURE DIRECTIONS (in order of expected implementation): 
    25 * - SSE1 code could be used for float operations. 
    2625* - Dot product (which was present in BLADE 0.2). 
    2726* - The x87 code generation targets early Pentiums, which are now irrelevant. 
     
    171170} 
    172171 
    173 /// Generate code which will call the SSE2 function 
    174 char [] invokeSSE2(AbstractSyntaxTree tree) 
     172/// Generate code which will call the SSE/SSE2 code generation function 
     173char [] invokeSSE(bool SSE2, AbstractSyntaxTree tree) 
    175174{ 
    176175    char [] result = assertAllVectorLengthsEqual(tree); 
     176    result ~= assertAllVectorsAlign128(tree); 
    177177     
    178     result ~= "SSEVECGEN!(2," ~ wrapInQuotes(tree.expression);     
     178    result ~= "SSEVECGEN!(" ~ (SSE2?"2":"1") ~ "," ~ wrapInQuotes(tree.expression);     
    179179    // For SSE2, everything must be implicitly convertible to double; for SSE1, to float. 
    180180    for (int i=0; i<tree.symbolTable.length;++i) { 
    181         if (tree.symbolTable[i].rank==0) result ~= ",double"; 
    182         else result ~= ",double*"; 
     181        if (SSE2) { 
     182            if (tree.symbolTable[i].rank==0) result ~= ",double"; 
     183            else result ~= ",double*"; 
     184        } else { 
     185            if (tree.symbolTable[i].rank==0) result ~= ",float"; 
     186            else result ~= ",float*"; 
     187        } 
    183188    } 
    184189    result ~= ")("; 
     
    219224} 
    220225 
     226char [] assertAllVectorsAlign128(AbstractSyntaxTree tree) 
     227{ 
     228    char [] result =""; 
     229    for (int i=0; i<tree.symbolTable.length;++i) { 
     230        if (tree.symbolTable[i].rank==1){ 
     231            result ~= "assert( (cast(size_t)(" ~ tree.symbolTable[i].value  
     232                    ~ ".ptr)& 0x1F) == 0, `SSE Vector misalignment: " ~ tree.symbolTable[i].value ~ "`);"\n; 
     233        } 
     234    } 
     235    return result; 
     236} 
     237 
    221238// Return true if the type has a length which is known at compile time 
    222239bool arrayLengthIsStatic(char [] type) 
     
    263280{ 
    264281    VecExpressionType exprType = categorizeExpression(tree); 
    265     if (exprType == VecExpressionType.SSE2Expression) { 
    266         return invokeSSE2(tree); 
     282    if (exprType == VecExpressionType.SSE2Expression || exprType == VecExpressionType.SSE1Expression) { 
     283        return invokeSSE((exprType == VecExpressionType.SSE2Expression), tree); 
    267284    } else if (exprType == VecExpressionType.X87Expression) { 
    268285        return invokeX87(tree); 
  • trunk/blade/BladeDemo.d

    r125 r126  
    1010 
    1111// Unfortunately static arrays in D aren't aligned to 128-bit boundaries yet. 
    12 // This causes segfaults if trying to use SSE. For now, only use heap arrays 
    13 // with SSE/SSE2. 
     12// In such cases, the library generates an 'SSE misalignment' assert error, 
     13// to avoid segfaults. 
     14// For now, only use heap arrays with SSE/SSE2. 
    1415     
    1516void main() 
     
    2223    d[0..$] = [17.0, 28.25, 1, 56.2];; 
    2324    auto w2=z.ptr; 
    24     float[] q = [17.0f, 28.25, 1, 0]; 
     25    float[] q = new float[4]; 
     26    q[0..$]= [17.0f, 28.25, 1, 0]; 
    2527    float [4] r; 
    2628    idouble [] p = [2.3i, 254i, 0.1i, 1.2i]; 
     
    3234    mixin(vectorize(" a   += d*2.01")); 
    3335    mixin(vectorize(" a   += r*2.01")); 
     36    mixin(vectorize(" q   += q*2.01")); 
    3437     
    3538    writefln("a=", a); 
    36     double c = 1.0i * 2.0i; 
    37     idouble w = cast(idouble)c; 
    38     pragma(msg, typeof(c).stringof); 
    3939} 
  • trunk/blade/CodegenX86.d

    r125 r126  
    33* BLADE 0.4Alpha -- Basic Linear Algebra D Expressions 
    44* 
    5 * Generate near-optimal x87/SSE2 asm code for BLAS1 basic vector operations at compile time. 
     5* Generate near-optimal x87/SSE/SSE2 asm code for BLAS1 basic vector operations 
     6* at compile time. 
    67* 32, 64 and 80 bit vectors are all supported. 
    78* Uses techniques described in Agner Fog's superb Pentium optimisation manual (www.agner.org). 
     
    1516*  - Supports any mix of vector addition, subtraction, dot product, and multiplication 
    1617*    by a scalar. 
    17 *  - Generates either x87 asm code, SSE2 asm code or pure D, depending on the complexity of 
    18 *    the expression, and the availability of inline asm. 
     18*  - Generates either x87, SSE, or SSE2 asm code. 
    1919*  - If x87 code is generated, 80-bit precision is used whenever possible. 
    2020*  - Supports mixed-length operations (eg, real[] + double[] + float[]). 
    2121* 
    22 * 
    2322* BUGS/ FUTURE DIRECTIONS: 
     23*  None of these support dot product, or matrix operations. 
     24* X87: 
    2425* The x87 code generation targets early Pentiums, which are now irrelevant. 
    2526* It needs to be updated to PM/Core2 (this will significantly simplify it). 
     
    3031*  - Doesn't use EBP register -- this would allow an extra vector in expressions. 
    3132*   (to do this, need naked asm with no stack frame). 
     33* SSE/SSE2: 
     34*  - SSE functions don't support unaligned data. Need to generate separate code  
     35*    for that case (NOTE: probably only worth doing for small expressions). 
    3236* 
    3337* THEORY: 
     
    230234    return ['*':"fmul"[], '+': "fadd", '-': "fsub", '_': "fsubr"]; } 
    231235 
    232 char [][char] opToSSE2() { 
    233     return ['*':"mulpd"[], '+': "addpd", '-': "subpd", '/': "divpd"]; } 
    234  
    235236char [][char] opToSSE() { 
    236     return ['*':"mulps"[], '+': "addps", '-': "subps", '/': "divps"]; } 
    237  
     237    return ['*':"mulp"[], '+': "addp", '-': "subp", '/': "divp"]; } 
     238 
     239char [][char] opToSSESingle() { 
     240    return ['*':"muls"[], '+': "adds", '-': "subs", '/': "divs"]; } 
    238241 
    239242static if (real.sizeof==10)      const char [] REALSIZE = "10"; 
     
    298301} 
    299302 
    300 char [] indexedSSEVector(int [] ranklist, char var) 
    301 
    302     return "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ " + 8*EAX]"; 
     303char [] indexedSSEVector(int [] ranklist, char var, char [] vecsize) 
     304
     305    return "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ " + " ~ vecsize ~"*EAX]"; 
     306
     307 
     308char [] indexedSSENext(int [] ranklist, char var, char [] vecsize) 
     309
     310    return "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ " + " ~ vecsize ~"*EAX+16]"; 
    303311} 
    304312 
     
    509517public: 
    510518 
    511 /** Generate BLAS1 asm code which is optimal for CPUs with SSE2. 
    512  * We don't need types for SSE2. Everything is a double; vectors are double* 
     519/** Generate BLAS1 asm code using SSE or SSE2. 
     520 * For SSE2, all scalars are double, vectors are double*; for SSE1, all are float. 
     521 * At entry, all vector parameters are aligned. 
    513522 */ 
    514523char [] generateCodeForSSE(int SSEVer, int [] ranklist, char [] infixOperations, char cumulatingOp=0) 
     
    528537 
    529538    char [] vectorsize = (SSEVer == 2) ? "8" :"4"; // size of a double 
     539    char [] suffix = (SSEVer == 2) ? "d " :"s "; 
    530540     
    531541    int vecregnum = 0; 
     
    540550      } else if (ranklist[i]==0) { 
    541551          // load scalar into an XMM register, and then duplicate it into both 
    542           // halves of the register using shufpd. 
    543           result ~= "  movsd " ~ XMM(numconsts) ~ ", values["~ itoa(i) ~"]; " 
    544             "  shufpd " ~ XMM(numconsts) ~", " ~ XMM(numconsts) ~ ",0; //" ~ cast(char)('A'+i) ~ \n; 
     552          // halves (or quarters) of the register using shufpd (SSE2) or shufps (SSE1).           
     553          result ~= "  movs" ~ suffix ~ XMM(numconsts) ~ ", values[" ~ itoa(i) ~ "]; " 
     554            "  shufp" ~ suffix ~ XMM(numconsts) ~", " ~ XMM(numconsts) ~ ",0; //" ~ cast(char)('A'+i) ~ \n; 
    545555          ++numconsts; 
    546556          ++numScalarsOnStack; 
     
    549559    result ~= "  xor EAX, EAX; "\n 
    550560        "  sub EAX, veclength; // counter=-length"\n 
    551         "  jz short L3; // test for length==0"\n; 
     561        "  jz short L2; // test for length==0"\n; 
    552562    int done=0; 
    553563 
    554564    char [] mainbody = ""; 
     565    char [] extra = ""; // for the extra length%4 calculations 
    555566 
    556567    // The SSE implementation mimics the x87 version. Instead of keeping track of 
     
    561572    int numOnStack = numScalarsOnStack; // How much of the FP stack is being used? 
    562573    while(done<operations.length) { 
    563         char [] next; 
     574      char [] comment; 
    564575      if (isInstruction(operations[done])) { 
    565576            // Perform an arithmetic operation on the top two items. 
    566             next = "  " ~ opToSSE2[operations[done]] ~ XMM(numOnStack-1) ~ ", " ~ XMM(numOnStack) ~ ";  //" ~ operations[done] ~ \n; 
    567             mainbody ~= next; 
     577            comment = ";  //" ~ operations[done] ~ \n; 
     578            mainbody ~= "  " ~ opToSSE[operations[done]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(numOnStack) ~ comment; 
     579            extra ~= "  " ~ opToSSESingle[operations[done]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(numOnStack) ~ comment; 
    568580            ++done; 
    569581            numOnStack--; 
    570582      } else if (!isInstruction(operations[done+1])){ 
    571583            // load a vector onto the FPU stack, to begin a new subexpression. 
    572             int u  = operations[done]-'A'
    573             next = "  movapd " ~ XMM(numOnStack) ~ ", " ~ indexedSSEVector(ranklist, operations[done] ) ~ ";  // " ~ operations[done..done+1] ~ \n
    574             mainbody ~= next; 
     584            comment = ";  // " ~ operations[done] ~ \n
     585            mainbody ~= "  movap" ~ suffix ~ XMM(numOnStack) ~ ", " ~ indexedSSEVector(ranklist, operations[done], vectorsize) ~ comment
     586            extra ~= "  movs" ~ suffix ~ XMM(numOnStack) ~ ", " ~ indexedSSENext(ranklist, operations[done], vectorsize) ~ comment; 
    575587            ++done; 
    576588            numOnStack++; 
    577589      } else if (ranklist[operations[done]-'A']==1) { 
    578590             // An operation will be performed between the stack top and a vector. 
    579              // If it's a float or double, we can combine the load+arithmetic op 
    580              // into a single instruction. 
    581             if (operations[done+1]=='=') mainbody ~= "  movapd " ~ indexedSSEVector(ranklist, operations[$-2] ) ~ ", XMM" ~ itoa(numOnStack-1) ~";  // " ~ operations[$-2..$] ~ \n; 
    582             else mainbody ~= "  " ~ opToSSE2[operations[done+1]] ~ " " ~ XMM(numOnStack-1) ~ ", " 
    583               ~ indexedSSEVector(ranklist, operations[done] ) ~ "; // " ~ operations[done..done+2] ~ \n; 
     591            comment = ";  // " ~ operations[done..done+2] ~ \n; 
     592            if (operations[done+1]=='=') { 
     593                 mainbody ~= "  movap" ~ suffix ~ indexedSSEVector(ranklist, operations[$-2], vectorsize) ~ ", XMM" ~ itoa(numOnStack-1) ~ comment; 
     594                 extra ~= "  movs" ~ suffix ~ indexedSSENext(ranklist, operations[$-2], vectorsize) ~ ", XMM" ~ itoa(numOnStack-1) ~ comment; 
     595            } else { 
     596                mainbody ~= "  " ~ opToSSE[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " 
     597                    ~ indexedSSEVector(ranklist, operations[done], vectorsize) ~ comment; 
     598                extra ~= "  " ~ opToSSESingle[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " 
     599                    ~ indexedSSENext(ranklist, operations[done], vectorsize) ~ comment; 
     600            } 
    584601            done+=2; 
    585602      } else { // multiply by scalar. 
    586             next = "  " ~ opToSSE2[operations[done+1]] ~ " " ~ XMM(numOnStack-1) ~ ", " ~ XMM(scalarNum(ranklist, operations[done]-'A')) ~"; // " ~operations[done..done+2] ~ \n; 
    587             mainbody ~= next;        
    588         done +=2; 
     603            comment = "; // " ~operations[done..done+2] ~ \n; 
     604            int u = scalarNum(ranklist, operations[done]-'A'); 
     605            mainbody ~= "  " ~ opToSSE[operations[done+1]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(u) ~ comment; 
     606            extra ~= "  " ~ opToSSESingle[operations[done+1]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(u) ~ comment; 
     607            done +=2; 
    589608      }       
    590609    } 
     
    593612        ~ "  align 16;\n"  
    594613        ~ "L1:\n" ~ mainbody; 
    595     result ~= "  add EAX,2;\n" ~ "  js L1;\n"; 
    596     result~= "L3:" \n ~ popRegisters(numvecs) ~ "}\r\n"; 
     614    if (SSEVer == 2) { 
     615        result ~= "  add EAX,2;\n  js L1;\n" 
     616             ~ "L2:\n  sub EAX, 2;\n  jns L4;\n" 
     617            // Now the calculations for the final double, if any. 
     618             ~ extra; 
     619    } else { 
     620        result ~= "  add EAX,4;\n" ~ "  js L1;\n" 
     621            ~ "L2:\n  sub EAX, 4;\n  jns L4;\n" 
     622        // Now the extra calculations for the 0-3 float, or 0-1 doubles 
     623            ~ "L3:"\n ~ extra 
     624            ~ "  add EAX,1;\n  js L3;\n"; 
     625    } 
     626    result~= "L4:" \n ~ popRegisters(numvecs) ~ "}\n"; 
    597627    
    598628    return result;