Changeset 125

Show
Ignore:
Timestamp:
11/01/07 09:01:25 (10 months ago)
Author:
Don Clugston
Message:

Asm functions now only get the vector length passed once. This results in optimal parameter passing, and also makes naked asm possible, freeing up the EBP register. Bumped version number to 0.4 in response.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/blade/Blade.d

    r124 r125  
    11//  Written in the D programming language 1.0 
    22/** 
    3 * BLADE 0.3Alpha -- Basic Linear Algebra D Expressions 
     3* BLADE 0.4Alpha -- Basic Linear Algebra D Expressions 
    44* 
    55* Generate near-optimal x87/SSE2 asm code for BLAS1 basic vector operations at compile time. 
     
    1717*  - Generates either x87 asm code, SSE2 asm code or pure D, depending on the complexity of 
    1818*    the expression, and the availability of inline asm. 
    19 *  - 80-bit precision is used whenever possible. 
    20 *  - Supports mixed-length operations (eg, real[] + double[] + float[]). 
    21 * 
    22 * TWO STEPS FORWARD, ONE STEP BACK: 
    23 *  The previous version supported additional functionality. This has not yet been 
    24 *  added after refactoring. 
    25 *  - Supports real, complex and imaginary vectors, and detects type mismatches between them. 
    2619*  - When static arrays are used, mismatches in array length are detected 
    2720*    at compile time. 
    28 * 
    29 * BUGS/ FUTURE DIRECTIONS: 
    30 * The x87 code generation targets early Pentiums, which are now irrelevant. 
    31 * It needs to be updated to PM/Core2 (this will significantly simplify it). 
    32 * - Not optimal for the case of multiple real vectors (they could share a counter)
    33 * - Not optimal for the case where all vectors are 80-bit (two counters are used, but only one is required). 
    34 * - Doesn't take full advantage of length being known at compile time (loop unrolling 
    35 *     is possible)
    36 * - Doesn't use EBP register -- this would allow an extra vector in expressions. 
    37 *   (to do this, need naked asm with no stack frame)
     21*  - Error messages refer to the line of user code which generated the error. 
     22*    The library never produces a torrent of undecipherable template error messages. 
     23* 
     24* FUTURE DIRECTIONS (in order of expected implementation): 
     25* - SSE1 code could be used for float operations
     26* - Dot product (which was present in BLADE 0.2). 
     27* - The x87 code generation targets early Pentiums, which are now irrelevant. 
     28* - A 'const folding' (actually vector/scalar folding) step needs to be performed
     29* - Support for strided vectors. 
     30* - Matrix support
    3831* 
    3932* THEORY: 
     
    4841* 0.2 - Support for a wider variety of expressions. Dot product, imaginary numbers, etc. 
    4942* 0.3 - Based on string mixins. Most of the new features of 0.2 are gone, but SSE2 is added. 
     43* 0.4 - Added D code generator. Nice error messages. Optimal parameter passing. 
    5044*/ 
    5145 
     
    5549private import blade.BladeUtil : wrapInQuotes, startsWith; 
    5650private import blade.BladeRank; 
    57 public import blade.CodegenX86 : generateCodeForAsmX87, generateCodeForSSE2, MAX_X87_VECTORS, MAX_SSE_VECTORS; 
     51public import blade.CodegenX86 : generateCodeForAsmX87, generateCodeForSSE, MAX_X87_VECTORS, MAX_SSE_VECTORS; 
    5852 
    5953public: 
     
    6458// we must re-assemble the type information, and use this to generate the asm code. 
    6559 
    66 /** Function to implement BLAS1 operations using SSE2 assembler. 
    67  * Every member of the Values tuple must only be double or double []
     60/** Function to implement BLAS1 operations using SSE/SSE2 assembler. 
     61 * Every member of the Values tuple must only be double or double *
    6862 */ 
    69 void SSEVECGEN(char [] expr, Values...)(Values values) { 
    70     const ranklist = TupleRank!(Values);     
    71     pragma(msg, generateCodeForSSE2(ranklist, expr)); 
    72     mixin(generateCodeForSSE2(ranklist, expr)); 
     63void SSEVECGEN(int SSEVersion, char [] expr, Values...)(int veclength, Values values) { 
     64    const ranklist = TupleRank!(Values); 
     65    pragma(msg, generateCodeForSSE(SSEVersion, ranklist, expr)); 
     66    mixin(generateCodeForSSE(SSEVersion, ranklist, expr)); 
    7367 } 
    7468 
     
    7670 * Every member of the Values tuple must only be real, float[], double [], or real[]. 
    7771 */ 
    78 void X87VECGEN(char [] rawexpr, Values...)(Values values) { 
     72void X87VECGEN(char [] rawexpr, Values...)(int veclength, Values values) { 
    7973    const typelist = elementTupleToString!(Values); 
    8074    const ranklist = TupleRank!(Values); 
     
    161155            else result ~= "double"; // Convert all other scalars into doubles. 
    162156         } 
    163         else if (startsWith(t, "real[")) result ~= ",real[]"; 
    164         else if (startsWith(t, "float[")) result ~= ",float[]"; 
    165         else if (startsWith(t, "double[")) result~= ",double[]"; 
     157        else if (startsWith(t, "real[")) result ~= ",real*"; 
     158        else if (startsWith(t, "float[")) result ~= ",float*"; 
     159        else if (startsWith(t, "double[")) result~= ",double*"; 
    166160        // else error. 
    167161    } 
    168162    result ~= ")("; 
    169     int knt=0; 
    170     for (int i=0; i<tree.symbolTable.length;++i) { 
    171         if (knt>0) result ~=","; 
    172         result ~= tree.symbolTable[i].value; 
    173         ++knt; 
     163    int firstVector = findVectorForLength(tree); 
     164    result ~= tree.symbolTable[firstVector].value ~ ".length"; 
     165    for (int i=0; i<tree.symbolTable.length;++i) { 
     166        result ~="," ~ tree.symbolTable[i].value; 
     167        // for vectors, we only need the pointer, not the length 
     168        if (tree.symbolTable[i].rank==1) result ~= ".ptr"; 
    174169    } 
    175170    return result~ ");";         
     
    181176    char [] result = assertAllVectorLengthsEqual(tree); 
    182177     
    183     result ~= "SSEVECGEN!(" ~ wrapInQuotes(tree.expression);         
     178    result ~= "SSEVECGEN!(2," ~ wrapInQuotes(tree.expression);     
    184179    // For SSE2, everything must be implicitly convertible to double. 
    185180    for (int i=0; i<tree.symbolTable.length;++i) { 
    186181        if (tree.symbolTable[i].rank==0) result ~= ",double"; 
    187         else result ~= ",double[]"; 
     182        else result ~= ",double*"; 
    188183    } 
    189184    result ~= ")("; 
    190     int knt=0; 
    191     for (int i=0; i<tree.symbolTable.length;++i) { 
    192         if (knt>0) result ~=","; 
    193         result ~= tree.symbolTable[i].value; 
    194         ++knt; 
     185    int firstVector = findVectorForLength(tree); 
     186    result ~= tree.symbolTable[firstVector].value ~ ".length"; 
     187 
     188    for (int i=0; i<tree.symbolTable.length;++i) { 
     189        result ~= "," ~ tree.symbolTable[i].value; 
     190        // for vectors, we only need the pointer, not the length 
     191        if (tree.symbolTable[i].rank==1) result ~= ".ptr"; 
    195192    } 
    196193    return result ~ ");"; 
  • trunk/blade/BladeDemo.d

    r123 r125  
    3434     
    3535    writefln("a=", a); 
     36    double c = 1.0i * 2.0i; 
     37    idouble w = cast(idouble)c; 
     38    pragma(msg, typeof(c).stringof); 
    3639} 
  • trunk/blade/BladeRank.d

    r119 r125  
    7272// -------------- 
    7373// Ranklist functions 
    74  
    75 int findFirstVector(int [] ranklist) 
    76 { 
    77     for (int i=0; i< ranklist.length;++i) { 
    78         if (ranklist[i]==1) return i; 
    79     } 
    80     return 0; 
    81 } 
    82  
    8374 
    8475// Count the number of vectors 
  • trunk/blade/CodegenX86.d

    r123 r125  
    11//  Written in the D programming language 1.0 
    22/** 
    3 * BLADE 0.3Alpha -- Basic Linear Algebra D Expressions 
     3* BLADE 0.4Alpha -- Basic Linear Algebra D Expressions 
    44* 
    55* Generate near-optimal x87/SSE2 asm code for BLAS1 basic vector operations at compile time. 
     
    1717*  - Generates either x87 asm code, SSE2 asm code or pure D, depending on the complexity of 
    1818*    the expression, and the availability of inline asm. 
    19 *  - 80-bit precision is used whenever possible. 
     19*  - If x87 code is generated, 80-bit precision is used whenever possible. 
    2020*  - Supports mixed-length operations (eg, real[] + double[] + float[]). 
    2121* 
    22 * TWO STEPS FORWARD, ONE STEP BACK: 
    23 *  The previous version supported additional functionality. This has not yet been 
    24 *  added after refactoring. 
    25 *  - Supports real, complex and imaginary vectors, and detects type mismatches between them. 
    26 *  - When static arrays are used, mismatches in array length are detected 
    27 *    at compile time. 
    2822* 
    2923* BUGS/ FUTURE DIRECTIONS: 
     
    4337* a string containing x87 asm, which is then mixed into a function which accepts the tuple. 
    4438* 
    45 * HISTORY: 
    46 * 0.1 - Used classes to make expression templates. 
    47 * 0.2 - Support for a wider variety of expressions. Dot product, imaginary numbers, etc. 
    48 * 0.3 - Based on string mixins. Most of the new features of 0.2 are gone, but SSE2 is added. 
    4939*/ 
    5040 
     
    380370    for (int i=0; i< ranklist.length;++i) { 
    381371        if (ranklist[i]==1){ 
    382             result~= "  auto vec" ~ itoa(i) ~ " = values[" ~itoa(i) ~"].ptr; // " ~ cast(char)('A'+i)~ \n; 
    383372            if (typelist[i]=="real") { 
    384373                incrementRealVectors ~= "  add " ~ vectorRegister[vecnum] ~ ", " ~ REALSIZE ~ ";\n"; 
    385374            } 
    386375            ++vecnum; 
    387         } else result~= " alias values["~itoa(i)~"] val" ~ itoa(i) ~ "; // " ~ cast(char)('A'+i)~ \n; 
    388     } 
    389  
    390     result ~= "  int veclength = values[" ~itoa(findFirstVector(ranklist)) ~"].length;\n"; 
    391   
     376        } 
     377    } 
     378 
    392379    int numScalarsOnStack=0; 
    393380 
     
    403390      if (ranklist[i]==1) { 
    404391          if (typelist[i]=="real") { 
    405               result ~= "  mov " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"; 
     392              result ~= "  mov " ~ vectorRegister[numvecs] ~ ", values[" ~ itoa(i) ~ "];"; 
    406393          } else  { 
    407394            result ~= "  lea " ~ vectorRegister[numvecs] 
    408395              ~ ", [" ~ vectorSize(typelist[i]) ~ "*EAX];   " 
    409               ~ "  add " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"; 
     396              ~ "  add " ~ vectorRegister[numvecs] ~ ", values[" ~ itoa(i) ~ "];"; 
    410397         } 
    411398         result ~= "  //" ~ cast(char)('A'+i) ~ \n;  
     
    479466        } else { 
    480467            // For scalar float or double values, we can multiply directly, saving one slot on the FP stack. 
    481             next = "  fmul " ~ operandSize(typelist[operations[done]-'A']) ~ "val" ~ itoa(operations[done]-'A') ~"; // * " ~ operations[done..done+1] ~ "\n"; 
     468            next = "  fmul " ~ operandSize(typelist[operations[done]-'A']) ~ "values[" ~ itoa(operations[done]-'A') ~"]; // * " ~ operations[done..done+1] ~ "\n"; 
    482469            mainbody ~= next; firstbody ~= next; 
    483470        } 
     
    522509public: 
    523510 
    524 // We don't need types for SSE2, everything is a double. 
    525  
    526 char [] generateCodeForSSE2(int [] ranklist, char [] infixOperations, char cumulatingOp=0) 
     511/** Generate BLAS1 asm code which is optimal for CPUs with SSE2. 
     512 * We don't need types for SSE2. Everything is a double; vectors are double* 
     513 */ 
     514char [] generateCodeForSSE(int SSEVer, int [] ranklist, char [] infixOperations, char cumulatingOp=0) 
    527515{ 
    528516    char [] operations = makePostfixForSSE(infixOperations, ranklist); 
     
    531519    result ~= "// Operation : " ~  operations ~ \n; 
    532520 
    533     // Create local variables for pointers to vectors (avoid bug #1125) 
    534     // Bad code is also generated for loading scalars. 
    535     int vecnum = 0; 
    536     for (int i=0; i< ranklist.length;++i) { 
    537         if (ranklist[i]==1){ 
    538             result~= "  auto vec" ~ itoa(i) ~ " = values[" ~itoa(i) ~"].ptr; // " ~ cast(char)('A'+i)~ \n; 
    539             ++vecnum; 
    540         } 
    541     } 
    542     result ~= "  int veclength = values[" ~itoa(findFirstVector(ranklist)) ~"].length;\n"; 
    543   
     521    int numvecs = countVectors(ranklist); 
    544522    int numScalarsOnStack=0; 
    545523 
    546     result~= \n"asm {"\n ~ pushRegisters(vecnum); 
     524    result~= \n"asm {"\n ~ pushRegisters(numvecs); 
    547525    // EAX will be the counter 
    548526    result ~= "  mov EAX, veclength;"\n; 
    549527    // Load all the vector pointers into registers 
    550528 
    551     const char [] vectorsize = "8"; // size of a double 
    552     int numvecs=0; 
     529    char [] vectorsize = (SSEVer == 2) ? "8" :"4"; // size of a double 
     530     
     531    int vecregnum = 0; 
    553532    int numconsts=0; 
    554533    for (int i=0; i<ranklist.length; ++i) { 
    555534      if (ranklist[i]==1) { 
    556         result ~= "  lea " ~ vectorRegister[numvecs
     535        result ~= "  lea " ~ vectorRegister[vecregnum
    557536          ~ ", [" ~ vectorsize ~ "*EAX];   " 
    558           ~ "  add " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"; 
     537          ~ "  add " ~ vectorRegister[vecregnum] ~ ", values[" ~ itoa(i) ~ "];"; 
    559538         result ~= "  //" ~ cast(char)('A'+i) ~ \n;  
    560         ++numvecs
     539        ++vecregnum
    561540      } else if (ranklist[i]==0) { 
     541          // load scalar into an XMM register, and then duplicate it into both 
     542          // halves of the register using shufpd. 
    562543          result ~= "  movsd " ~ XMM(numconsts) ~ ", values["~ itoa(i) ~"]; " 
    563 //          result ~= "  movsd " ~ XMM(numconsts) ~ ", double ptr val"~ itoa(i) ~"; " 
    564544            "  shufpd " ~ XMM(numconsts) ~", " ~ XMM(numconsts) ~ ",0; //" ~ cast(char)('A'+i) ~ \n; 
    565545          ++numconsts; 
     
    614594        ~ "L1:\n" ~ mainbody; 
    615595    result ~= "  add EAX,2;\n" ~ "  js L1;\n"; 
    616     result~= "L3:" \n ~ popRegisters(vecnum) ~ "}\r\n"; 
     596    result~= "L3:" \n ~ popRegisters(numvecs) ~ "}\r\n"; 
    617597    
    618598    return result;