Changeset 166

Show
Ignore:
Timestamp:
12/21/07 03:17:57 (8 months ago)
Author:
Don Clugston
Message:

X87 now generates correct code for chained assignment.
X87 now does subexpression folding for two cases: subexpr+subexpr and subexpr*subexpr (although the latter won't be used until the front-end enables dot product).

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/blade/BladeDemo.d

    r165 r166  
    3838    
    3939    mixin(vectorize("q+= q*2.01")); 
    40     mixin(vectorize("r-=another[0]")); 
     40    mixin(vectorize("another[0]+=r-=another[0]+another[0]")); 
    4141    
    4242    // All of the next four are equivalent 
    4343    mixin(vectorize("a+=6*another[1,0..$]")); 
    44 /*     
     44    mixin(vectorize("a+=6*(another[1,0..$]+another[1,0..$])")); 
     45 
    4546    mixin(vectorize("a+=6*another[1][0..$]")); 
    4647    mixin(vectorize("a+=6*another[1]")); 
     
    4849    // I don't think I'll support this syntax long-term. 
    4950    mixin(vectorize("a+=6*another[1,[0,$]]")); 
    50 */     
     51     
    5152    // Strided vector 
    5253    mixin(vectorize("another[0..$,1]=6*a[0..2]")); 
  • trunk/blade/BladeSimplify.d

    r165 r166  
    6363    else { 
    6464        char [] expr2 = removeDuplicates(tree); 
    65         // Check for rank errors         
     65        // Check for rank errors 
    6666        int wholerank = exprRank(expr2, ranks); 
    6767        if (wholerank<0) 
  • trunk/blade/CodegenX86.d

    r165 r166  
    1515* FEATURES: 
    1616*  - Supports any mix of vector addition, subtraction, dot product, and multiplication 
    17 *    by a scalar
     17*    by a scalar, with strided vector access
    1818*  - Generates either x87, SSE, or SSE2 asm code. 
    1919*  - If x87 code is generated, 80-bit precision is used whenever possible. 
     
    2323*  None of these support dot product, or matrix operations. 
    2424* X87: 
    25 * The x87 code generation targets early Pentiums, which are now irrelevant. 
    26 * It needs to be updated to PM/Core2 (this will significantly simplify it). 
    2725*  - Not optimal for the case of multiple real vectors (they could share a counter). 
    2826*  - Not optimal for the case where all vectors are 80-bit (two counters are used, but only one is required). 
     
    4947private: 
    5048 
    51 // num chars before we get a comma. 
    52 int paramLength(char [] s) 
    53 { 
    54     for (int i=0; i<s.length; ++i) { 
    55         if (s[i]==',') return i; 
    56     } 
    57     assert(0); 
    58 } 
    59  
    6049// -------------- 
    6150// Ranklist functions 
     
    278267 multiple accumulators, in order to break dependency chains. 
    279268   
    280 The key optimisation rules are: 
     269The key optimisation rules for DAXPY loops are: 
    281270 1. keep the loop overhead to one clock cycle if possible. 
    282271 2. (FMUL latency) don't use the result of a multiply immediately 
    283  3. (FST latency) don't save a value to memory immediately after it's calculated. 
    284  4. (AGI stall) don't use the counter variable immediately after it's modified. 
    285272Techniques to address these are: 
    286273 1. Use EAX as a counter and index variable, which begins negative and counts UP to zero. 
     274    Combine counters for all packed doubles and floats into this single counter. 
    287275 2. The latency of fmul is avoided by swapping fadd/fsub with fmul whenever possible. 
    288  3. The latency of fstp is avoided by calculating a result in one iteration, 
    289      but not storing it to memory until the subsequent iteration. 
    290  4. (NOT YET IMPLEMENTED): first operation in the loop should be loading a scalar (for a multiply), 
    291     if possible, otherwise load an 80-bit vector, if possible. 
    292276 
    293277The generated code is of the form: 
     
    295279 load scalars onto FPU stack 
    296280 load vector pointers into EAX, EBX, ... 
    297  calculate result[0] into ST(0) 
    298  goto L2 
    299281L1: 
    300  calculate result[i+1] into ST(0) 
    301  swap so that result[i] is in ST(0) 
    302 L2: 
    303  store result[i] 
    304  increment pointers, goto L1 if i<n-1 
    305  store result[n-1] 
     282 calculate result into ST(0) 
     283 increment pointers 
     284 goto L1 if not done 
    306285 pop scalars off FPU stack 
    307286---- 
     
    397376    // the final storage instruction, because of the FST latency). 
    398377    char [] mainbody = ""; 
    399 //    char [] firstbody = ""; 
    400 //    char [] storage = ""; 
    401378 
    402379    // We need to keep track of how many things are on the FPU stack. 
     
    423400            ++done; 
    424401            numOnStack++; 
     402      } else if (operations[done]==',') { 
     403          mainbody ~= "  " ~ opToX87[operations[done+1]] ~ " ST, ST(0);    // dup " ~ operations[done+1] ~ \n; 
     404          done+=2;           
    425405      } else if (ranklist[operations[done]-'A']=='1') { 
    426406             // An operation will be performed between the stack top and a vector. 
     
    429409            char [] comment = ";  // " ~ operations[done..done+2] ~ \n; 
    430410            if (operations[done+1]=='=') { 
    431                 next = "  fstp " ~ indexedVector(typelist, ranklist, stridelist, operations[$-2] ) ~ comment; 
     411                // If it's the last operation, pop it from the stack; otherwise, 
     412                // it chains. 
     413                next = ((done+2 == operations.length)? "  fstp " : "  fst ") 
     414                    ~ indexedVector(typelist, ranklist, stridelist, operations[$-2] ) ~ comment;             
    432415            } else if (typelist[operations[done]-'A']=="real") { 
    433416                 // 80-bit vectors must be loaded onto the FPU stack first 
  • trunk/blade/PostfixX86.d

    r159 r166  
    6565            return second ~ first ~ "="; 
    6666        } 
     67        if (second == first) return first ~ "," ~ op; 
    6768 
    6869        // x87 OPTIMISATION #1