Show
Ignore:
Timestamp:
04/30/08 16:05:32 (5 months ago)
Author:
Don Clugston
Message:

Added prod(). Use .ptr to get raw data, so it works with Bill Baxter's ArrayView.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/blade/CodegenX86.d

    r172 r187  
    11//  Written in the D programming language 1.0 
    22/** 
    3 * BLADE 0.4Alpha -- Basic Linear Algebra D Expressions 
     3* BLADE Alpha -- Basic Linear Algebra D Expressions 
    44* 
    55* Generate near-optimal x87/SSE/SSE2 asm code for BLAS1 basic vector operations 
     
    2121* 
    2222* BUGS/ FUTURE DIRECTIONS: 
    23 *  None of these support dot product, or matrix operations. 
     23*  None of these support matrix operations. 
    2424* X87: 
    2525*  - Not optimal for the case of multiple real vectors (they could share a counter). 
     
    3030*   (to do this, need naked asm with no stack frame). 
    3131* SSE/SSE2: 
    32 *  - SSE functions don't support unaligned data. Need to generate seperate code  
     32*  - SSE functions don't support unaligned data. Need to generate separate code 
    3333*    for that case (NOTE: probably only worth doing for small expressions). 
    3434* 
     
    191191// (max # temporaries + max # real scalars) must be <=8, otherwise FPU stack 
    192192// will overflow). 
    193 const int MAX_87_REALSCALARSPLUSTEMPORARIES = 8;  
     193const int MAX_87_REALSCALARSPLUSTEMPORARIES = 8; 
    194194 
    195195private: 
     
    213213// indexed by i. 
    214214char [] indexedVector(char [][] typelist, char [] ranklist, char [] stridelist, char var) 
    215 {     
     215{ 
    216216    if (typelist[var-'A']=="real") return " real ptr [" ~ vectorRegister[vectorNum(ranklist, var)] ~ "]"; 
    217217    else if (stridelist[var-'A']=='1') return operandSize(typelist[var-'A']) ~ "[" ~ vectorRegister[vectorNum(ranklist, var)] ~ "]"; 
     
    264264 (Pentium, PMMX, PII, PIII). It is also optimal for recent x86 CPUs 
    265265 where vector sizes are mixed. 
    266   
     266 
    267267 There are two cases: 
    268268 (A) DAXPY-style loops, where every element is independent of the other indices; 
     
    271271 For cumulative loops, best performance is achieved with loop unrolling and 
    272272 multiple accumulators, in order to break dependency chains. 
    273    
     273 
    274274The key optimisation rules for DAXPY loops are: 
    275275 1. keep the loop overhead to one clock cycle if possible. 
     
    304304            ranklist~="1"; 
    305305            typelist ~= typeof(T[0]).stringof; 
    306         } else static if (is(typeof(T.data))) {             
     306        } else static if (is(typeof(T.data))) { 
    307307            stridelist~="1"; 
    308308            ranklist~="1"; 
     
    323323    char [] result=""; 
    324324    char [] incrementRealVectors=""; 
    325      
     325 
    326326    result ~= "// Operation : " ~  operations ~ \n; 
    327     
     327 
    328328    // Create local variables for pointers to vectors (avoid bug #1125) 
    329329 
     
    361361              ~ "  add " ~ vectorRegister[numvecs] ~ ", values[" ~ itoa(i) ~ "];"; 
    362362         } 
    363          result ~= "  //" ~ cast(char)('A'+i) ~ \n;  
     363         result ~= "  //" ~ cast(char)('A'+i) ~ \n; 
    364364        ++numvecs; 
    365365      } else if (typelist[i]=="real") { 
     
    367367          ++numconsts; 
    368368          ++numScalarsOnStack; 
    369          result ~= "  //" ~ cast(char)('A'+i) ~ \n;  
     369         result ~= "  //" ~ cast(char)('A'+i) ~ \n; 
    370370      } 
    371371    } 
     
    376376    int numOnStack = 0; // How much of the FP stack is being used? 
    377377 
    378     bool isCumulative = (operations[0]=='0'); 
     378    bool isCumulative = (operations[0]=='0' || operations[0]=='1'); 
    379379    if (operations[0]=='0') { 
    380380        result ~= "  fldz;"\n; // dot product 
    381381        ++numOnStack; 
    382382        done = 1; 
     383    } else if (operations[0]=='1') { 
     384        result ~= "  fld1;"\n; // prod 
     385        ++numOnStack; 
     386        done = 1; 
    383387    } 
    384388    result ~= "  xor EAX, EAX; "\n 
     
    389393    // the final storage instruction, because of the FST latency). 
    390394    char [] mainbody = ""; 
    391              
     395 
    392396    while(done<operations.length) { 
    393397        char [] next; 
     
    420424        } else if (operations[done]==',') { 
    421425            mainbody ~= "  " ~ opToX87[operations[done+1]] ~ " ST, ST(0);    // dup " ~ operations[done+1] ~ \n; 
    422             done+=2;           
     426            done+=2; 
    423427        } else if (ranklist[operations[done]-'A']=='1') { 
    424428             // An operation will be performed between the stack top and a vector. 
     
    430434                // it chains. 
    431435                next = ((done+2 == operations.length)? "  fstp " : "  fst ") 
    432                     ~ indexedVector(typelist, ranklist, stridelist, operations[$-2] ) ~ comment;             
     436                    ~ indexedVector(typelist, ranklist, stridelist, operations[$-2] ) ~ comment; 
    433437            } else if (typelist[operations[done]-'A']=="real") { 
    434438                 // 80-bit vectors must be loaded onto the FPU stack first 
     
    445449            // Multiply by real scalar, which is already on the stack. 
    446450            next = "  fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - realScalarNum(typelist, ranklist, operations[done]-'A')-1) ~ "); // * " ~ operations[done] ~ \n; 
    447             mainbody ~= next;             
     451            mainbody ~= next; 
    448452          } else { 
    449453            // For scalar float or double values, we can multiply directly, saving one slot on the FP stack. 
     
    452456          } 
    453457            done +=2; 
    454         }       
    455     } 
    456          
    457     result ~= \n  
    458         ~ "  align 4;\n"  
     458        } 
     459    } 
     460 
     461    result ~= \n 
     462        ~ "  align 4;\n" 
    459463        ~ "L1:\n" ~ mainbody; 
    460          
     464 
    461465//    if (cumulatingOp) result ~= "  " ~ opToX87[cumulatingOp] ~ "p ST(2), ST;"\n; 
    462466 
     
    472476 
    473477    result~= "L3:" \n ~ popRegisters(vecnum) ~ "}\r\n"; 
    474     
     478 
    475479    return result; 
    476480} 
     
    502506// Note: If SSE4 is available, the dppd and dpps instructions could be 
    503507// used to replace the final *+ in dot-product operations. This would allow 
    504 // an in-order dot product to be performed, but doesn't give much speed 
     508// an in-order dot product to be performed, but otherwise doesn't give much speed 
    505509// improvement. 
    506510 
     
    511515{ 
    512516    char [] result=""; 
    513      
     517 
    514518    result ~= "// Operation : " ~  operations ~ \n; 
    515519 
    516520    int numvecs = countVectors(ranklist); 
    517521    int numScalarsOnStack=0; 
    518     bool isCumulative = (operations[0]=='0')
     522    bool isCumulative = (operations[0]=='0') || (operations[0]=='1')
    519523    if (isCumulative) result ~= (usingDoubles? "  double" : "  float") ~" sum;"\n; 
    520524 
     
    526530    char [] vectorsize = usingDoubles? "8" :"4"; // size of a double 
    527531    char [] suffix = usingDoubles? "d " :"s "; 
    528      
     532 
    529533    int vecregnum = 0; 
    530534    int numconsts=0; 
     
    534538          ~ ", [" ~ vectorsize ~ "*EAX];   " 
    535539          ~ "  add " ~ vectorRegister[vecregnum] ~ ", values[" ~ itoa(i) ~ "];"; 
    536          result ~= "  //" ~ cast(char)('A'+i) ~ \n;  
     540         result ~= "  //" ~ cast(char)('A'+i) ~ \n; 
    537541        ++vecregnum; 
    538542      } else if (ranklist[i]==0) { 
    539543          // load scalar into an XMM register, and then duplicate it into both 
    540           // halves (or quarters) of the register using shufpd.           
     544          // halves (or quarters) of the register using shufpd. 
    541545          result ~= "  movs" ~ suffix ~ XMM(numconsts) ~ ", values[" ~ itoa(i) ~ "]; " 
    542546            "  shufp" ~ suffix ~ XMM(numconsts) ~", " ~ XMM(numconsts) ~ ",0; //" ~ cast(char)('A'+i) ~ \n; 
     
    553557    // reusing the same registers, because the CPUs which support SSE 
    554558    // also have extensive support for register renaming. 
    555      
     559 
    556560    int numOnStack = numScalarsOnStack; // How much of the FP stack is being used? 
    557561    int done=0; 
    558562    if (operations[0]=='0') { 
    559563        result ~= "  pxor " ~ XMM(numOnStack) ~ "," ~ XMM(numOnStack) ~ ";  // 0\n"; 
     564        ++numOnStack; 
     565        ++done; 
     566    } else if (operations[0]=='1') { 
     567        result ~= "  movap" ~ suffix ~ XMM(numOnStack) ~ ", SSE_ONEp" ~ suffix ~ ";  // 1\n"; 
    560568        ++numOnStack; 
    561569        ++done; 
     
    599607                extra ~= "  " ~ opToSSESingle[operations[done+1]] ~ suffix ~ " " ~ XMM(numOnStack-1) ~ ", " 
    600608                    ~ XMM(numOnStack-1) ~ comment; 
    601                 done +=2;           
     609                done +=2; 
    602610        } else if (ranklist[operations[done]-'A']=='1') { 
    603611             // An operation will be performed between the stack top and a vector. 
     
    619627            extra ~= "  " ~ opToSSESingle[operations[done+1]] ~ suffix ~ XMM(numOnStack-1) ~ ", " ~ XMM(u) ~ comment; 
    620628            done +=2; 
    621         }       
    622     } 
    623          
    624     result ~= \n  
    625         ~ "  align 16;\n"  
     629        } 
     630    } 
     631 
     632    result ~= \n 
     633        ~ "  align 16;\n" 
    626634        ~ "L1:\n" ~ mainbody; 
    627635    if (usingDoubles) { 
     
    640648    if (isCumulative) { 
    641649        // Result is now in XMM(numScalarsOnStack). We need to do a horizontal 
    642         // add to get the final sum. 
     650        // add or multiply to get the final sum or product. 
     651        char [] cumInstr = operations[0]=='0' ? "  add" : "  mul"; 
    643652        if (usingDoubles) { 
    644653            // For SSE3, use   haddpd XMM(numScalarsOnStack). 
    645654            result ~= "  movhlps " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ";"\n 
    646             ~ "  addsd "  ~ XMM(numScalarsOnStack) ~ "," ~  XMM(numScalarsOnStack+1) ~ ";\n";            
     655            ~ cumInstr ~ "sd "  ~ XMM(numScalarsOnStack) ~ "," ~  XMM(numScalarsOnStack+1) ~ ";\n"; 
     656 
     657 
    647658        } else { // floats 
    648659            result ~= "  movhlps " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ";"\n 
    649             ~ "  addps "  ~ XMM(numScalarsOnStack) ~ "," ~  XMM(numScalarsOnStack+1) ~ ";\n" 
     660            ~ cumInstr ~ "ps "  ~ XMM(numScalarsOnStack) ~ "," ~  XMM(numScalarsOnStack+1) ~ ";\n" 
    650661            ~ "  pshufd " ~ XMM(numScalarsOnStack+1) ~ "," ~ XMM(numScalarsOnStack) ~ ",1;"\n 
    651             ~ "  addss "  ~ XMM(numScalarsOnStack) ~ "," ~  XMM(numScalarsOnStack+1) ~ ";\n"; 
     662            ~ cumInstr ~ "ss "  ~ XMM(numScalarsOnStack) ~ "," ~  XMM(numScalarsOnStack+1) ~ ";\n"; 
    652663        } 
    653664        result ~= "  movs" ~ suffix ~ " sum," ~ XMM(numScalarsOnStack) ~ ";"\n; 
     
    656667    result ~= popRegisters(numvecs) ~ "}\n"; 
    657668    if (isCumulative) result ~= "  return sum;"\n; 
    658     
     669 
    659670    return result; 
    660671}