Changeset 100

Show
Ignore:
Timestamp:
04/14/07 15:51:58 (1 year ago)
Author:
Don Clugston
Message:

Moved parsing and codegeneration steps into different modules.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/blade/Blade.d

    r99 r100  
    4747*/ 
    4848module Blade; 
     49import BladeParse; 
     50import CodegenX87 : isX87AsmPossible, isSSE2AsmPossible, 
     51    generateCodeForAsmX87, generateCodeForAsmSSE2; 
     52 
    4953import std.stdio; 
    5054import std.string; 
     
    5862    alias X MakeTuple; 
    5963} 
    60  
    61 char [] itoa(T)(T x) 
    62 { 
    63     char [] s=""; 
    64     static if (is(T==byte)||is(T==short)||is(T==int)||is(T==long)) { 
    65         if (x<0) { 
    66             s = "-"; 
    67             x = -x; 
    68         } 
    69     } 
    70     do { 
    71         s = cast(char)('0' + (x%10)) ~ s; 
    72         x/=10; 
    73     } while (x>0); 
    74     return s; 
    75 } 
    76  
    77  
    7864 
    7965// ------------------------------------------- 
     
    216202 
    217203/** This is the only place where any code is generated. 
    218  * Firstly, runtime checks are made of vector lengths. 
    219204 */ 
    220205ReturnType performOperation(ReturnType, char [] operations, char [] finaloperation, int knownlength, X...)(X expr) 
    221206{ 
    222   const char [] tupstr = vectorTupleToString!(X); 
    223   mixin(generateVectorLengthChecks(tupstr)); 
    224           version(BladeDebug) { 
    225             pragma(msg, finaloperation ~ " " ~ operations ~ 
    226                 "\nPostfix: " ~ makePostfixForX87(operations, tupstr) ~ "\nTuple: " ~ tupstr); 
    227             static if (knownlength!=0) pragma(msg, "Length is known!"); 
    228   static if (isSSE2AsmPossible(tupstr, operations)) { 
    229             const char [] q1 = generateCodeForAsmSSE2(knownlength, tupstr, makePostfixForX87(operations, tupstr), finaloperation); 
    230       pragma(msg, "Generated SSE2 code: "\n ~ q1); 
    231   } else static if (isX87AsmPossible(tupstr, operations)) { 
    232  
    233             const char [] qqq = generateCodeForAsmX87(knownlength, tupstr, makePostfixForX87(operations, tupstr), finaloperation); 
    234             pragma(msg, "Generated x87 code:"\n ~ qqq); 
    235         } else pragma(msg, "Too complicated for x87 -- generating D code instead"); 
    236           } 
    237  
     207  const char [] tuplestr = vectorTupleToString!(X); 
     208  const char [] rettypestr="real"; 
     209  static if (is(RetType: ireal)){ rettypestr="ireal";} 
     210 
     211 // Firstly, runtime checks are made of vector lengths. 
     212  mixin(generateVectorLengthChecks(tuplestr)); 
     213 
     214  version(BladeDebug) { 
     215    pragma(msg, finaloperation ~ " " ~ operations ~ "\nTuple: " ~ tuplestr); 
     216    static if (knownlength!=0) pragma(msg, "Length is known!"); 
     217    pragma(msg, "Generated Code: "\n ~ generateCode(rettypestr, tuplestr, operations, finaloperation, knownlength)); 
     218  } 
     219  mixin(generateCode(rettypestr, tuplestr, operations, finaloperation, knownlength)); 
     220
     221 
     222char [] generateCode(char [] rettypestr, char [] tuplestr, char [] operations, char [] finaloperation, int knownlength) 
     223
     224    char [] a=""; 
    238225  // Decide which code generator to use, based on expression complexity and 
    239226  // assembler availability. 
    240   static if (isSSE2AsmPossible(tupstr, operations)) { 
    241      mixin(generateCodeForAsmSSE2(knownlength, tupstr, makePostfixForX87(operations, tupstr), finaloperation)); 
    242   } else static if (isX87AsmPossible(tupstr, operations)) { 
    243      mixin(generateCodeForAsmX87(knownlength, tupstr, makePostfixForX87(operations, tupstr), finaloperation)); 
     227  if (isSSE2AsmPossible(tuplestr, operations)) { 
     228      return generateCodeForAsmSSE2(knownlength, tuplestr, operations, finaloperation); 
     229  } else if (isX87AsmPossible(tuplestr, operations)) { 
     230      return generateCodeForAsmX87(knownlength, tuplestr, operations, finaloperation); 
    244231  } else { 
    245      mixin(generateCodeForD!(ReturnType)(knownlength, tupstr, finaloperation,operations)); 
     232      return generateCodeForD(rettypestr, knownlength, tuplestr, finaloperation, operations); 
    246233  } 
    247234} 
     235 
    248236 
    249237// ------------------------------------------------ 
     
    277265    static if (X.length==1) const char [] vectorTupleToString = singleTypeToString!(X[0]); 
    278266    else  const char [] vectorTupleToString = singleTypeToString!(X[0]) ~ vectorTupleToString!(X[1..$]); 
    279 } 
    280  
    281 // And some functions to grab information from the typestring. 
    282  
    283 // Return the first index in the tuple which is of vector type 
    284 int findFirstVector(char [] typelist) 
    285 { 
    286     for (int i=0; i< typelist.length;++i) { 
    287         if (isVector(typelist[i])) return i; 
    288     } 
    289     assert(0, typelist); 
    290 } 
    291  
    292 bool isVector(char var) 
    293 { 
    294     return (var=='R' || var=='D' || var=='F' || var=='Z'); 
    295267} 
    296268 
     
    317289 * 
    318290 */ 
    319 char [] generateCodeForD(RetType)(int knownlength, char [] typelist, char [] finalop, char [] operation) 
     291char [] generateCodeForD(char [] rettypestr, int knownlength, char [] typelist, char [] finalop, char [] operation) 
    320292{ 
    321293    char [] iter=""; 
     
    333305    else result = " int length = expr[" ~ itoa(findFirstVector(typelist))~ "].length;\n"; 
    334306    if (finalop == ".") { 
    335         static if ( is(RetType: ireal)) result ~= " ireal sum=0.0i;\n"; 
     307        if ( rettypestr=="ireal") result ~= " ireal sum=0.0i;\n"; 
    336308                                   else result ~= " real sum=0.0;\n"; 
    337309        result ~= " for (int i=0; i<length; ++i) {\n" 
     
    341313            ~ "    expr[" ~ itoa(typelist.length-1L)~"][i]" ~ finalop ~ iter ~";\n }\n"; 
    342314    } 
    343     return result; 
    344 } 
    345  
    346 // ------------------------------------------------ 
    347 //   PART 4 -- Convert expression to postfix form 
    348 // ------------------------------------------------ 
    349  
    350 // return the length of a sub-expression 
    351 int exprLength(char [] s) 
    352 { 
    353     if (s[0]=='#') return 1; 
    354     int numParens=0; 
    355     for (int i=0; i<s.length; ++i) { 
    356         if (s[i]=='(') { numParens++; } 
    357         if (s[i]==')') { numParens--; } 
    358         if (numParens == 0) { return i; } 
    359     } 
    360 } 
    361  
    362 // Converts an infix string into postfix. Also strips off the # symbols. 
    363 // Apply x87-specific optimisations during the conversion. 
    364 char [] makePostfixForX87(char [] operations, char [] typelist) 
    365 { 
    366 //    if (operations.length==1) return operations; 
    367     if (operations.length==2 && operations[0]=='#') return operations[1..$]; 
    368  
    369     int x = exprLength(operations); 
    370     char [] first = operations[0..x+1]; 
    371     char [] second = operations[x+2..$]; 
    372     if (first[0]=='(') { 
    373         first = makePostfixForX87(first[1..$-1], typelist); 
    374     } else if (first[0]=='#') first = operations[1..x+1]; 
    375     if (second[0]=='(') { 
    376         second = makePostfixForX87(second[1..$-1], typelist); 
    377     } else if (second[0]=='#') second = operations[x+3..$]; 
    378  
    379     // x87 OPTIMISATION #1 
    380     // On x87, fmul has a long latency, so we want to delay using the 
    381     // result of a multiply. Since + is commutative, we can achieve this 
    382     // by calculating the value with the multiply, before the other one. 
    383     // Note that there are a few cases that could still be improved, e.g. with 
    384     //    ((a*b)+(c*d))+(e*f),  all three multiplies could be performed 
    385     // before any of the additions. This would require stack rotation 
    386     // operations (can't be done with simple postfix), greatly increasing the 
    387     // complexity of the mini-compiler. 
    388     if (operations[x+1]=='+') { 
    389         if (second[$-1]=='*' && first[$-1]!='*') { 
    390            return second ~ first ~ operations[x+1..x+2]; 
    391         } 
    392     } 
    393     // We can also do the same thing with -, but we'll need to use fsubr 
    394     // instead of fsub. We use _ to mean reversed subtraction. 
    395     if (operations[x+1]=='-') { 
    396         if (second[$-1]=='*' && first[$-1]!='*') { 
    397            return second ~ first ~ "_"; 
    398         } 
    399     } 
    400     // x87 OPTIMISATION #2 
    401     // When an operation is performed between a real[] and a non-real[], 
    402     // we want to have the real[] being the one which is loaded first. 
    403     if (second.length==1 && typelist[second[0]-'a']=='R' && operations[x+1]=='+') { 
    404            return second ~ first ~ "+"; 
    405     } 
    406     if (second.length==1 && typelist[second[0]-'a']=='R' && operations[x+1]=='-') { 
    407            return second ~ first ~ "_"; 
    408     } 
    409     return first ~ second ~ operations[x+1..x+2]; 
    410 } 
    411  
    412 // ------------------------------- 
    413 //   PART 5 -- Mixins to generate x87 ASM code 
    414 // ------------------------------- 
    415  
    416 bool isInstruction(char op) 
    417 { 
    418     return (op=='+' || op=='*' || op=='-'|| op=='.'|| op=='_'); 
    419 } 
    420  
    421 // Count the number of vectors in the typestring 
    422 int countVectors(char [] typelist) 
    423 { 
    424     int numVecs=0; 
    425     for (int i=0; i<typelist.length; ++i) { 
    426         if (typelist[i]=='R' || typelist[i]=='D' || typelist[i]=='F' || typelist[i]=='Z') ++numVecs; 
    427     } 
    428     return numVecs; 
    429 } 
    430  
    431 // Count the number of temporaries which occur in the postfix expression. 
    432 int countTemporaries(char [] postfix) 
    433 { 
    434 // A temporary occurs whenever we load two values without an operation performed on the 
    435 // first one. 
    436     int numTemps=0; 
    437     for (int i=1; i<postfix.length; ++i) { 
    438         if (!isInstruction(postfix[i-1]) && !isInstruction(postfix[i])) numTemps++; 
    439     } 
    440     return numTemps; 
    441 } 
    442  
    443 // The maximum number of simultaneous temporary values in the postfix expression. 
    444 int maxActiveTemporaries(char [] postfix) 
    445 { 
    446     int maxTemps=0; 
    447     int numTemps=0; 
    448     for (int i=1; i<postfix.length; ++i) { 
    449         if (!isInstruction(postfix[i-1]) && !isInstruction(postfix[i])) numTemps++; 
    450         if (isInstruction(postfix[i])) numTemps--; 
    451         if (maxTemps<numTemps) maxTemps=numTemps; 
    452     } 
    453     return maxTemps; 
    454  
    455 } 
    456  
    457 int vectorNum(char [] typelist, char var) 
    458 { 
    459     int numVecs=0; 
    460     for (int i=0; i<var-'a'; ++i) { 
    461         if (typelist[i]=='R' || typelist[i]=='D' || typelist[i]=='F' || typelist[i]=='Z') ++numVecs; 
    462     } 
    463     return numVecs; 
    464 } 
    465  
    466 int realScalarNum(char [] typelist, char var) 
    467 { 
    468     int k=0; 
    469     for (int i=0; i<var-'a'; ++i) { 
    470         if (typelist[i]=='r') ++k; 
    471     } 
    472     return k; 
    473 } 
    474  
    475  
    476 char [] operandSize(char var) 
    477 { 
    478     switch(var) { 
    479         case 'r': 
    480         case 'R': return "real ptr "; 
    481         case 'd': 
    482         case 'D': return "double ptr "; 
    483         case 'f': 
    484         case 'F': return "float ptr "; 
    485     } 
    486 } 
    487  
    488 char [] opToX87(char op) 
    489 { 
    490     switch (op) { 
    491         case '*': 
    492         case '.': return "fmul"; 
    493         case '+': return "fadd"; 
    494         case '-': return "fsub"; 
    495         case '_': return "fsubr"; 
    496     } 
    497 } 
    498  
    499 static if (real.sizeof==10)      const char [] REALSIZE = "10"; 
    500 else static if (real.sizeof==12) const char [] REALSIZE = "12"; 
    501 else static if (real.sizeof==16) const char [] REALSIZE = "16"; 
    502  
    503 char [] vectorSize(char vartype) 
    504 { 
    505     switch (vartype) { 
    506         case 'D': return "8"; 
    507         case 'F': return "4"; 
    508         case 'R': return REALSIZE; 
    509     } 
    510 } 
    511  
    512 // First, use the scratch registers (EAX, ECX, EDX). If that's not enough, 
    513 // use EBX and EDI. ESI is not listed because it serves as the loop counter; the frame register EBP is not used yet. 
    514 const char [][5] vectorRegister = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 
    515  
    516 // Is this expression simple enough for the x87 code generator? 
    517 bool isX87AsmPossible(char [] typelist, char [] operations) { 
    518   version (D_InlineAsm_X86) { 
    519         // Are there enough index registers? 
    520         if (countVectors(typelist) > vectorRegister.length) return false; 
    521         // Does it contain any types we can't deal with? 
    522         foreach(ch; typelist) { 
    523             // can only do float, double, and 80-bit vectors, and scalars. 
    524             if (ch!='R' && ch!='D' && ch!='F' && ch!='r' && ch!='d' && ch!='s') return false; 
    525         } 
    526         // BUG: should also check if it will overflow the FPU stack 
    527         return true; 
    528   } else { 
    529       // Without an assembler, there's no chance! 
    530       return false; 
    531   } 
    532 } 
    533  
    534 // Is this expression simple enough for the SSE2 code generator? 
    535 bool isSSE2AsmPossible(char [] typelist, char [] operations) 
    536 { 
    537   version (D_InlineAsm_X86) { 
    538         // Does it contain any types we can't deal with? 
    539         foreach(ch; typelist) { 
    540             // can only do double vectors and double scalars. 
    541             if (ch!='D' && ch!='d') return false; 
    542         } 
    543         return false; // not yet implemented 
    544   } else { 
    545       // Without an assembler, there's no chance! 
    546       return false; 
    547   } 
    548 } 
    549  
    550 // Create code to push all used vector registers. 
    551 char [] pushRegisters(int numVectors) 
    552 { 
    553     char [] result = "  push ESI;"; 
    554     for (int i=3; i<numVectors; ++i) result~= " push " ~ vectorRegister[i] ~ ";"; 
    555     return result ~ "\n"; 
    556 } 
    557  
    558 // Create code to pop all used vector registers. 
    559 char [] popRegisters(int numVectors) 
    560 { 
    561     char [] result = "  "; 
    562     for (int i=numVectors-1; i>=3; --i) result~= "pop " ~ vectorRegister[i] ~ "; "; 
    563     return result ~ "pop ESI;\n"; 
    564 } 
    565  
    566 char [] indexedVector(char [] typelist, char var) 
    567 { 
    568     if (typelist[var-'a']=='R') return " real ptr [" ~ vectorRegister[vectorNum(typelist, var)] ~ "]"; 
    569     return operandSize(typelist[var-'a']) ~ "[" ~ 
    570             vectorRegister[vectorNum(typelist, var)] ~ " + " ~ vectorSize(typelist[var-'a']) ~ "*ESI]"; 
    571 } 
    572  
    573 char [] storeVector(char type, int vecnum) 
    574 { 
    575     char [] stride = " - " ~ vectorSize(type); 
    576     if (type=='R') { 
    577         return "  fstp real ptr [" ~ vectorRegister[vecnum] ~ stride ~ "];"\n; 
    578     } else { 
    579         return "  fstp " ~ operandSize(type) ~ " [" ~ vectorRegister[vecnum] ~ " + " ~ vectorSize(type)~ "*ESI" ~ stride ~ "];"\n; 
    580     } 
    581 } 
    582  
    583 char [] opToSSE2(char op) 
    584 { 
    585     switch (op) { 
    586         case '*': 
    587         case '.': return "mulpd"; 
    588         case '+': return "addpd"; 
    589         case '-': return "subpd"; 
    590         case '_': return "**BUG**"; // Non-existent! 
    591     } 
    592 } 
    593  
    594 char [] opToSSE(char op) 
    595 { 
    596     switch (op) { 
    597         case '*': 
    598         case '.': return "mulps"; 
    599         case '+': return "addps"; 
    600         case '-': return "subps"; 
    601         case '_': return "**BUG**"; // Non-existent! 
    602     } 
    603 } 
    604  
    605  
    606 char [] generateCodeForAsmSSE2(int knownlength, char [] typelist, char [] operations, char [] finaloperation) 
    607 { 
    608 // Use ESI as the index register. 
    609     char [] result="asm {"\n 
    610     ~"L1: \n" 
    611     ~ "  movapd XMM1, [ESI+EAX];"\n 
    612     ~ "  mulpd XMM1, XMM2;"\n 
    613     ~ "  addpd XMM1, [EDI+ESI];"\n 
    614     ~ "  movapd [EDI+ESI], XMM1;"\n 
    615     ~ "  add ESI, 16;"\n 
    616     ~ "  js L1;"\n 
    617     ~ "}"\n; 
    618     return result; 
    619 } 
    620  
    621 /** Generate asm code which is optimal for x87 CPUs without SSE2. 
    622  (Pentium, PMMX, PII, PIII). It is also optimal for recent x86 CPUs 
    623  where vector sizes are mixed. 
    624 The key optimisation rules are: 
    625  1. keep the loop overhead to one clock cycle if possible. 
    626  2. (FMUL latency) don't use the result of a multiply immediately 
    627  3. (FST latency) don't save a value to memory immediately after it's calculated. 
    628  4. (AGI stall) don't use the counter variable immediately after it's modified. 
    629 Techniques to address these are: 
    630  1. Use ESI as a counter and index variable, which begins negative and counts UP to zero. 
    631  2. The latency of fmul is avoided by swapping fadd/fsub with fmul whenever possible. 
    632  3. The latency of fstp is avoided by calculating a result in one iteration, 
    633      but not storing it to memory until the subsequent iteration. 
    634  4. (NOT YET IMPLEMENTED): first operation in the loop should be loading a scalar (for a multiply), 
    635     if possible, otherwise load an 80-bit vector, if possible. 
    636  
    637 The generated code is of the form: 
    638 ---- 
    639  load scalars onto FPU stack 
    640  load vector pointers into EAX, EBX, ... 
    641  calculate result[0] into ST(0) 
    642  goto L2 
    643 L1: 
    644  calculate result[i+1] into ST(0) 
    645  swap so that result[i] is in ST(0) 
    646 L2: 
    647  store result[i] 
    648  increment pointers, goto L1 if i<n-1 
    649  store result[n-1] 
    650  pop scalars off FPU stack 
    651 ---- 
    652  
    653 */ 
    654 char [] generateCodeForAsmX87(int knownlength, char [] typelist, char [] operations, char [] finaloperation) 
    655 { 
    656     char [] result=""; 
    657     char [] incrementRealVectors=""; 
    658  
    659     // Create local variables for pointers to vectors (avoid bug #1125) 
    660     int vecnum = 0; 
    661     for (int i=0; i< typelist.length;++i) { 
    662         if (isVector(typelist[i])){ 
    663             result~= "  auto vec" ~ itoa(i) ~ " = expr[" ~itoa(i) ~"].ptr;\n"; 
    664             if (typelist[i]=='R') { 
    665                 incrementRealVectors ~= "  add " ~ vectorRegister[vecnum] ~ ", " ~ REALSIZE ~ ";\n"; 
    666             } 
    667             ++vecnum; 
    668         } 
    669     } 
    670     if (knownlength==0) { 
    671         result ~= "  int veclength = expr[" ~itoa(findFirstVector(typelist)) ~"].length;\n"; 
    672     } 
    673  
    674     bool isDotProduct = (operations[$-1]=='.'); 
    675     int numScalarsOnStack=0; 
    676  
    677     result~= \n"asm {"\n ~ pushRegisters(vecnum); 
    678     // ESI will be the counter 
    679     if (knownlength>0) result~= "  mov ESI, " ~ itoa(knownlength) ~";\n"; 
    680     else result ~= "  mov ESI, veclength;"\n; 
    681  
    682     // Load all the vector pointers into registers, and push all the scalars onto the stack 
    683  
    684     int numvecs=0; 
    685     int numconsts=0; 
    686     for (int i=0; i<typelist.length; ++i) { 
    687       if (isVector(typelist[i])) { 
    688           if (typelist[i]=='R') { 
    689               result ~= "  mov " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"\n; 
    690           } else  { 
    691             result ~= "  lea " ~ vectorRegister[numvecs] 
    692               ~ ", [" ~ vectorSize(typelist[i]) ~ "*ESI];   " 
    693               ~ "  add " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"\n; 
    694          } 
    695         ++numvecs; 
    696       } else if (typelist[i]=='r') { 
    697           result ~= "  fld real ptr expr["~ itoa(i) ~"];\n"; 
    698           ++numconsts; 
    699           ++numScalarsOnStack; 
    700       } 
    701     } 
    702  
    703     if (isDotProduct) result ~= "  fldz;"\n; 
    704     if (knownlength>0) result~= "  mov ESI, -" ~ itoa(knownlength) ~";\n"; 
    705     else { 
    706         result ~= "  xor ESI, ESI; "\n 
    707             "  sub ESI, veclength; // counter=-length"\n 
    708             "  jz short L3; // test for length==0"\n; 
    709     } 
    710     if (!isDotProduct && operations.length==1 && finaloperation[0]=='*') { 
    711             ++numScalarsOnStack; 
    712             // load multiplier for *= 
    713             result ~= "  fld double ptr expr[0];\n"; 
    714     } 
    715     int done=0; 
    716  
    717     // Construct the main body of the loop (the main body does not include 
    718     // the final storage instruction, because of the FST latency). 
    719     char [] mainbody = ""; 
    720     char [] firstbody = ""; 
    721  
    722     // We need to keep track of how many things are on the FPU stack. 
    723     // Every time something is pushed, the indices of our variables change! 
    724     int numOnStack = 0; // How much of the FP stack is being used? 
    725  
    726 if (operations.length>1) { 
    727     while(done<operations.length) { 
    728         char [] next; 
    729       if (isInstruction(operations[done])) { 
    730             // Perform an arithmetic operation on the top two FPU stack items. 
    731             next = "  " ~ opToX87(operations[done]) ~ "p ST(1), ST;"\n; 
    732             mainbody ~= next; firstbody ~= next; 
    733             ++done; 
    734             numOnStack--; 
    735       } else if (!isInstruction(operations[done+1])){ 
    736             // load a vector onto the FPU stack, to begin a new subexpression. 
    737             int u  = operations[done]-'a'; 
    738             next = "  fld "  ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 
    739             mainbody ~= next; firstbody ~= next; 
    740             ++done; 
    741             numOnStack++; 
    742       } else if (isVector(typelist[operations[done]-'a'])) { 
    743          // An operation will be performed between the stack top and a vector. 
    744          // If it's a float or double, we can combine the load+arithmetic op 
    745          // into a single instruction. 
    746          if (typelist[operations[done]-'a']=='R') { 
    747              // 80-bit vectors must be loaded onto the FPU stack first 
    748             next = "  fld real ptr ["  ~ vectorRegister[vectorNum(typelist, operations[done])] ~ "];\n" 
    749                 ~ "  " ~ opToX87(operations[done+1]) ~ "p ST(1), ST;\n"; 
    750          } else { 
    751             next = "  " ~ opToX87(operations[done+1]) ~ " " 
    752               ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 
    753         } 
    754         mainbody ~= next; firstbody ~= next; 
    755         done +=2; 
    756       } else { // multiply by scalar. 
    757         if (typelist[operations[done]-'a']=='r') { 
    758              // Multiply by real scalar, which is already on the stack. Note that there's an extra item on the stack when we're in the body of the loop. 
    759             firstbody ~= "  fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - realScalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 
    760             mainbody ~= "  fmul ST, ST(" ~ itoa(1 + numOnStack + numScalarsOnStack - realScalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 
    761         } else { 
    762             // For scalar float or double values, we can multiply directly, saving one slot on the FP stack. 
    763             next = "  fmul " ~ operandSize(typelist[operations[done]-'a']) ~ "expr[" ~ itoa(operations[done]-'a') ~"];\n"; 
    764            mainbody ~= next; firstbody ~= next; 
    765        } 
    766             done +=2; 
    767       } 
    768     } 
    769 } else { // length = 1 
    770     char [] next; 
    771     if (typelist[$-1]=='R') 
    772         next = "  fld real ptr ["  ~ vectorRegister[0] ~ "];\n"; 
    773     else next = "  fld "~ operandSize(typelist[$-1]) ~ " [" 
    774         ~ vectorRegister[0] ~ " + " ~ vectorSize(typelist[$-1]) ~ "*ESI];\n"; 
    775     mainbody ~=next; firstbody~=next; 
    776     ++numOnStack; 
    777 } 
    778     // the last operation is special, because it may involve the 
    779     // destination vector (+=, -=, *=). 
    780     if (!isDotProduct && finaloperation.length>1) { 
    781         if (finaloperation[0]=='*') { 
    782             firstbody ~= "  fmul ST, ST(" ~ itoa(numOnStack) ~ ");"\n; 
    783             // +1 because previous result is also on stack 
    784             mainbody ~= "  fmul ST, ST(" ~ itoa(numOnStack+1) ~ ");"\n; 
    785         } else { 
    786             char [] finalop = "fadd"; 
    787             if (finaloperation[0]=='-') finalop="fsubr"; 
    788             char [] next; 
    789              if (typelist[$-1]=='R') { 
    790                  // 80-bit vectors must be loaded onto the FPU stack first 
    791                 next = "  fld real ptr ["  ~ vectorRegister[numvecs-1] ~ "];"\n; 
    792                 next ~= "  " ~ finalop ~ "p ST(1), ST;\n"; 
    793              } else { 
    794                 next = "  " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister[numvecs-1] ~ " + " 
    795                 ~ vectorSize(typelist[$-1]) ~ "*ESI];"\n; 
    796             } 
    797             mainbody ~=next; firstbody~=next; 
    798         } 
    799     } 
    800     result ~= \n ~  firstbody  ~ "  jmp short L2;\n" 
    801         ~ "  align 4;\n" ~ "L1:\n" ~ mainbody; 
    802  
    803     result ~= "  fxch ST(1), ST;\n"; // get previous result 
    804     if (isDotProduct) result ~= "  faddp ST(2), ST;"\n; 
    805     else { 
    806         result ~= storeVector(typelist[$-1], numvecs-1); 
    807     } 
    808  
    809     result ~= "L2: \n"; 
    810  
    811     // Update the counters 
    812     result~= incrementRealVectors ~ "  inc ESI;\n  jnz L1;\n"; 
    813  
    814     // Store the result from the final iteration 
    815     if (isDotProduct) result ~= "  faddp ST(1), ST;"\n; 
    816     else result ~= storeVector(typelist[$-1], numvecs-1); 
    817  
    818     // Discard any scalars that are left on the stack 
    819     if (isDotProduct && numScalarsOnStack>0) { 
    820         // Preserve the result of the dot product 
    821         result ~= "  fxch ST(" ~ itoa(numScalarsOnStack) ~ "), ST;"\n; 
    822     } 
    823     while (numScalarsOnStack>1) { 
    824         result~= "  fcompp ST(0), ST;"\n; // pop two values at once 
    825         numScalarsOnStack-=2; 
    826     } 
    827     if (numScalarsOnStack==1) result~= "  fstp ST(0), ST;"\n; 
    828  
    829  
    830     result~= "L3:" \n ~ popRegisters(vecnum) ~ "}\r\n"; 
    831315    return result; 
    832316}