| 343 | | return result; |
|---|
| 344 | | } |
|---|
| 345 | | |
|---|
| 346 | | // ------------------------------------------------ |
|---|
| 347 | | // PART 4 -- Convert expression to postfix form |
|---|
| 348 | | // ------------------------------------------------ |
|---|
| 349 | | |
|---|
// Locate the end of the leading operand of an infix (sub-)expression.
//
// Despite the name, the value returned is the INDEX OF THE LAST CHARACTER of
// the first operand, not a character count:
//   "#x..."    -> 1  (the '#' marker at index 0 plus one character)
//   "(...)..." -> index of the ')' that balances the opening '('
//   otherwise  -> 0  (the operand is the single character s[0])
// The caller slices the operand out as s[0 .. exprLength(s)+1].
//
// s must be non-empty; s[0] is read unconditionally.
int exprLength(char [] s)
{
    if (s[0]=='#') return 1;
    int numParens=0;
    for (int i=0; i<s.length; ++i) {
        if (s[i]=='(') { numParens++; }
        if (s[i]==')') { numParens--; }
        // Depth back at zero: position i closes the first operand.
        if (numParens == 0) { return i; }
    }
    // Only reachable when an opening parenthesis is never closed. Previously
    // control simply fell off the end of the function without returning a
    // value; fail with an explicit diagnostic instead.
    assert(0, "exprLength: unbalanced parentheses in expression");
}
|---|
| 361 | | |
|---|
// Convert the infix string `operations` into postfix, dropping the '#'
// markers on the way, and reorder operands where that suits the x87.
//
// The input is either a lone "#x" operand or "<lhs><op><rhs>", where each
// side is a "#x" operand or a parenthesised sub-expression (converted
// recursively). typelist holds one type code per variable, indexed by
// (variable letter - 'a').
//
// Whenever the operands are emitted in swapped order, '+' stays '+' and
// '-' becomes '_' (reversed subtraction, i.e. fsubr instead of fsub).
char [] makePostfixForX87(char [] operations, char [] typelist)
{
    // Base case: a lone "#x" is already postfix once the marker is removed.
    if (operations.length==2 && operations[0]=='#') return operations[1..$];

    // Split the string around the operator that follows the first operand.
    int opIndex = exprLength(operations) + 1;
    char [] lhs = operations[0..opIndex];
    char [] rhs = operations[opIndex+1..$];

    // Reduce each side to postfix: recurse into parentheses, strip a '#'.
    if (lhs[0]=='(') {
        lhs = makePostfixForX87(lhs[1..$-1], typelist);
    } else if (lhs[0]=='#') {
        lhs = lhs[1..$];
    }
    if (rhs[0]=='(') {
        rhs = makePostfixForX87(rhs[1..$-1], typelist);
    } else if (rhs[0]=='#') {
        rhs = rhs[1..$];
    }

    char op = operations[opIndex];
    // Only + and - can have their operands exchanged.
    bool reorderable = (op=='+' || op=='-');
    // Operator to emit once the operands have been exchanged.
    char [] swappedOp = "+";
    if (op=='-') swappedOp = "_";

    // x87 OPTIMISATION #1
    // fmul has a long latency on x87, so the result of a multiply should be
    // used as late as possible. When only the right-hand side ends in a
    // multiply, evaluate that side first.
    // A few cases could still be improved: with ((a*b)+(c*d))+(e*f), all
    // three multiplies could be performed before any of the additions. That
    // would need stack rotation operations (which simple postfix cannot
    // express) and would greatly complicate the mini-compiler.
    if (reorderable && rhs[$-1]=='*' && lhs[$-1]!='*') {
        return rhs ~ lhs ~ swappedOp;
    }

    // x87 OPTIMISATION #2
    // When an operation is performed between a real[] and a non-real[],
    // the real[] should be the one that is loaded first.
    if (rhs.length==1 && typelist[rhs[0]-'a']=='R' && reorderable) {
        return rhs ~ lhs ~ swappedOp;
    }

    // No reordering applies: plain postfix, operator unchanged.
    return lhs ~ rhs ~ operations[opIndex..opIndex+1];
}
|---|
| 411 | | |
|---|
| 412 | | // ------------------------------- |
|---|
| 413 | | // PART 5 -- Mixins to generate x87 ASM code |
|---|
| 414 | | // ------------------------------- |
|---|
| 415 | | |
|---|
// True when `op` is one of the operator characters used in the postfix
// strings: '+', '-', '*', '.' or '_' (reversed subtraction). Anything else
// (for instance a variable letter) yields false.
bool isInstruction(char op)
{
    switch (op) {
    case '+', '-', '_', '*', '.':
        return true;
    default:
        return false;
    }
}
|---|
| 420 | | |
|---|
// Count the entries of the typestring that denote vectors, i.e. the type
// codes 'R', 'D', 'F' and 'Z'.
int countVectors(char [] typelist)
{
    int total = 0;
    foreach (code; typelist) {
        bool vectorCode = (code=='R' || code=='D' || code=='F' || code=='Z');
        if (vectorCode) total++;
    }
    return total;
}
|---|
| 430 | | |
|---|
// Count the temporaries needed by a postfix expression.
// A temporary arises each time two values are loaded back to back, i.e.
// with no operation applied to the first one in between. In the string this
// is every adjacent pair of characters that are both non-instructions.
int countTemporaries(char [] postfix)
{
    if (postfix.length == 0) return 0;

    int total = 0;
    bool previousIsOperand = !isInstruction(postfix[0]);
    for (int pos=1; pos<postfix.length; ++pos) {
        bool currentIsOperand = !isInstruction(postfix[pos]);
        if (previousIsOperand && currentIsOperand) ++total;
        previousIsOperand = currentIsOperand;
    }
    return total;
}
|---|
| 442 | | |
|---|
// Largest number of temporaries that are alive at the same moment while the
// postfix expression is evaluated from left to right.
// Walking the string from its second character: an operand that directly
// follows another operand opens a temporary, every instruction closes one,
// and the highest running count seen is returned (never less than zero).
int maxActiveTemporaries(char [] postfix)
{
    int peak = 0;
    int active = 0;
    for (int pos=1; pos<postfix.length; ++pos) {
        if (isInstruction(postfix[pos])) {
            --active;
        } else if (!isInstruction(postfix[pos-1])) {
            ++active;
        }
        if (active > peak) peak = active;
    }
    return peak;
}
|---|
| 456 | | |
|---|
// Number of vector entries ('R', 'D', 'F' or 'Z') that precede variable
// `var` in the typelist. Variables are lettered from 'a', so `var` maps to
// position var-'a'. The result is used as an index into vectorRegister.
int vectorNum(char [] typelist, char var)
{
    int limit = var - 'a';
    int found = 0;
    int pos = 0;
    while (pos < limit) {
        char code = typelist[pos];
        if (code=='R' || code=='D' || code=='F' || code=='Z') found++;
        ++pos;
    }
    return found;
}
|---|
| 465 | | |
|---|
// Number of 'r' entries that precede variable `var` in the typelist.
// Variables are lettered from 'a', so `var` maps to position var-'a'.
int realScalarNum(char [] typelist, char var)
{
    int limit = var - 'a';
    int found = 0;
    int pos = 0;
    while (pos < limit) {
        if (typelist[pos]=='r') found++;
        ++pos;
    }
    return found;
}
|---|
| 474 | | |
|---|
| 475 | | |
|---|
// Map a type code to the operand-size prefix used in the generated asm.
// The prefix carries a trailing space so it can be placed directly in front
// of a memory operand. Upper- and lower-case codes share a prefix:
//   'r'/'R' -> "real ptr ",  'd'/'D' -> "double ptr ",  'f'/'F' -> "float ptr "
// Any other code is a programming error and is rejected explicitly.
char [] operandSize(char var)
{
    switch(var) {
    case 'r':
    case 'R': return "real ptr ";
    case 'd':
    case 'D': return "double ptr ";
    case 'f':
    case 'F': return "float ptr ";
    // Previously there was no default, so an unknown code reached the
    // compiler-generated default; name the problem instead.
    default: assert(0, "operandSize: unsupported type code");
    }
}
|---|
| 487 | | |
|---|
// Map a postfix operator character to the base x87 mnemonic:
//   '*' and '.' -> "fmul",  '+' -> "fadd",  '-' -> "fsub",
//   '_' (reversed subtraction) -> "fsubr".
// Callers append any suffix (such as "p") themselves.
// Any other character is a programming error and is rejected explicitly.
char [] opToX87(char op)
{
    switch (op) {
    case '*':
    case '.': return "fmul";
    case '+': return "fadd";
    case '-': return "fsub";
    case '_': return "fsubr";
    // Previously there was no default, so an unknown operator reached the
    // compiler-generated default; name the problem instead.
    default: assert(0, "opToX87: not an x87 operator");
    }
}
|---|
| 498 | | |
|---|
// Size in bytes of one `real`, as text ready to be spliced into generated
// asm. Only the sizes 10, 12 and 16 are recognised. Any other value used to
// leave REALSIZE undeclared and surfaced as an "undefined identifier" at the
// point of use; it is now reported here, at compile time.
static if (real.sizeof==10) const char [] REALSIZE = "10";
else static if (real.sizeof==12) const char [] REALSIZE = "12";
else static if (real.sizeof==16) const char [] REALSIZE = "16";
else static assert(0, "REALSIZE: unsupported real.sizeof (expected 10, 12 or 16)");
|---|
| 502 | | |
|---|
// Element size in bytes, as text, for a vector type code:
//   'D' -> "8",  'F' -> "4",  'R' -> REALSIZE.
// Any other code is a programming error and is rejected explicitly.
char [] vectorSize(char vartype)
{
    switch (vartype) {
    case 'D': return "8";
    case 'F': return "4";
    case 'R': return REALSIZE;
    // Previously there was no default, so an unknown code reached the
    // compiler-generated default; name the problem instead.
    default: assert(0, "vectorSize: unsupported vector type code");
    }
}
|---|
| 511 | | |
|---|
// Index registers that hold the vector pointers, in allocation order: the
// scratch registers EAX, ECX and EDX come first, followed by EBX and EDI.
// ESI is not in the table because the generated code uses it as the loop
// counter/index, and the frame register EBP is not used at all, so at most
// five vectors can be addressed.
const char [][5] vectorRegister = ["EAX", "ECX", "EDX", "EBX", "EDI"];
|---|
| 515 | | |
|---|
// Decide whether the x87 code generator can handle this expression.
// Requires the x86 inline assembler, no more vectors than there are entries
// in vectorRegister, and a typelist made only of the codes
// 'R', 'D', 'F', 'r', 'd' and 's'. `operations` is not examined.
// NOTE(review): 's' is accepted here although operandSize/vectorSize have no
// case for it, while 'f' is handled there but rejected here -- confirm which
// spelling is intended.
bool isX87AsmPossible(char [] typelist, char [] operations) {
  version (D_InlineAsm_X86) {
    // Every vector needs its own index register.
    bool enoughRegisters = !(countVectors(typelist) > vectorRegister.length);
    if (!enoughRegisters) return false;

    // Only float, double and 80-bit vectors, and scalars, are supported.
    for (int pos=0; pos<typelist.length; ++pos) {
        char code = typelist[pos];
        bool supported = (code=='R' || code=='D' || code=='F'
                       || code=='r' || code=='d' || code=='s');
        if (!supported) return false;
    }
    // BUG: should also check if it will overflow the FPU stack
    return true;
  } else {
    // No inline assembler available, so asm generation is impossible.
    return false;
  }
}
|---|
| 533 | | |
|---|
// Decide whether the SSE2 code generator can handle this expression.
// The generator is not implemented yet, so the answer is always false; the
// type screening (double vectors 'D' and double scalars 'd' only) is kept
// in place for when it is. `operations` is not examined.
bool isSSE2AsmPossible(char [] typelist, char [] operations)
{
  version (D_InlineAsm_X86) {
    for (int pos=0; pos<typelist.length; ++pos) {
        bool isDouble = (typelist[pos]=='D' || typelist[pos]=='d');
        // Anything other than double data can never be handled.
        if (!isDouble) return false;
    }
    return false; // not yet implemented
  } else {
    // No inline assembler available, so asm generation is impossible.
    return false;
  }
}
|---|
| 549 | | |
|---|
// Build the asm text that saves the registers the generated loop will
// clobber: ESI is always pushed, followed by vectorRegister[3] and upwards
// for as many vectors as are in use. The first three table entries are never
// pushed. The text ends with a newline.
char [] pushRegisters(int numVectors)
{
    char [] code = " push ESI;";
    int reg = 3;
    while (reg < numVectors) {
        code = code ~ " push " ~ vectorRegister[reg] ~ ";";
        ++reg;
    }
    return code ~ "\n";
}
|---|
| 557 | | |
|---|
// Build the asm text that restores the registers saved by pushRegisters,
// in the opposite order: the used table entries from the highest index down
// to index 3, then ESI. The text ends with a newline.
char [] popRegisters(int numVectors)
{
    char [] code = " ";
    int reg = numVectors - 1;
    while (reg >= 3) {
        code = code ~ "pop " ~ vectorRegister[reg] ~ "; ";
        --reg;
    }
    return code ~ "pop ESI;\n";
}
|---|
| 565 | | |
|---|
// Build the asm memory operand that addresses the current element of the
// vector variable `var`.
// An 'R' vector is addressed through its pointer register alone (that
// register is advanced separately); every other vector is addressed as
// register + elementSize*ESI, prefixed with its operand size.
char [] indexedVector(char [] typelist, char var)
{
    char type = typelist[var-'a'];
    if (type=='R') {
        return " real ptr [" ~ vectorRegister[vectorNum(typelist, var)] ~ "]";
    }
    char [] operand = operandSize(type) ~ "[";
    operand = operand ~ vectorRegister[vectorNum(typelist, var)];
    return operand ~ " + " ~ vectorSize(type) ~ "*ESI]";
}
|---|
| 572 | | |
|---|
// Build the "fstp" instruction that stores ST(0) into the vector held in
// vectorRegister[vecnum]. The address is offset by minus one element size,
// so the store targets the element before the one currently indexed.
// 'R' vectors are addressed by the register alone; other types also use the
// elementSize*ESI index.
char [] storeVector(char type, int vecnum)
{
    char [] stride = " - " ~ vectorSize(type);
    char [] target;
    if (type=='R') {
        target = "real ptr [" ~ vectorRegister[vecnum] ~ stride;
    } else {
        target = operandSize(type) ~ " [" ~ vectorRegister[vecnum]
               ~ " + " ~ vectorSize(type) ~ "*ESI" ~ stride;
    }
    return " fstp " ~ target ~ "];\n";
}
|---|
| 582 | | |
|---|
// Map a postfix operator character to the packed-double SSE2 mnemonic:
//   '*' and '.' -> "mulpd",  '+' -> "addpd",  '-' -> "subpd".
// There is no reversed-subtract instruction, so '_' yields the marker text
// "**BUG**". Any other character is a programming error and is rejected
// explicitly.
char [] opToSSE2(char op)
{
    switch (op) {
    case '*':
    case '.': return "mulpd";
    case '+': return "addpd";
    case '-': return "subpd";
    case '_': return "**BUG**"; // Non-existent!
    // Previously there was no default, so an unknown operator reached the
    // compiler-generated default; name the problem instead.
    default: assert(0, "opToSSE2: not an operator");
    }
}
|---|
| 593 | | |
|---|
// Map a postfix operator character to the packed-single SSE mnemonic:
//   '*' and '.' -> "mulps",  '+' -> "addps",  '-' -> "subps".
// There is no reversed-subtract instruction, so '_' yields the marker text
// "**BUG**". Any other character is a programming error and is rejected
// explicitly.
char [] opToSSE(char op)
{
    switch (op) {
    case '*':
    case '.': return "mulps";
    case '+': return "addps";
    case '-': return "subps";
    case '_': return "**BUG**"; // Non-existent!
    // Previously there was no default, so an unknown operator reached the
    // compiler-generated default; name the problem instead.
    default: assert(0, "opToSSE: not an operator");
    }
}
|---|
| 604 | | |
|---|
| 605 | | |
|---|
// Return the SSE2 asm text. This is a fixed placeholder loop: none of the
// parameters influence the output. The loop uses ESI as the index register,
// stepping 16 bytes per iteration, and repeats while the result of the add
// is negative (js).
char [] generateCodeForAsmSSE2(int knownlength, char [] typelist, char [] operations, char [] finaloperation)
{
    return "asm {\n"
         ~ "L1: \n"
         ~ " movapd XMM1, [ESI+EAX];\n"
         ~ " mulpd XMM1, XMM2;\n"
         ~ " addpd XMM1, [EDI+ESI];\n"
         ~ " movapd [EDI+ESI], XMM1;\n"
         ~ " add ESI, 16;\n"
         ~ " js L1;\n"
         ~ "}\n";
}
|---|
| 620 | | |
|---|
| 621 | | /** Generate asm code which is optimal for x87 CPUs without SSE2. |
|---|
| 622 | | (Pentium, PMMX, PII, PIII). It is also optimal for recent x86 CPUs |
|---|
| 623 | | where vector sizes are mixed. |
|---|
| 624 | | The key optimisation rules are: |
|---|
| 625 | | 1. keep the loop overhead to one clock cycle if possible. |
|---|
| 626 | | 2. (FMUL latency) don't use the result of a multiply immediately |
|---|
| 627 | | 3. (FST latency) don't save a value to memory immediately after it's calculated. |
|---|
| 628 | | 4. (AGI stall) don't use the counter variable immediately after it's modified. |
|---|
| 629 | | Techniques to address these are: |
|---|
| 630 | | 1. Use ESI as a counter and index variable, which begins negative and counts UP to zero. |
|---|
| 631 | | 2. The latency of fmul is avoided by swapping fadd/fsub with fmul whenever possible. |
|---|
| 632 | | 3. The latency of fstp is avoided by calculating a result in one iteration, |
|---|
| 633 | | but not storing it to memory until the subsequent iteration. |
|---|
| 634 | | 4. (NOT YET IMPLEMENTED): first operation in the loop should be loading a scalar (for a multiply), |
|---|
| 635 | | if possible, otherwise load an 80-bit vector, if possible. |
|---|
| 636 | | |
|---|
| 637 | | The generated code is of the form: |
|---|
| 638 | | ---- |
|---|
| 639 | | load scalars onto FPU stack |
|---|
| 640 | | load vector pointers into EAX, EBX, ... |
|---|
| 641 | | calculate result[0] into ST(0) |
|---|
| 642 | | goto L2 |
|---|
| 643 | | L1: |
|---|
| 644 | | calculate result[i+1] into ST(0) |
|---|
| 645 | | swap so that result[i] is in ST(0) |
|---|
| 646 | | L2: |
|---|
| 647 | | store result[i] |
|---|
| 648 | | increment pointers, goto L1 if i<n-1 |
|---|
| 649 | | store result[n-1] |
|---|
| 650 | | pop scalars off FPU stack |
|---|
| 651 | | ---- |
|---|
| 652 | | |
|---|
| 653 | | */ |
|---|
| 654 | | char [] generateCodeForAsmX87(int knownlength, char [] typelist, char [] operations, char [] finaloperation) |
|---|
| 655 | | { |
|---|
| 656 | | char [] result=""; |
|---|
| 657 | | char [] incrementRealVectors=""; |
|---|
| 658 | | |
|---|
| 659 | | // Create local variables for pointers to vectors (avoid bug #1125) |
|---|
| 660 | | int vecnum = 0; |
|---|
| 661 | | for (int i=0; i< typelist.length;++i) { |
|---|
| 662 | | if (isVector(typelist[i])){ |
|---|
| 663 | | result~= " auto vec" ~ itoa(i) ~ " = expr[" ~itoa(i) ~"].ptr;\n"; |
|---|
| 664 | | if (typelist[i]=='R') { |
|---|
| 665 | | incrementRealVectors ~= " add " ~ vectorRegister[vecnum] ~ ", " ~ REALSIZE ~ ";\n"; |
|---|
| 666 | | } |
|---|
| 667 | | ++vecnum; |
|---|
| 668 | | } |
|---|
| 669 | | } |
|---|
| 670 | | if (knownlength==0) { |
|---|
| 671 | | result ~= " int veclength = expr[" ~itoa(findFirstVector(typelist)) ~"].length;\n"; |
|---|
| 672 | | } |
|---|
| 673 | | |
|---|
| 674 | | bool isDotProduct = (operations[$-1]=='.'); |
|---|
| 675 | | int numScalarsOnStack=0; |
|---|
| 676 | | |
|---|
| 677 | | result~= \n"asm {"\n ~ pushRegisters(vecnum); |
|---|
| 678 | | // ESI will be the counter |
|---|
| 679 | | if (knownlength>0) result~= " mov ESI, " ~ itoa(knownlength) ~";\n"; |
|---|
| 680 | | else result ~= " mov ESI, veclength;"\n; |
|---|
| 681 | | |
|---|
| 682 | | // Load all the vector pointers into registers, and push all the scalars onto the stack |
|---|
| 683 | | |
|---|
| 684 | | int numvecs=0; |
|---|
| 685 | | int numconsts=0; |
|---|
| 686 | | for (int i=0; i<typelist.length; ++i) { |
|---|
| 687 | | if (isVector(typelist[i])) { |
|---|
| 688 | | if (typelist[i]=='R') { |
|---|
| 689 | | result ~= " mov " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"\n; |
|---|
| 690 | | } else { |
|---|
| 691 | | result ~= " lea " ~ vectorRegister[numvecs] |
|---|
| 692 | | ~ ", [" ~ vectorSize(typelist[i]) ~ "*ESI]; " |
|---|
| 693 | | ~ " add " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"\n; |
|---|
| 694 | | } |
|---|
| 695 | | ++numvecs; |
|---|
| 696 | | } else if (typelist[i]=='r') { |
|---|
| 697 | | result ~= " fld real ptr expr["~ itoa(i) ~"];\n"; |
|---|
| 698 | | ++numconsts; |
|---|
| 699 | | ++numScalarsOnStack; |
|---|
| 700 | | } |
|---|
| 701 | | } |
|---|
| 702 | | |
|---|
| 703 | | if (isDotProduct) result ~= " fldz;"\n; |
|---|
| 704 | | if (knownlength>0) result~= " mov ESI, -" ~ itoa(knownlength) ~";\n"; |
|---|
| 705 | | else { |
|---|
| 706 | | result ~= " xor ESI, ESI; "\n |
|---|
| 707 | | " sub ESI, veclength; // counter=-length"\n |
|---|
| 708 | | " jz short L3; // test for length==0"\n; |
|---|
| 709 | | } |
|---|
| 710 | | if (!isDotProduct && operations.length==1 && finaloperation[0]=='*') { |
|---|
| 711 | | ++numScalarsOnStack; |
|---|
| 712 | | // load multiplier for *= |
|---|
| 713 | | result ~= " fld double ptr expr[0];\n"; |
|---|
| 714 | | } |
|---|
| 715 | | int done=0; |
|---|
| 716 | | |
|---|
| 717 | | // Construct the main body of the loop (the main body does not include |
|---|
| 718 | | // the final storage instruction, because of the FST latency). |
|---|
| 719 | | char [] mainbody = ""; |
|---|
| 720 | | char [] firstbody = ""; |
|---|
| 721 | | |
|---|
| 722 | | // We need to keep track of how many things are on the FPU stack. |
|---|
| 723 | | // Every time something is pushed, the indices of our variables change! |
|---|
| 724 | | int numOnStack = 0; // How much of the FP stack is being used? |
|---|
| 725 | | |
|---|
| 726 | | if (operations.length>1) { |
|---|
| 727 | | while(done<operations.length) { |
|---|
| 728 | | char [] next; |
|---|
| 729 | | if (isInstruction(operations[done])) { |
|---|
| 730 | | // Perform an arithemetic operation on the top two FPU stack items. |
|---|
| 731 | | next = " " ~ opToX87(operations[done]) ~ "p ST(1), ST;"\n; |
|---|
| 732 | | mainbody ~= next; firstbody ~= next; |
|---|
| 733 | | ++done; |
|---|
| 734 | | numOnStack--; |
|---|
| 735 | | } else if (!isInstruction(operations[done+1])){ |
|---|
| 736 | | // load a vector onto the FPU stack, to begin a new subexpression. |
|---|
| 737 | | int u = operations[done]-'a'; |
|---|
| 738 | | next = " fld " ~ indexedVector(typelist, operations[done] ) ~ ";\n"; |
|---|
| 739 | | mainbody ~= next; firstbody ~= next; |
|---|
| 740 | | ++done; |
|---|
| 741 | | numOnStack++; |
|---|
| 742 | | } else if (isVector(typelist[operations[done]-'a'])) { |
|---|
| 743 | | // An operation will be performed between the stack top and a vector. |
|---|
| 744 | | // If it's a float or double, we can combine the load+arithmetic op |
|---|
| 745 | | // into a single instruction. |
|---|
| 746 | | if (typelist[operations[done]-'a']=='R') { |
|---|
| 747 | | // 80-bit vectors must be loaded onto the FPU stack first |
|---|
| 748 | | next = " fld real ptr [" ~ vectorRegister[vectorNum(typelist, operations[done])] ~ "];\n" |
|---|
| 749 | | ~ " " ~ opToX87(operations[done+1]) ~ "p ST(1), ST;\n"; |
|---|
| 750 | | } else { |
|---|
| 751 | | next = " " ~ opToX87(operations[done+1]) ~ " " |
|---|
| 752 | | ~ indexedVector(typelist, operations[done] ) ~ ";\n"; |
|---|
| 753 | | } |
|---|
| 754 | | mainbody ~= next; firstbody ~= next; |
|---|
| 755 | | done +=2; |
|---|
| 756 | | } else { // multiply by scalar. |
|---|
| 757 | | if (typelist[operations[done]-'a']=='r') { |
|---|
| 758 | | // Multiply by real scalar, which is already on the stack. Note that there's an extra item on the stack when we're in the body of the loop. |
|---|
| 759 | | firstbody ~= " fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - realScalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; |
|---|
| 760 | | mainbody ~= " fmul ST, ST(" ~ itoa(1 + numOnStack + numScalarsOnStack - realScalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; |
|---|
| 761 | | } else { |
|---|
| 762 | | // For scalar float or double values, we can multiply directly, saving one slot on the FP stack. |
|---|
| 763 | | next = " fmul " ~ operandSize(typelist[operations[done]-'a']) ~ "expr[" ~ itoa(operations[done]-'a') ~"];\n"; |
|---|
| 764 | | mainbody ~= next; firstbody ~= next; |
|---|
| 765 | | } |
|---|
| 766 | | done +=2; |
|---|
| 767 | | } |
|---|
| 768 | | } |
|---|
| 769 | | } else { // length = 1 |
|---|
| 770 | | char [] next; |
|---|
| 771 | | if (typelist[$-1]=='R') |
|---|
| 772 | | next = " fld real ptr [" ~ vectorRegister[0] ~ "];\n"; |
|---|
| 773 | | else next = " fld "~ operandSize(typelist[$-1]) ~ " [" |
|---|
| 774 | | ~ vectorRegister[0] ~ " + " ~ vectorSize(typelist[$-1]) ~ "*ESI];\n"; |
|---|
| 775 | | mainbody ~=next; firstbody~=next; |
|---|
| 776 | | ++numOnStack; |
|---|
| 777 | | } |
|---|
| 778 | | // the last operation is special, because it may involve the |
|---|
| 779 | | // destination vector (+=, -=, *=). |
|---|
| 780 | | if (!isDotProduct && finaloperation.length>1) { |
|---|
| 781 | | if (finaloperation[0]=='*') { |
|---|
| 782 | | firstbody ~= " fmul ST, ST(" ~ itoa(numOnStack) ~ ");"\n; |
|---|
| 783 | | // +1 because previous result is also on stack |
|---|
| 784 | | mainbody ~= " fmul ST, ST(" ~ itoa(numOnStack+1) ~ ");"\n; |
|---|
| 785 | | } else { |
|---|
| 786 | | char [] finalop = "fadd"; |
|---|
| 787 | | if (finaloperation[0]=='-') finalop="fsubr"; |
|---|
| 788 | | char [] next; |
|---|
| 789 | | if (typelist[$-1]=='R') { |
|---|
| 790 | | // 80-bit vectors must be loaded onto the FPU stack first |
|---|
| 791 | | next = " fld real ptr [" ~ vectorRegister[numvecs-1] ~ "];"\n; |
|---|
| 792 | | next ~= " " ~ finalop ~ "p ST(1), ST;\n"; |
|---|
| 793 | | } else { |
|---|
| 794 | | next = " " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister[numvecs-1] ~ " + " |
|---|
| 795 | | ~ vectorSize(typelist[$-1]) ~ "*ESI];"\n; |
|---|
| 796 | | } |
|---|
| 797 | | mainbody ~=next; firstbody~=next; |
|---|
| 798 | | } |
|---|
| 799 | | } |
|---|
| 800 | | result ~= \n ~ firstbody ~ " jmp short L2;\n" |
|---|
| 801 | | ~ " align 4;\n" ~ "L1:\n" ~ mainbody; |
|---|
| 802 | | |
|---|
| 803 | | result ~= " fxch ST(1), ST;\n"; // get previous result |
|---|
| 804 | | if (isDotProduct) result ~= " faddp ST(2), ST;"\n; |
|---|
| 805 | | else { |
|---|
| 806 | | result ~= storeVector(typelist[$-1], numvecs-1); |
|---|
| 807 | | } |
|---|
| 808 | | |
|---|
| 809 | | result ~= "L2: \n"; |
|---|
| 810 | | |
|---|
| 811 | | // Update the counters |
|---|
| 812 | | result~= incrementRealVectors ~ " inc ESI;\n jnz L1;\n"; |
|---|
| 813 | | |
|---|
| 814 | | // Store the result from the final iteration |
|---|
| 815 | | if (isDotProduct) result ~= " faddp ST(1), ST;"\n; |
|---|
| 816 | | else result ~= storeVector(typelist[$-1], numvecs-1); |
|---|
| 817 | | |
|---|
| 818 | | // Discard any scalars that are left on the stack |
|---|
| 819 | | if (isDotProduct && numScalarsOnStack>0) { |
|---|
| 820 | | // Preserve the result of the dot product |
|---|
| 821 | | result ~= " fxch ST(" ~ itoa(numScalarsOnStack) ~ "), ST;"\n; |
|---|
| 822 | | } |
|---|
| 823 | | while (numScalarsOnStack>1) { |
|---|
| 824 | | result~= " fcompp ST(0), ST;"\n; // pop two values at once |
|---|
| 825 | | numScalarsOnStack-=2; |
|---|
| 826 | | } |
|---|
| 827 | | if (numScalarsOnStack==1) result~= " fstp ST(0), ST;"\n; |
|---|
| 828 | | |
|---|
| 829 | | |
|---|
| 830 | | result~= "L3:" \n ~ popRegisters(vecnum) ~ "}\r\n"; |
|---|