Changeset 88
- Timestamp:
- 03/21/07 07:01:22 (2 years ago)
- Files:
-
- trunk/mathextra/Blade.d (modified) (12 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/mathextra/Blade.d
r87 r88 8 8 * - Supports any mix of vector addition, subtraction, dot product, and multiplication 9 9 * by a scalar. 10 * - 80-bit precision is used whereever possible (and it does not reduce execturion speed). 10 11 * - Supports mixed-length operations (eg, real[] + double[] + float[]). 11 12 * - When static arrays are used, mismatches in array length are detected at compile time. … … 17 18 * 18 19 * BUGS: 19 * - Not well tested. 20 * - Not optimal for 80-bit vectors (can save an instruction by changing float + real into real+float).20 * - Not well tested. No unit tests are included! 21 * - Not always optimal for 80-bit vectors (can save an instruction by changing float[] + real[] into real[]+float[]). 21 22 * - Not optimal for the case of multiple real vectors (they could share a counter). 22 * - Doesn't support multiply by scalar real (need to keep track of the FP stack to do this). 23 * - Doesn't yet support all combinations (eg vector*=scalar). 23 * - Not optimal for the case where all vectors are 80-bit (two counters are used, but only is required). 24 24 * - Doesn't take advantage of length being known at compile time (loop unrolling 25 25 * is possible). … … 92 92 // ... and add it to all variables in the second expression, adding 93 93 // parentheses if required. 94 if (second .length==1) return ret ~ op ~ cast(char)(nextarg+1);94 if (second=="a") return ret ~ op ~ cast(char)(nextarg+1); 95 95 ret ~= op ~ "("; 96 96 for (int i=0; i<second.length; ++i) { … … 124 124 static if (operations[$-2]=='*') { 125 125 // Already a scalar multiply, so constant fold it. 126 VectorExpr!(operations, knownlength, B) opMul( doublex) {126 VectorExpr!(operations, knownlength, B) opMul(real x) { 127 127 return VectorExpr!(operations, knownlength, B)(values[0..$-1], x*values[$-1]); 128 128 } 129 129 } else { 130 JoinResult!("*", "a", double) opMul(doublex) {131 return JoinResult!("*", "a", double)(values, x);130 JoinResult!("*", "a", real) opMul(real x) { 131 return JoinResult!("*", "a", real)(values, x); 132 132 } 133 133 } … … 168 168 performOperation!(expr.ops, "-=", len==0? expr.len : len, expr.ValueTuple, X[])(expr.values, values); 169 169 } 170 void opMulAssign(A)(A w) { // We don't want to generate this code unless it's actually used. 171 static assert(is(A: double)); 172 performOperation!("a", "*=", knownlength, double, X[])(w, values); 173 } 174 VectorExpr!("a*b", knownlength, X[], double) opMul(double w) { 175 return VectorExpr!("a*b", knownlength, X[], double)(values, w); 170 void opMulAssign(A)(A w) { // Use a template, because we don't want to generate this code 171 // unless it's actually used. 172 static assert(is(A: real)); 173 performOperation!("a", "*=", knownlength, real, X[])(w, values); 174 } 175 VectorExpr!("a*b", knownlength, X[], real) opMul(real w) { 176 return VectorExpr!("a*b", knownlength, X[], real)(values, w); 176 177 } 177 178 VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[]) opAdd(int T, Q)(Q[T] w) { … … 253 254 else static if (is(A == double[])) const char [] singleType = "D"; 254 255 else static if (is(A == float[])) const char [] singleType = "F"; 255 else static if (is(A == double)) const char [] singleType = "S";256 else static if (is(A == real)) const char [] singleType = "S"; 256 257 else const char [] singleType = "?"; 257 258 } … … 274 275 275 276 276 bool isVector(char [] typelist, char var) 277 { 278 int i = var-'a'; 279 return (typelist[i]=='R' || typelist[i]=='D' || typelist[i]=='F'); 277 bool isVector(char var) 278 { 279 return (var=='R' || var=='D' || var=='F'); 280 280 } 281 281 … … 338 338 } 339 339 340 // First, use the registers that don't need to be preserved. 340 int scalarNum(char [] typelist, char var) 341 { 342 int k=0; 343 for (int i=0; i<var-'a'; ++i) { 344 if (typelist[i]=='S') ++k; 345 } 346 return k; 347 } 348 349 // First, use the scratch registers (EAX, ECX, EDX). If that's not enough, 350 // use EBX, ESI, and EDI. Finally, use the frame register EBP. 341 351 const char [][5] VectorRegisters = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 342 352 … … 370 380 else static if (real.sizeof==12) const char [] REALSIZE="12"; 371 381 else static if (real.sizeof==16) const char [] REALSIZE="16"; 382 383 char [] storeVector(char type, int vecnum, char [] stride="") 384 { 385 if (type=='R') { 386 return " fstp real ptr [" ~ vectorRegister(vecnum) ~ stride ~ "];"\n; 387 } else { 388 return " fstp " ~ operandSize(type) ~ " [" ~ vectorRegister(vecnum) ~ " + " ~ vectorSize_LEA(type) ~ stride ~ "];"\n; 389 } 390 } 372 391 373 392 // Generate asm code which is optimal for x87 CPUs without SSE2 … … 385 404 bool firstvec=true; 386 405 int vecnum = 0; 406 int scalarnum = 0; 387 407 for (int i=0; i< typelist.length;++i) { 388 408 if (typelist[i]=='S'){ 389 result~= " auto var" ~ itoa(i) ~ " = expr[" ~ itoa(i) ~ "];\n"; 409 result~= " real var" ~ itoa(i) ~ " = expr[" ~ itoa(i) ~ "];\n"; 410 ++scalarnum; 390 411 } else { 391 result~= " auto vec" ~ itoa( vecnum) ~ " = expr[" ~itoa(i) ~"].ptr;\n";412 result~= " auto vec" ~ itoa(i) ~ " = expr[" ~itoa(i) ~"].ptr;\n"; 392 413 if (typelist[i]=='R') { 393 414 incrementRealVectors ~= " add " ~ vectorRegister(vecnum) ~ ", " ~ REALSIZE ~ ";\n"; … … 401 422 } 402 423 assert(vecnum-1 < VectorRegisters.length, "Too many vectors!"); 403 404 424 bool isDotProduct = (operations[$-1]=='.'); 425 int numScalarsOnStack=0; 405 426 406 427 result~= \n"asm {"\n ~ pushRegisters(vecnum) ~ 407 428 " mov ESI, veclength;"\n; // ESI will be the counter 408 429 409 // Load all the vector pointers into registers. 430 431 // Load all the vector pointers into registers, and the scalars onto the stack 410 432 int numvecs=0; 433 int numconsts=0; 411 434 for (int i=0; i<typelist.length; ++i) { 412 if (isVector(typelist , i+'a')) {413 if ( isRealVector(typelist, i+'a')) {414 result ~= " mov " ~ vectorRegister(numvecs) ~ ", vec" ~ itoa(numvecs) ~ ";"\n;435 if (isVector(typelist[i])) { 436 if (typelist[i]=='R') { 437 result ~= " mov " ~ vectorRegister(numvecs) ~ ", dword ptr vec" ~ itoa(i) ~ ";"\n; 415 438 } else { 416 439 result ~= " lea " ~ vectorRegister(numvecs) ~ ", [" 417 ~ vectorSize_LEA(typelist[i]) ~ "]; "\n418 ~ " add " ~ vectorRegister(numvecs) ~ ", vec" ~ itoa( numvecs) ~ ";"\n;440 ~ vectorSize_LEA(typelist[i]) ~ "]; " 441 ~ " add " ~ vectorRegister(numvecs) ~ ", vec" ~ itoa(i) ~ ";"\n; 419 442 } 420 443 ++numvecs; 444 } else { 445 result ~= " fld real ptr var"~ itoa(i) ~";\n"; 446 ++numconsts; 447 ++numScalarsOnStack; 421 448 } 422 449 } 423 if (isDotProduct) result ~= " fldz;"\n; 450 451 if (isDotProduct) result ~= " fldz;"\n; 424 452 result ~= " xor ESI, ESI; "\n 425 453 " sub ESI, veclength; // counter=-length"\n 426 454 " jz short L3; // test for length==0"\n; 427 455 if (!isDotProduct && operations.length==1 && finaloperation[0]=='*') { 456 ++numScalarsOnStack; 457 // load multiplier for *= 458 result ~= " fld double ptr var0;\n"; 459 } 428 460 int done=0; 429 461 430 462 char [] mainbody = ""; 463 char [] firstbody = ""; 464 int numOnStack = 0; // How much of the FP stack is being used? 431 465 432 466 if (operations.length>1) { 433 467 while(done<operations.length) { 468 char [] next; 434 469 if (isInstruction(operations[done])) { 435 mainbody ~= " " ~ opToX87(operations[done]) ~ "p ST(1), ST;"\n; 470 next = " " ~ opToX87(operations[done]) ~ "p ST(1), ST;"\n; 471 mainbody ~= next; firstbody ~= next; 436 472 ++done; 473 numOnStack--; 437 474 } else if (!isInstruction(operations[done+1])){ 438 475 int u = operations[done]-'a'; 439 mainbody ~= " fld " ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 476 next = " fld " ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 477 mainbody ~= next; firstbody ~= next; 440 478 ++done; 441 } else if (isVector(typelist, operations[done])) { 442 if (isRealVector(typelist, operations[done])) { 479 numOnStack++; 480 } else if (isVector(typelist[operations[done]-'a'])) { 481 if (typelist[operations[done]-'a']=='R') { 443 482 // 80-bit vectors must be loaded onto the FPU stack first 444 mainbody ~= " fld real ptr [" ~ vectorRegister(vectorNum(typelist, operations[done])) ~ "];\n";445 mainbody ~=" " ~ opToX87(operations[done+1]) ~ "p ST(1), ST;\n";483 next = " fld real ptr [" ~ vectorRegister(vectorNum(typelist, operations[done])) ~ "];\n" 484 ~ " " ~ opToX87(operations[done+1]) ~ "p ST(1), ST;\n"; 446 485 } else { 447 mainbody ~= " " ~ opToX87(operations[done+1]) ~ " "486 next = " " ~ opToX87(operations[done+1]) ~ " " 448 487 ~ indexedVector(typelist, operations[done] ) ~ ";\n"; 449 488 } 489 mainbody ~= next; firstbody ~= next; 450 490 done +=2; 451 491 } else { // multiply by scalar 452 mainbody ~= " " ~ opToX87(operations[done+1]) ~ " double ptr var" ~ itoa(operations[done]-'a') ~";\n"; 492 firstbody ~= " fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - scalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 493 mainbody ~= " fmul ST, ST(" ~ itoa(1 + numOnStack + numScalarsOnStack - scalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 494 // NOTE: For scalar float or double values, we can multiply directly, saving one slot on the FP stack. 495 // next = " " ~ opToX87(operations[done+1]) ~ " double ptr var" ~ itoa(operations[done]-'a') ~";\n"; 496 // mainbody ~= next; firstbody ~= next; 453 497 done +=2; 454 498 } 455 499 } 500 } else { 501 char [] next; 502 if (typelist[$-1]=='R') 503 next = " fld real ptr [" ~ vectorRegister(0) ~ "];\n"; 504 else next = " fld " ~ indexedVector(typelist, operations[0]) ~ ";\n"; 505 mainbody ~=next; firstbody~=next; 506 ++numOnStack; 456 507 } 457 508 if (!isDotProduct && finaloperation.length>1) { 458 if (finaloperation[0]=='*') mainbody =" fmul ST, ST(2);"\n; 459 else { 509 if (finaloperation[0]=='*') { 510 firstbody ~= " fmul ST, ST(" ~ itoa(numOnStack) ~ ");"\n; 511 // +1 because previous result is also on stack 512 mainbody ~= " fmul ST, ST(" ~ itoa(numOnStack+1) ~ ");"\n; 513 } else { 460 514 char [] finalop = "fadd"; 461 515 if (finaloperation[0]=='-') finalop="fsubr"; 462 516 if (typelist[$-1]=='R') { 463 517 // 80-bit vectors must be loaded onto the FPU stack first 464 mainbody ~= " fld real ptr [" ~ vectorRegister(numvecs-1) ~ " + " 465 ~ vectorSize_LEA(typelist[$-1]) ~ "];"\n; 518 mainbody ~= " fld real ptr [" ~ vectorRegister(numvecs-1) ~ "];"\n; 466 519 mainbody ~= " " ~ finalop ~ "p ST(1), ST;\n"; 467 520 } else { … … 471 524 } 472 525 } 473 result ~= \n ~ mainbody ~ " jmp short L2;\n"474 ~ " L1:\n" ~ mainbody;526 result ~= \n ~ firstbody ~ " jmp short L2;\n" 527 ~ " align 4;\n" ~ "L1:\n" ~ mainbody; 475 528 476 529 result ~= " fxch ST(1), ST;\n"; // get previous result 477 530 if (isDotProduct) result ~= " faddp ST(2), ST;"\n; 478 else result ~= " fstp " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister(numvecs-1) ~ " + " ~ vectorSize_LEA(typelist[$-1]) ~ " - " ~ vectorSize(typelist[$-1]) ~ "];"\n; 531 else { 532 result ~= storeVector(typelist[$-1], numvecs-1, " - " ~ vectorSize(typelist[$-1])); 533 } 479 534 480 535 result ~= "L2: \n"; 481 536 482 537 result~= incrementRealVectors ~ " inc ESI;\n jnz L1;\n"; 538 483 539 if (isDotProduct) result ~= " faddp ST(1), ST;"\n; 484 else result ~= " fstp " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister(numvecs-1) ~ " + " ~ vectorSize_LEA(typelist[$-1]) ~ "];"\n; 540 else result ~= storeVector(typelist[$-1], numvecs-1); 541 542 // Discard any scalars that are left on the stack. 543 if (isDotProduct && numScalarsOnStack>0) { 544 // Preserve the result of the dot product 545 result ~= " fxch ST(" ~ itoa(numScalarsOnStack) ~ "), ST;"\n; 546 } 547 while (numScalarsOnStack>1) { 548 result~= " fcompp ST(0), ST;"\n; 549 numScalarsOnStack-=2; 550 } 551 if (numScalarsOnStack==1) result~= " fstp ST(0), ST;"\n; 552 553 485 554 result~= "L3:" \n ~ popRegisters(vecnum) ~ "}\r\n"; 486 555 return result;
