Changeset 91
- Timestamp: 03/26/07 04:48:51 (2 years ago)
- Files:
  - trunk/mathextra/Blade.d (modified) (11 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/mathextra/Blade.d
r90 r91 190 190 } 191 191 192 VectorExpr!(X, "a", Q, X[]) Vec(X, int Q)(X[Q] vals) { VectorExpr!(X, "a", Q, X[]) a; a.values[0]=vals; return a; } 193 VectorExpr!(X, "a", 0, X[]) Vec(X)(X[] vals) { VectorExpr!(X, "a", 0, X[]) a; a.values[0]=vals; return a; } 192 // Convert static arrays to dynamic, but remember the length as a compile-time parameter. 193 VectorExpr!(X, "a", Q, X[]) Vec(X, int Q)(X[Q] vals) { 194 VectorExpr!(X, "a", Q, X[]) a; 195 a.values[0]=vals; 196 return a; 197 } 198 199 VectorExpr!(X, "a", 0, X[]) Vec(X)(X[] vals) { 200 VectorExpr!(X, "a", 0, X[]) a; 201 a.values[0]=vals; return a; 202 } 194 203 195 204 // Returns ireal if one of A or B is real, and the other is imaginary. … … 205 214 } 206 215 207 // Note that this uses only built-in types.208 216 real performOperation(char [] operations, char [] finaloperation, int knownlength, X...)(X expr) 209 217 { … … 296 304 template singleType(A) 297 305 { 298 static if (is(A == real[]) || is (A==ireal[]))const char [] singleType = "R";299 else static if (is(A == double[])|| is(A == idouble[]))const char [] singleType = "D";300 else static if (is(A == float[]) || is(A==ifloat[]))const char [] singleType = "F";306 static if (is(A == real[]) || is(A==ireal[])) const char [] singleType = "R"; 307 else static if (is(A == double[])|| is(A == idouble[]))const char [] singleType = "D"; 308 else static if (is(A == float[]) || is(A==ifloat[])) const char [] singleType = "F"; 301 309 else static if (is(A == real)) const char [] singleType = "S"; 302 310 else const char [] singleType = "?"; 303 311 } 304 312 313 // A CTFE function can't randomly index a tuple, so convert the type information 314 // into a char[]. 305 315 template vectorTupleToString(X...) 
306 316 { … … 332 342 } 333 343 344 345 int vectorNum(char [] typelist, char var) 346 { 347 int numVecs=0; 348 for (int i=0; i<var-'a'; ++i) { 349 if (typelist[i]=='R' || typelist[i]=='D' || typelist[i]=='F') ++numVecs; 350 } 351 return numVecs; 352 } 353 354 int scalarNum(char [] typelist, char var) 355 { 356 int k=0; 357 for (int i=0; i<var-'a'; ++i) { 358 if (typelist[i]=='S') ++k; 359 } 360 return k; 361 } 362 334 363 // ------------------------------- 335 364 // PART 4 -- Generate x87 ASM code … … 338 367 char [] operandSize(char var) 339 368 { 340 if (var == 'R') return "real ptr "; 341 else if (var == 'D') return "double ptr "; 342 else if (var == 'F') return "float ptr "; 343 else if (var == 'S') return "(scalar)"; 369 switch(var) { 370 case 'R': return "real ptr "; 371 case 'D': return "double ptr "; 372 case 'F': return "float ptr "; 373 } 344 374 } 345 375 346 376 char [] opToX87(char op) 347 377 { 348 if (op=='*' || op=='.') return "fmul"; 349 else if (op=='+') return "fadd"; 350 else if (op=='-') return "fsub"; 351 else if (op=='~') return "fsubr"; 352 } 378 switch (op) { 379 case '*': 380 case '.': return "fmul"; 381 case '+': return "fadd"; 382 case '-': return "fsub"; 383 case '~': return "fsubr"; 384 } 385 } 386 387 388 static if (real.sizeof==10) const char [] REALSIZE = "10"; 389 else static if (real.sizeof==12) const char [] REALSIZE = "12"; 390 else static if (real.sizeof==16) const char [] REALSIZE = "16"; 353 391 354 392 char [] vectorSize(char vartype) 355 393 { 356 if (vartype=='D') return "8"; 357 else if (vartype=='F') return "4"; 358 else if (vartype=='R') return REALSIZE; 359 } 360 361 int vectorNum(char [] typelist, char var) 362 { 363 int numVecs=0; 364 for (int i=0; i<var-'a'; ++i) { 365 if (typelist[i]=='R' || typelist[i]=='D' || typelist[i]=='F') ++numVecs; 366 } 367 return numVecs; 368 } 369 370 int scalarNum(char [] typelist, char var) 371 { 372 int k=0; 373 for (int i=0; i<var-'a'; ++i) { 374 if (typelist[i]=='S') ++k; 375 } 376 
return k; 394 switch (vartype) { 395 case 'D': return "8"; 396 case 'F': return "4"; 397 case 'R': return REALSIZE; 398 } 377 399 } 378 400 … … 401 423 vectorRegister[vectorNum(typelist, var)] ~ " + " ~ vectorSize(typelist[var-'a']) ~ "*ESI]"; 402 424 } 403 404 static if (real.sizeof==10) const char [] REALSIZE="10";405 else static if (real.sizeof==12) const char [] REALSIZE="12";406 else static if (real.sizeof==16) const char [] REALSIZE="16";407 425 408 426 char [] storeVector(char type, int vecnum, char [] stride="") … … 415 433 } 416 434 417 // Generate asm code which is optimal for x87 CPUs without SSE2 418 // (Pentium, PMMX, PII, PIII). 419 // Uses ESI as a counter and index variable, which begins negative and counts UP to zero. 420 // The latency of fstp is avoided by storing it in the subsequent iteration. 421 // The latency of fmul is avoided by swapping fadd/fsub with fmul whenever possible. 435 /** Generate asm code which is optimal for x87 CPUs without SSE2 436 (Pentium, PMMX, PII, PIII). 437 The key optimisation rules are: 438 1. keep the loop overhead to one clock cycle if possible. 439 2. (FMUL latency) don't use the result of a multiply immediately 440 3. (FST latency) don't save a value to memory immediately after it's calculated. 441 Techniques to address these are: 442 1. Use ESI as a counter and index variable, which begins negative and counts UP to zero. 443 2. The latency of fmul is avoided by swapping fadd/fsub with fmul whenever possible. 444 3. The latency of fstp is avoided by calculating a result in one iteration, 445 but not storing it to memory until the subsequent iteration. 446 447 The generated code is of the form: 448 ---- 449 load scalars onto FPU stack 450 load vector pointers into EAX, EBX, ... 
451 calculate result[0] into ST(0) 452 goto L2 453 L1: 454 calculate result[i+1] into ST(0) 455 swap so that result[i] is in ST(0) 456 L2: 457 store result[i] 458 increment pointers, goto L1 if i<n-1 459 store result[n-1] 460 pop scalars off FPU stack 461 ---- 462 463 */ 422 464 char [] makeAsmX87(char [] typelist, char [] operations, char [] finaloperation) 423 465 { … … 453 495 " mov ESI, veclength;"\n; // ESI will be the counter 454 496 455 // Load all the vector pointers into registers, and the scalars onto the stack 497 // Load all the vector pointers into registers, and push all the scalars onto the stack 498 456 499 int numvecs=0; 457 500 int numconsts=0; … … 473 516 } 474 517 475 if (isDotProduct) result ~= " fldz;"\n;518 if (isDotProduct) result ~= " fldz;"\n; 476 519 result ~= " xor ESI, ESI; "\n 477 520 " sub ESI, veclength; // counter=-length"\n … … 484 527 int done=0; 485 528 529 // Construct the main body of the loop (the main body does not include 530 // the final storage instruction, because of the FST latency). 486 531 char [] mainbody = ""; 487 532 char [] firstbody = ""; … … 518 563 // NOTE: For scalar float or double values, we can multiply directly, saving one slot on the FP stack. 519 564 // next = " " ~ opToX87(operations[done+1]) ~ " double ptr var" ~ itoa(operations[done]-'a') ~";\n"; 520 //mainbody ~= next; firstbody ~= next;565 // mainbody ~= next; firstbody ~= next; 521 566 done +=2; 522 567 }
