Changeset 89
- Timestamp:
- 03/22/07 04:50:33 (2 years ago)
- Files:
-
- trunk/mathextra/Blade.d (modified) (24 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/mathextra/Blade.d
r88 r89 4 4 * Generate near-optimal x87 asm code for BLAS1 basic vector operations at compile time. 5 5 * 32, 64 and 80 bit vectors are all supported. 6 * Uses techniques described in Agner Fog's superb optimisation manual (www.agner.org). 7 * 8 * Author: 9 * Don Clugston. 10 * License: 11 * Public domain. 6 12 * 7 13 * FEATURES: 8 14 * - Supports any mix of vector addition, subtraction, dot product, and multiplication 9 * by a scalar.10 * - 80-bit precision is used whe reever possible (and it does not reduce execturion speed).15 * by a real scalar. 16 * - 80-bit precision is used whenever possible (and does not reduce execution speed). 11 17 * - Supports mixed-length operations (eg, real[] + double[] + float[]). 12 18 * - When static arrays are used, mismatches in array length are detected at compile time. … … 17 23 * and is faster than the unrolled solution originally published by Intel. 18 24 * 19 * BUGS :25 * BUGS/ FUTURE DIRECTIONS: 20 26 * - Not well tested. No unit tests are included! 21 * - Not always optimal for 80-bit vectors (can save an instruction by changing float[] + real[] into real[]+float[]).22 27 * - Not optimal for the case of multiple real vectors (they could share a counter). 23 28 * - Not optimal for the case where all vectors are 80-bit (two counters are used, but only is required). … … 103 108 104 109 /** 105 * A proxy for a temporary result ofvector type.110 * A proxy for a vector operation returning a vector type. 106 111 * Stores all the arguments as a tuple, and the operations 107 112 * as a character string. 113 * Note that if operations=="a", it's a single vector; otherwise, it's a temporary. 114 * 'knownlength' is the length of vector, when known at compile time. 115 * If 'knownlength' == 0, it is unknown at compile time. 108 116 */ 109 117 struct VectorExpr(char[] operations, int knownlength, B...) { … … 122 130 return q; 123 131 } 124 static if (operations [$-2]=='*') {132 static if (operations.length>1 && operations[$-2]=='*') { 125 133 // Already a scalar multiply, so constant fold it. 126 134 VectorExpr!(operations, knownlength, B) opMul(real x) { … … 138 146 return JoinResult!("-", C.ops, C.ValueTuple)(values, x.values); 139 147 } 140 } 141 142 private const char [] VectorProxyA = "a"; 143 144 /** 145 * A simple wrapper for a built-in array of type X. 146 * Allows vector addition and subtraction of *any* array type; we'll catch 147 * incompatibilities at the code generation step. 148 * if 'knownlength' == 0, it is unknown at compile time. 149 */ 150 struct VectorProxy(X, int knownlength) { 151 alias VectorProxyA ops; 152 alias knownlength len; 153 alias X[] ValueTuple; 154 X [] values; 155 156 static VectorProxy!(X, 0) opCall(dummy=void)(X [] vals) { VectorProxy!(X, 0) a; a.values=vals; return a; } 157 static VectorProxy!(X, Q) opCall(int Q)(X [Q] vals) { VectorProxy!(X, Q) a; a.values=vals; return a; } 148 149 // The opAssign operations are only valid for single vectors, not for temporaries 150 static if (operations=="a") { 158 151 void opAssign(A)(A expr) { 159 152 static assert(len==0 || expr.len == 0 || len == expr.len, "Vector lengths must match"); 160 performOperation!(expr.ops, "=", len==0? expr.len : len, expr.ValueTuple, X[])(expr.values, values);153 performOperation!(expr.ops, "=", len==0? expr.len : len, expr.ValueTuple, B[0])(expr.values, values); 161 154 } 162 155 void opAddAssign(A)(A expr) { 163 156 static assert(len==0 || expr.len == 0 || len == expr.len, "Vector lengths must match"); 164 performOperation!(expr.ops, "+=", len==0? expr.len : len, expr.ValueTuple, X[])(expr.values, values);157 performOperation!(expr.ops, "+=", len==0? expr.len : len, expr.ValueTuple, B[0])(expr.values, values); 165 158 } 166 159 void opSubAssign(A)(A expr) { 167 160 static assert(len==0 || expr.len == 0 || len == expr.len, "Vector lengths must match"); 168 performOperation!(expr.ops, "-=", len==0? expr.len : len, expr.ValueTuple, X[])(expr.values, values);161 performOperation!(expr.ops, "-=", len==0? expr.len : len, expr.ValueTuple, B[0])(expr.values, values); 169 162 } 170 163 void opMulAssign(A)(A w) { // Use a template, because we don't want to generate this code 171 164 // unless it's actually used. 172 165 static assert(is(A: real)); 173 performOperation!("a", "*=", knownlength, real, X[])(w, values); 174 } 175 VectorExpr!("a*b", knownlength, X[], real) opMul(real w) { 176 return VectorExpr!("a*b", knownlength, X[], real)(values, w); 177 } 178 VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[]) opAdd(int T, Q)(Q[T] w) { 179 static assert(T==0 || knownlength==0 || T==knownlength); 180 return VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[])(values, w); 181 } 182 VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[]) opAdd(Q, int T)(VectorProxy!(Q, T) w) { 183 static assert(T==0 || knownlength==0 || T==knownlength); 184 return VectorExpr!("a+b", knownlength==0 ? T:knownlength, X[], Q[])(values, w.values); 185 } 186 VectorExpr!("a-b", knownlength==0 ? T:knownlength, X[], Q[]) opSub(int T, Q)(Q[T] w) { 187 static assert(T==0 || knownlength==0 || T==knownlength); 188 return VectorExpr!("a-b", knownlength==0 ? T:knownlength, X[], Q[])(values, w); 189 } 190 VectorExpr!("a-b", knownlength==0 ? T:knownlength, X[], Q[]) opSub(Q, int T)(VectorProxy!(Q, T) w) { 191 static assert(T==0 || knownlength==0 || T==knownlength); 192 return VectorExpr!("a-b", knownlength==0 ? T:knownlength, X[], Q[])(values, w.values); 193 } 194 } 195 196 VectorProxy!(X, 0) Vec(X)(X[] vals) { return VectorProxy!(X, 0)(vals); } 197 VectorProxy!(X, Q) Vec(X, int Q)(X[Q] vals) { return VectorProxy!(X, Q)(vals); } 166 performOperation!("a", "*=", knownlength, real, B[0])(w, values); 167 } 168 } 169 } 170 171 172 VectorExpr!("a", Q, X[]) Vec(X, int Q)(X[Q] vals) { VectorExpr!("a", Q, X[]) a; a.values[0]=vals; return a; } 173 VectorExpr!("a", 0, X[]) Vec(X)(X[] vals) { VectorExpr!("a", 0, X[]) a; a.values[0]=vals; return a; } 198 174 199 175 real dot(A, B)(A a, B b) … … 206 182 real performOperation(char [] operations, char [] finaloperation, int knownlength, X...)(X expr) 207 183 { 208 pragma(msg, finaloperation ~ operations); 209 const char [] post = makePostfix(operations); 184 pragma(msg, finaloperation ~ " " ~ operations); 210 185 const char [] tupstr = vectorTupleToString!(X); 186 const char [] post = makePostfixForX87(operations, tupstr); 211 187 pragma(msg, post); 212 188 pragma(msg, tupstr); 213 189 static if (knownlength!=0) pragma(msg, "Length is known!"); 214 const char [] qqq = makeAsmX87(vectorTupleToString!(X), makePostfix(operations), finaloperation); 190 const char [] qqq = makeAsmX87(tupstr, makePostfixForX87(operations, tupstr), finaloperation); 191 const char [] qqq2 = makeAsmX87(tupstr, makePostfix(operations), finaloperation); 215 192 pragma(msg, qqq); 216 193 217 mixin(makeAsmX87(vectorTupleToString!(X), makePostfix (operations), finaloperation));194 mixin(makeAsmX87(vectorTupleToString!(X), makePostfixForX87(operations, tupstr), finaloperation)); 218 195 } 219 196 … … 249 226 } 250 227 228 // Apply x87-specific optimisations while converting to postfix 229 char [] makePostfixForX87(char [] operations, char [] typelist) 230 { 231 if (operations.length==1) return operations; 232 233 int x = exprLength(operations); 234 char [] first = operations[0..x+1]; 235 char [] second = operations[x+2..$]; 236 if (first[0]=='(') { 237 first = makePostfixForX87(first[1..$-1], typelist); 238 } 239 if (second[0]=='(') { 240 second = makePostfixForX87(second[1..$-1], typelist); 241 } 242 // x87 OPTIMISATION #1 243 // On x87, fmul has a long latency, so we want to delay using the 244 // result of a multiply. Since + is commutative, we can achieve this 245 // by calculating the value with the multiply, before the other one. 246 if (operations[x+1]=='+') { 247 if (second[$-1]=='*' && first[$-1]!='*') { 248 return second ~ first ~ operations[x+1..x+2]; 249 } 250 } 251 // We can also do the same thing with -, but we need to use fsubr 252 // instead of fsub. 253 if (operations[x+1]=='-') { 254 if (second[$-1]=='*' && first[$-1]!='*') { 255 return second ~ first ~ "~"; 256 } 257 } 258 // x87 OPTIMISATION #2 259 // When an operation is performed between a real[] and a non-real[], 260 // we want to have the real[] being the one which is loaded first. 261 if (second.length==1 && typelist[second[0]-'a']=='R' && operations[x+1]=='+') { 262 return second ~ first ~ "+"; 263 } 264 if (second.length==1 && typelist[second[0]-'a']=='R' && operations[x+1]=='-') { 265 return second ~ first ~ "~"; 266 } 267 return first ~ second ~ operations[x+1..x+2]; 268 } 269 251 270 template singleType(A) 252 271 { … … 268 287 // ------------------------------- 269 288 270 271 289 bool isInstruction(char op) 272 290 { 273 return (op=='+' || op=='*' || op=='-'|| op=='.'); 274 } 275 291 return (op=='+' || op=='*' || op=='-'|| op=='.'|| op=='~'); 292 } 276 293 277 294 bool isVector(char var) 278 295 { 279 296 return (var=='R' || var=='D' || var=='F'); 280 }281 282 bool isRealVector(char [] typelist, char var)283 {284 return typelist[var-'a']=='R';285 297 } 286 298 … … 304 316 else if (var == 'F') return "float ptr "; 305 317 else if (var == 'S') return "(scalar)"; 306 else return "(???)";307 318 } 308 319 … … 312 323 else if (op=='+') return "fadd"; 313 324 else if (op=='-') return "fsub"; 314 } 315 316 char [] vectorSize_LEA(char var) 317 { 318 if (var=='D') return "8*ESI"; 319 else if (var=='F') return "4*ESI"; 320 else return "ESI"; 325 else if (op=='~') return "fsubr"; 321 326 } 322 327 … … 326 331 else if (vartype=='F') return "4"; 327 332 else if (vartype=='R') return REALSIZE; 328 assert(0);329 333 } 330 334 … … 349 353 // First, use the scratch registers (EAX, ECX, EDX). If that's not enough, 350 354 // use EBX, ESI, and EDI. Finally, use the frame register EBP. 351 const char [][5] VectorRegisters = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 352 353 char [] vectorRegister(int vecnum) 354 { 355 return VectorRegisters[vecnum]; 356 } 355 const char [][5] vectorRegister = ["EAX", "ECX", "EDX", "EBX", "EDI"]; 357 356 358 357 char [] pushRegisters(int numVectors) 359 358 { 360 359 char [] result = " push ESI;"; 361 for (int i=3; i<numVectors; ++i) result~= " push " ~ VectorRegisters[i] ~ ";";360 for (int i=3; i<numVectors; ++i) result~= " push " ~ vectorRegister[i] ~ ";"; 362 361 return result ~ "\n"; 363 362 } … … 366 365 { 367 366 char [] result = " "; 368 for (int i=numVectors-1; i>=3; --i) result~= "pop " ~ VectorRegisters[i] ~ "; ";367 for (int i=numVectors-1; i>=3; --i) result~= "pop " ~ vectorRegister[i] ~ "; "; 369 368 return result ~ "pop ESI;\n"; 370 369 } … … 372 371 char [] indexedVector(char [] typelist, char var) 373 372 { 374 if (typelist[var-'a']=='R') return " real ptr [" ~ vectorRegister (vectorNum(typelist, var))~ "]";373 if (typelist[var-'a']=='R') return " real ptr [" ~ vectorRegister[vectorNum(typelist, var)] ~ "]"; 375 374 return operandSize(typelist[var-'a']) ~ "[" ~ 376 vectorRegister (vectorNum(typelist, var))~ " + " ~ vectorSize(typelist[var-'a']) ~ "*ESI]";375 vectorRegister[vectorNum(typelist, var)] ~ " + " ~ vectorSize(typelist[var-'a']) ~ "*ESI]"; 377 376 } 378 377 … … 384 383 { 385 384 if (type=='R') { 386 return " fstp real ptr [" ~ vectorRegister (vecnum)~ stride ~ "];"\n;385 return " fstp real ptr [" ~ vectorRegister[vecnum] ~ stride ~ "];"\n; 387 386 } else { 388 return " fstp " ~ operandSize(type) ~ " [" ~ vectorRegister (vecnum) ~ " + " ~ vectorSize_LEA(type)~ stride ~ "];"\n;387 return " fstp " ~ operandSize(type) ~ " [" ~ vectorRegister[vecnum] ~ " + " ~ vectorSize(type)~ "*ESI" ~ stride ~ "];"\n; 389 388 } 390 389 } … … 412 411 result~= " auto vec" ~ itoa(i) ~ " = expr[" ~itoa(i) ~"].ptr;\n"; 413 412 if (typelist[i]=='R') { 414 incrementRealVectors ~= " add " ~ vectorRegister (vecnum)~ ", " ~ REALSIZE ~ ";\n";413 incrementRealVectors ~= " add " ~ vectorRegister[vecnum] ~ ", " ~ REALSIZE ~ ";\n"; 415 414 } 416 415 if (firstvec) { … … 421 420 } 422 421 } 423 assert(vecnum-1 < VectorRegisters.length, "Too many vectors!");422 assert(vecnum-1 < vectorRegister.length, "Too many vectors!"); 424 423 bool isDotProduct = (operations[$-1]=='.'); 425 424 int numScalarsOnStack=0; … … 427 426 result~= \n"asm {"\n ~ pushRegisters(vecnum) ~ 428 427 " mov ESI, veclength;"\n; // ESI will be the counter 429 430 428 431 429 // Load all the vector pointers into registers, and the scalars onto the stack … … 435 433 if (isVector(typelist[i])) { 436 434 if (typelist[i]=='R') { 437 result ~= " mov " ~ vectorRegister (numvecs) ~ ", dword ptrvec" ~ itoa(i) ~ ";"\n;435 result ~= " mov " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"\n; 438 436 } else { 439 result ~= " lea " ~ vectorRegister (numvecs) ~ ", ["440 ~ vectorSize_LEA(typelist[i]) ~ "]; "441 ~ " add " ~ vectorRegister (numvecs)~ ", vec" ~ itoa(i) ~ ";"\n;437 result ~= " lea " ~ vectorRegister[numvecs] 438 ~ ", [" ~ vectorSize(typelist[i]) ~ "*ESI]; " 439 ~ " add " ~ vectorRegister[numvecs] ~ ", vec" ~ itoa(i) ~ ";"\n; 442 440 } 443 441 ++numvecs; … … 481 479 if (typelist[operations[done]-'a']=='R') { 482 480 // 80-bit vectors must be loaded onto the FPU stack first 483 next = " fld real ptr [" ~ vectorRegister (vectorNum(typelist, operations[done]))~ "];\n"481 next = " fld real ptr [" ~ vectorRegister[vectorNum(typelist, operations[done])] ~ "];\n" 484 482 ~ " " ~ opToX87(operations[done+1]) ~ "p ST(1), ST;\n"; 485 483 } else { … … 489 487 mainbody ~= next; firstbody ~= next; 490 488 done +=2; 491 } else { // multiply by scalar 489 } else { // multiply by scalar. Note that there's an extra item on the stack when we're in the body of the loop. 492 490 firstbody ~= " fmul ST, ST(" ~ itoa(numOnStack + numScalarsOnStack - scalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; 493 491 mainbody ~= " fmul ST, ST(" ~ itoa(1 + numOnStack + numScalarsOnStack - scalarNum(typelist, operations[done]-'a')) ~ "); //var" ~ itoa(operations[done]-'a') ~ \n; … … 501 499 char [] next; 502 500 if (typelist[$-1]=='R') 503 next = " fld real ptr [" ~ vectorRegister (0)~ "];\n";501 next = " fld real ptr [" ~ vectorRegister[0] ~ "];\n"; 504 502 else next = " fld " ~ indexedVector(typelist, operations[0]) ~ ";\n"; 505 503 mainbody ~=next; firstbody~=next; … … 516 514 if (typelist[$-1]=='R') { 517 515 // 80-bit vectors must be loaded onto the FPU stack first 518 mainbody ~= " fld real ptr [" ~ vectorRegister (numvecs-1)~ "];"\n;516 mainbody ~= " fld real ptr [" ~ vectorRegister[numvecs-1] ~ "];"\n; 519 517 mainbody ~= " " ~ finalop ~ "p ST(1), ST;\n"; 520 518 } else { 521 mainbody ~= " " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister (numvecs-1)~ " + "522 ~ vectorSize _LEA(typelist[$-1]) ~ "];"\n;519 mainbody ~= " " ~ finalop ~ " " ~ operandSize(typelist[$-1]) ~ " [" ~ vectorRegister[numvecs-1] ~ " + " 520 ~ vectorSize(typelist[$-1]) ~ "*ESI];"\n; 523 521 } 524 522 } … … 565 563 auto q = Vec([3.5L, 1.1, 3.8]); 566 564 auto r = Vec([17.0f, 28.1, 1]); 567 q -= (( p+r)*18.0L*314.1L - (p-r))* 35;565 q -= ((r+p)*18.0L*314.1L - (p-r))* 35; 568 566 real d = dot(r, p+r+r); 569 567 writefln(d); 570 p*=2; 571 } 568 p = r - q*2.0; 569 p*=5.6L; // currently fails 570 }
