 |
Changeset 3874
- Timestamp:
- 08/11/08 04:21:33
(4 months ago)
- Author:
- Don Clugston
- Message:
bignum: Major speedup for asymmetric multiplication: now ensures Karatsuba is used ONLY when it is actually beneficial; otherwise uses Schoolbook for such iterations. This algorithm is significantly better than anything else I've seen reported elsewhere.
-
Files:
-
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
| r3831 |
r3874 |
|
| 184 | 184 | *negative = true; |
|---|
| 185 | 185 | large = y; small = x; |
|---|
| 186 | | // return biguintSubDifferentLength(y, x); |
|---|
| 187 | 186 | } else { |
|---|
| 188 | 187 | *negative = false; |
|---|
| 189 | 188 | large = x; small = y; |
|---|
| 190 | | // return biguintSubDifferentLength(x, y); |
|---|
| 191 | 189 | } |
|---|
| 192 | 190 | // result.length will be equal to larger length, or could decrease by 1. |
|---|
| … | … | |
| 240 | 238 | } else return result[0..$-1]; |
|---|
| 241 | 239 | } |
|---|
| 242 | | |
|---|
| | 240 | |
|---|
| 243 | 241 | /** General unsigned multiply routine for bigints. |
|---|
| 244 | 242 | * Sets result = x*y. |
|---|
| … | … | |
| 285 | 283 | |
|---|
| 286 | 284 | uint half = (x.length >> 1) + (x.length & 1); |
|---|
| 287 | | if (y.length <= half) { |
|---|
| | 285 | if (2*y.length*y.length <= x.length*x.length) { |
|---|
| 288 | 286 | // UNBALANCED MULTIPLY |
|---|
| 289 | | // Use school multiply to cut into Karatsuba-sized squares, |
|---|
| 290 | | // unless y is so small that Karatsuba isn't worthwhile. |
|---|
| 291 | | // unbalanced case - use school multiply to cut into chunks, each sized |
|---|
| 292 | | // y.length * y.length. Use Karatsuba on each chunk. |
|---|
| 293 | | // TODO: It _might_ be better to use non-square chunks (and have fewer chunks). |
|---|
| | 287 | // Use school multiply to cut into quasi-squares of Karatsuba-size |
|---|
| | 288 | // or larger. The ratio of the two sides of the 'square' must be |
|---|
| | 289 | // between 1.414:1 and 1:1. Use Karatsuba on each chunk. |
|---|
| | 290 | // |
|---|
| | 291 | // For maximum performance, we want the ratio to be as close to |
|---|
| | 292 | // 1:1 as possible. To achieve this, we can either pad x or y. |
|---|
| | 293 | // The best choice depends on the modulus x%y. |
|---|
| | 294 | |
|---|
| | 295 | uint numchunks = x.length / y.length; |
|---|
| | 296 | uint chunksize = y.length; |
|---|
| | 297 | uint extra = x.length % y.length; |
|---|
| | 298 | uint maxchunk = chunksize + extra; |
|---|
| | 299 | bool paddingY; // true = we're padding Y, false = we're padding X. |
|---|
| | 300 | // padding Y will mean we have |
|---|
| | 301 | if (extra*extra*2 < y.length*y.length) { |
|---|
| | 302 | // The leftover bit is small enough that it should be incorporated |
|---|
| | 303 | // in the existing chunks. |
|---|
| | 304 | // Make all the chunks a tiny bit bigger |
|---|
| | 305 | // (We're padding y with zeros) |
|---|
| | 306 | chunksize += extra / cast(double)numchunks; |
|---|
| | 307 | extra = x.length - chunksize*numchunks; |
|---|
| | 308 | // there will probably be a few left over. |
|---|
| | 309 | // Every chunk will either have size chunksize, or chunksize+1. |
|---|
| | 310 | maxchunk = chunksize + 1; |
|---|
| | 311 | paddingY = true; |
|---|
| | 312 | assert(chunksize + extra + chunksize *(numchunks-1) == x.length ); |
|---|
| | 313 | } else { |
|---|
| | 314 | // the extra bit is large enough that it's worth making a new chunk. |
|---|
| | 315 | // (This means we're padding x with zeros, when doing the first one). |
|---|
| | 316 | maxchunk = chunksize; |
|---|
| | 317 | ++numchunks; |
|---|
| | 318 | paddingY = false; |
|---|
| | 319 | assert(extra + chunksize *(numchunks-1) == x.length ); |
|---|
| | 320 | } |
|---|
| | 321 | // We make the buffer a bit bigger so we have space for the partial sums. |
|---|
| | 322 | uint [] scratchbuff = new uint[karatsubaRequiredBuffSize(maxchunk) + maxchunk * 2]; |
|---|
| | 323 | uint [] partial = scratchbuff[$ - maxchunk * 2 .. $]; |
|---|
| 294 | 324 | |
|---|
| 295 | | // The first chunk is bigger, since it also needs to cover the leftover bits. |
|---|
| 296 | | uint chunksize = y.length + (x.length % y.length); |
|---|
| 297 | | // We make the buffer a bit bigger so we have space for the partial sums. |
|---|
| 298 | | uint [] scratchbuff = new uint[karatsubaRequiredBuffSize(y.length+ chunksize) + y.length * 2+1]; |
|---|
| 299 | | if (y.length == half) { |
|---|
| 300 | | chunksize = x.length - y.length; |
|---|
| 301 | | mulKaratsuba(result[0 .. y.length + chunksize], y, x[0 .. chunksize], scratchbuff); |
|---|
| 302 | | } else { |
|---|
| 303 | | mulKaratsuba(result[0 .. y.length + chunksize], x[0 .. chunksize], y, scratchbuff); |
|---|
| 304 | | } |
|---|
| 305 | | uint done = chunksize; |
|---|
| 306 | | uint [] partial = scratchbuff[$-y.length*2 .. $]; |
|---|
| | 325 | uint done; // how much of X have we done so far? |
|---|
| | 326 | double residual = 0; |
|---|
| | 327 | if (paddingY) { |
|---|
| | 328 | // If the first chunk is bigger, do it first. We're padding y. |
|---|
| | 329 | mulKaratsuba(result[0 .. y.length + chunksize + (extra>0?1:0)], x[0 .. chunksize + (extra>0?1:0)], y, scratchbuff); |
|---|
| | 330 | done = chunksize + (extra>0?1:0); |
|---|
| | 331 | if (extra) --extra; |
|---|
| | 332 | } else { // Begin with the extra bit. |
|---|
| | 333 | mulKaratsuba(result[0 .. y.length + extra], y, x[0..extra], scratchbuff); |
|---|
| | 334 | done = extra; |
|---|
| | 335 | extra = 0; |
|---|
| | 336 | } |
|---|
| | 337 | auto basechunksize = chunksize; |
|---|
| 307 | 338 | while (done < x.length) { |
|---|
| 308 | | chunksize = (done + y.length <= x.length) ? y.length : x.length - done; |
|---|
| 309 | | mulKaratsuba(partial[0 .. y.length + chunksize], x[done .. done+chunksize], y, scratchbuff); |
|---|
| | 339 | chunksize = basechunksize + (extra>0? 1:0); |
|---|
| | 340 | if (extra) --extra; |
|---|
| | 341 | mulKaratsuba(partial[0 .. y.length + chunksize], |
|---|
| | 342 | x[done .. done+chunksize], y, scratchbuff); |
|---|
| 310 | 343 | result[done + y.length .. done + y.length + chunksize] |
|---|
| 311 | 344 | = partial[y.length .. y.length + chunksize]; |
|---|
| 312 | 345 | simpleAddAssign(result[done .. done + y.length+chunksize], partial[0 .. y.length]); |
|---|
| 313 | | done += y.length; |
|---|
| | 346 | done += chunksize; |
|---|
| 314 | 347 | } |
|---|
| 315 | 348 | delete scratchbuff; |
|---|
| … | … | |
| 351 | 384 | in { |
|---|
| 352 | 385 | assert(result.length == left.length + right.length); |
|---|
| 353 | | assert(right.length>1); |
|---|
| | 386 | // assert(right.length>1); |
|---|
| 354 | 387 | } |
|---|
| 355 | 388 | body { |
|---|
| … | … | |
| 378 | 411 | } |
|---|
| 379 | 412 | |
|---|
| | 413 | uint subSimple(uint [] result, uint [] left, uint [] right) |
|---|
| | 414 | in { |
|---|
| | 415 | assert(result.length == left.length); |
|---|
| | 416 | assert(left.length >= right.length); |
|---|
| | 417 | assert(right.length>0); |
|---|
| | 418 | } |
|---|
| | 419 | body { |
|---|
| | 420 | uint carry = multibyteAddSub!('-')(result[0..right.length], |
|---|
| | 421 | left[0..right.length], right, 0); |
|---|
| | 422 | if (right.length < left.length) { |
|---|
| | 423 | result[right.length..left.length] = left[right.length .. $]; |
|---|
| | 424 | carry = multibyteIncrementAssign!('-')(result[right.length..$], carry); |
|---|
| | 425 | } //else if (result.length==left.length+1) { result[$-1] = carry; carry=0; } |
|---|
| | 426 | return carry; |
|---|
| | 427 | } |
|---|
| | 428 | |
|---|
| | 429 | |
|---|
| 380 | 430 | /* result must be larger than right. |
|---|
| 381 | 431 | */ |
|---|
| … | … | |
| 411 | 461 | } |
|---|
| 412 | 462 | |
|---|
| 413 | | //uint lastleftover = 0; |
|---|
| 414 | 463 | /* Sets result = x*y, using Karatsuba multiplication. |
|---|
| 415 | | * Valid only for balanced multiplies, and x must be longer than y. |
|---|
| 416 | | * ie 2*y.length > x.length >= y.length. |
|---|
| | 464 | * x must be longer or equal to y. |
|---|
| | 465 | * Valid only for balanced multiplies, where x is not shorter than y. |
|---|
| 417 | 466 | * It is efficient only if sqrt(2)*y.length > x.length >= y.length |
|---|
| 418 | 467 | * Params: |
|---|
| … | … | |
| 425 | 474 | if (x.length <= KARATSUBALIMIT) { |
|---|
| 426 | 475 | return mulSimple(result, x, y); |
|---|
| 427 | | } |
|---|
| | 476 | } |
|---|
| | 477 | // Must be almost square. |
|---|
| | 478 | assert(2 * y.length * y.length >= x.length * x.length, "Asymmetric Karatsuba"); |
|---|
| | 479 | |
|---|
| 428 | 480 | // Karatsuba multiply uses the following result: |
|---|
| 429 | 481 | // (Nx1 + x0)*(Ny1 + y0) = (N*N) x1y1 + x0y0 + N * mid |
|---|
| 430 | 482 | // where mid = (x1+x0)*(y1+y0) - x1y1 - x0y0 |
|---|
| 431 | 483 | // requiring 3 multiplies of length N, instead of 4. |
|---|
| | 484 | |
|---|
| | 485 | // TODO: Knuth's variant is superior: |
|---|
| | 486 | // (Nx1 + x0)*(Ny1 + y0) = (N*N) x1y1 + x0y0 - N * mid |
|---|
| | 487 | // where mid = (x0-x1)*(y0-y1) - x1y1 - x0y0 |
|---|
| | 488 | // since then the lengths cannot exceed length N. |
|---|
| 432 | 489 | |
|---|
| 433 | 490 | // half length, round up. |
|---|
| 434 | 491 | uint half = (x.length >> 1) + (x.length & 1); |
|---|
| 435 | | assert(y.length>half, "Asymmetric Karatsuba"); |
|---|
| 436 | 492 | |
|---|
| 437 | 493 | uint [] x0 = x[0 .. half]; |
|---|
| … | … | |
| 460 | 516 | mulKaratsuba(resultLow, x0, y0, newscratchbuff); |
|---|
| 461 | 517 | mid.simpleSubAssign(resultLow); |
|---|
| 462 | | mulKaratsuba(resultHigh, x1, y1, newscratchbuff); |
|---|
| | 518 | |
|---|
| | 519 | if (2 * y1.length * y1.length < x1.length * x1.length) { |
|---|
| | 520 | // an asymmetric situation has been created. |
|---|
| | 521 | // Worst case is if x:y = 1.414 : 1, then x1:y1 = 2.41 : 1. |
|---|
| | 522 | // Applying one schoolbook multiply gives us two pieces each 1.2:1 |
|---|
| | 523 | if (y1.length <= KARATSUBALIMIT) { |
|---|
| | 524 | mulSimple(resultHigh, x1, y1); |
|---|
| | 525 | } else { |
|---|
| | 526 | // divide x1 in two, then use schoolbook multiply on the two pieces. |
|---|
| | 527 | // Note that this will be smaller than y1, except when x1 = 2*y1 + 1. |
|---|
| | 528 | uint quarter = (x1.length >> 1) + (x1.length & 1); |
|---|
| | 529 | bool ysmaller = (quarter >= y1.length); |
|---|
| | 530 | mulKaratsuba(resultHigh[0..quarter+y1.length], ysmaller?x1[0..quarter]: y1, |
|---|
| | 531 | ysmaller?y1:x1[0..quarter], newscratchbuff); |
|---|
| | 532 | // Save the part which will be overwritten. |
|---|
| | 533 | newscratchbuff[0..y1.length] = resultHigh[quarter..quarter+y1.length]; |
|---|
| | 534 | mulKaratsuba(resultHigh[quarter..$], ysmaller?x1[quarter..$]: y1, |
|---|
| | 535 | ysmaller?y1:x1[quarter..$], newscratchbuff[y1.length..$]); |
|---|
| | 536 | resultHigh[quarter..$].simpleAddAssign(newscratchbuff[0..y1.length]); |
|---|
| | 537 | } |
|---|
| | 538 | } else mulKaratsuba(resultHigh, x1, y1, newscratchbuff); |
|---|
| 463 | 539 | mid.simpleSubAssign(resultHigh); |
|---|
| 464 | | |
|---|
| | 540 | |
|---|
| 465 | 541 | // result += MID |
|---|
| 466 | 542 | result[half..$].simpleAddAssign(mid); |
|---|
Download in other formats:
|
 |