Download Reference Manual
The Developer's Library for D
About Wiki Forums Source Search Contact

Changeset 3874

Show
Ignore:
Timestamp:
08/11/08 04:21:33 (4 months ago)
Author:
Don Clugston
Message:

bignum: Major speedup for asymmetric multiplication: now ensures Karatsuba is used ONLY when it is actually beneficial; otherwise uses Schoolbook for such iterations. This algorithm is significantly better than anything else I've seen reported elsewhere.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/tango/math/internal/BiguintCore.d

    r3831 r3874  
    184184        *negative = true; 
    185185        large = y; small = x; 
    186 //        return biguintSubDifferentLength(y, x); 
    187186    } else { 
    188187        *negative = false; 
    189188        large = x; small = y; 
    190   //      return biguintSubDifferentLength(x, y); 
    191189    } 
    192190    // result.length will be equal to larger length, or could decrease by 1. 
     
    240238    } else return result[0..$-1]; 
    241239} 
    242      
     240 
    243241/** General unsigned multiply routine for bigints. 
    244242 *  Sets result = x*y. 
     
    285283     
    286284    uint half = (x.length >> 1) + (x.length & 1); 
    287     if (y.length <= half) { 
     285    if (2*y.length*y.length <= x.length*x.length) { 
    288286        // UNBALANCED MULTIPLY 
    289         // Use school multiply to cut into Karatsuba-sized squares, 
    290         // unless y is so small that Karatsuba isn't worthwhile. 
    291         // unbalanced case - use school multiply to cut into chunks, each sized  
    292         // y.length * y.length. Use Karatsuba on each chunk. 
    293         // TODO: It _might_ be better to use non-square chunks (and have fewer chunks). 
     287        // Use school multiply to cut into quasi-squares of Karatsuba-size 
     288        // or larger. The ratio of the two sides of the 'square' must be  
     289        // between 1.414:1 and 1:1. Use Karatsuba on each chunk.  
     290        // 
     291        // For maximum performance, we want the ratio to be as close to  
     292        // 1:1 as possible. To achieve this, we can either pad x or y. 
     293        // The best choice depends on the modulus x%y. 
     294        
     295        uint numchunks = x.length / y.length; 
     296        uint chunksize = y.length; 
     297        uint extra =  x.length % y.length; 
     298        uint maxchunk = chunksize + extra; 
     299        bool paddingY; // true = we're padding Y, false = we're padding X. 
     300        // padding Y will mean we have  
     301        if (extra*extra*2 < y.length*y.length) { 
     302            // The leftover bit is small enough that it should be incorporated 
     303            // in the existing chunks.             
     304            // Make all the chunks a tiny bit bigger 
     305            // (We're padding y with zeros) 
     306            chunksize += extra / cast(double)numchunks; 
     307            extra = x.length - chunksize*numchunks; 
     308            // there will probably be a few left over. 
     309            // Every chunk will either have size chunksize, or chunksize+1. 
     310            maxchunk = chunksize + 1; 
     311            paddingY = true; 
     312            assert(chunksize + extra + chunksize *(numchunks-1) == x.length ); 
     313        } else  { 
     314            // the extra bit is large enough that it's worth making a new chunk. 
     315            // (This means we're padding x with zeros, when doing the first one). 
     316            maxchunk = chunksize; 
     317            ++numchunks; 
     318            paddingY = false; 
     319            assert(extra + chunksize *(numchunks-1) == x.length ); 
     320        } 
     321        // We make the buffer a bit bigger so we have space for the partial sums. 
     322        uint [] scratchbuff = new uint[karatsubaRequiredBuffSize(maxchunk) + maxchunk * 2]; 
     323        uint [] partial = scratchbuff[$ - maxchunk * 2 .. $]; 
    294324         
    295         // The first chunk is bigger, since it also needs to cover the leftover bits. 
    296         uint chunksize =  y.length + (x.length % y.length); 
    297         // We make the buffer a bit bigger so we have space for the partial sums. 
    298         uint [] scratchbuff = new uint[karatsubaRequiredBuffSize(y.length+ chunksize) + y.length * 2+1]; 
    299         if (y.length == half) { 
    300             chunksize = x.length - y.length; 
    301             mulKaratsuba(result[0 .. y.length + chunksize], y, x[0 .. chunksize], scratchbuff); 
    302         } else { 
    303             mulKaratsuba(result[0 .. y.length + chunksize], x[0 .. chunksize], y, scratchbuff); 
    304         } 
    305         uint done = chunksize; 
    306         uint [] partial = scratchbuff[$-y.length*2 .. $]; 
     325        uint done; // how much of X have we done so far? 
     326        double residual = 0; 
     327        if (paddingY) { 
     328            // If the first chunk is bigger, do it first. We're padding y.  
     329          mulKaratsuba(result[0 .. y.length + chunksize + (extra>0?1:0)], x[0 .. chunksize + (extra>0?1:0)], y, scratchbuff); 
     330          done = chunksize + (extra>0?1:0); 
     331          if (extra) --extra; 
     332        } else { // Begin with the extra bit. 
     333            mulKaratsuba(result[0 .. y.length + extra], y, x[0..extra], scratchbuff); 
     334            done = extra; 
     335            extra = 0; 
     336        } 
     337        auto basechunksize = chunksize; 
    307338        while (done < x.length) { 
    308             chunksize = (done + y.length <= x.length) ? y.length :  x.length - done; 
    309             mulKaratsuba(partial[0 .. y.length + chunksize], x[done .. done+chunksize], y, scratchbuff); 
     339            chunksize = basechunksize + (extra>0? 1:0); 
     340            if (extra) --extra; 
     341             mulKaratsuba(partial[0 .. y.length + chunksize],  
     342                       x[done .. done+chunksize], y, scratchbuff); 
    310343            result[done + y.length .. done + y.length + chunksize]  
    311344                = partial[y.length .. y.length + chunksize]; 
    312345            simpleAddAssign(result[done .. done + y.length+chunksize], partial[0 .. y.length]); 
    313             done += y.length
     346            done += chunksize
    314347        } 
    315348        delete scratchbuff; 
     
    351384in {     
    352385    assert(result.length == left.length + right.length); 
    353     assert(right.length>1); 
     386//    assert(right.length>1); 
    354387} 
    355388body { 
     
    378411} 
    379412 
     413uint subSimple(uint [] result, uint [] left, uint [] right) 
     414in { 
     415    assert(result.length == left.length); 
     416    assert(left.length >= right.length); 
     417    assert(right.length>0); 
     418} 
     419body { 
     420    uint carry = multibyteAddSub!('-')(result[0..right.length], 
     421            left[0..right.length], right, 0); 
     422    if (right.length < left.length) { 
     423        result[right.length..left.length] = left[right.length .. $];             
     424        carry = multibyteIncrementAssign!('-')(result[right.length..$], carry); 
     425    } //else if (result.length==left.length+1) { result[$-1] = carry; carry=0; } 
     426    return carry; 
     427} 
     428 
     429 
    380430/*  result must be larger than right. 
    381431*/ 
     
    411461} 
    412462 
    413 //uint lastleftover = 0; 
    414463/* Sets result = x*y, using Karatsuba multiplication. 
    415 * Valid only for balanced multiplies, and x must be longer than y. 
    416 * ie 2*y.length > x.length >= y.length
     464* x must be longer or equal to y. 
     465* Valid only for balanced multiplies, where x is not shorter than y
    417466* It is efficient only if sqrt(2)*y.length > x.length >= y.length 
    418467* Params: 
     
    425474    if (x.length <= KARATSUBALIMIT) { 
    426475        return mulSimple(result, x, y); 
    427     }     
     476    } 
     477    // Must be almost square. 
     478    assert(2 * y.length * y.length >= x.length * x.length, "Asymmetric Karatsuba"); 
     479         
    428480    // Karatsuba multiply uses the following result: 
    429481    // (Nx1 + x0)*(Ny1 + y0) = (N*N) x1y1 + x0y0 + N * mid 
    430482    // where mid = (x1+x0)*(y1+y0) - x1y1 - x0y0 
    431483    // requiring 3 multiplies of length N, instead of 4. 
     484     
     485    // TODO: Knuth's variant is superior: 
     486    // (Nx1 + x0)*(Ny1 + y0) = (N*N) x1y1 + x0y0 - N * mid 
     487    // where mid = (x0-x1)*(y0-y1) - x1y1 - x0y0 
     488    // since then the lengths cannot exceed length N. 
    432489 
    433490    // half length, round up. 
    434491    uint half = (x.length >> 1) + (x.length & 1); 
    435     assert(y.length>half, "Asymmetric Karatsuba"); 
    436492     
    437493    uint [] x0 = x[0 .. half]; 
     
    460516    mulKaratsuba(resultLow, x0, y0, newscratchbuff); 
    461517    mid.simpleSubAssign(resultLow); 
    462     mulKaratsuba(resultHigh, x1, y1, newscratchbuff); 
     518 
     519    if (2 * y1.length * y1.length < x1.length * x1.length) { 
     520        // an asymmetric situation has been created. 
     521        // Worst case is if x:y = 1.414 : 1, then x1:y1 = 2.41 : 1. 
     522        // Applying one schoolbook multiply gives us two pieces each 1.2:1 
     523        if (y1.length <= KARATSUBALIMIT) { 
     524            mulSimple(resultHigh, x1, y1); 
     525        } else { 
     526            // divide x1 in two, then use schoolbook multiply on the two pieces. 
     527            // Note that this will be smaller than y1, except when x1 = 2*y1 + 1. 
     528            uint quarter = (x1.length >> 1) + (x1.length & 1); 
     529            bool ysmaller = (quarter >= y1.length); 
     530            mulKaratsuba(resultHigh[0..quarter+y1.length], ysmaller?x1[0..quarter]: y1,  
     531                ysmaller?y1:x1[0..quarter], newscratchbuff); 
     532            // Save the part which will be overwritten. 
     533            newscratchbuff[0..y1.length] = resultHigh[quarter..quarter+y1.length]; 
     534            mulKaratsuba(resultHigh[quarter..$], ysmaller?x1[quarter..$]: y1,  
     535                ysmaller?y1:x1[quarter..$], newscratchbuff[y1.length..$]); 
     536            resultHigh[quarter..$].simpleAddAssign(newscratchbuff[0..y1.length]);                 
     537        } 
     538    } else mulKaratsuba(resultHigh, x1, y1, newscratchbuff); 
    463539    mid.simpleSubAssign(resultHigh); 
    464      
     540         
    465541    // result += MID 
    466542    result[half..$].simpleAddAssign(mid);