Download Reference Manual
The Developer's Library for D
About Wiki Forums Source Search Contact

Changeset 3945

Show
Ignore:
Timestamp:
09/29/08 14:43:38 (2 months ago)
Author:
Don Clugston
Message:

Bigint: Simple asm performance tests. Preparation for using cpuid cacheinfo.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/tango/math/internal/BignumX86.d

    r3911 r3945  
    3030 * 
    3131 *  Timing results (cycles per int) 
    32  *         PentiumM Core2 
    33  *  +,-      2.25   2.25 
    34  *  <<,>>    2.0    2.0 
    35  *  cmp      2.0    2.0 
    36  *  *        5.0 
    37  *  mulAdd   5.4 
    38  *  div     18.0 
     32 *             PentiumM Core2 
     33 *  +,-         2.25   2.25 
     34 *  <<,>>       2.0    2.0 
     35 *  *           5.0 
     36 *  mulAdd      5.4 
     37 *  div        18.0 
     38 *  mulAcc(32)  6.3 
     39 * 
     40 * mulAcc(32) is multibyteMultiplyAccumulate() on two 32-int operands (a 32*32 multiply), 
     41 * so its figure includes function call overhead. 
     42 * The timing for Div is quite unpredictable. 
    3943 */ 
    4044 
     
    605609        mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr 
    606610        mov EBX, [ESP + LASTPARAM + 4*2]; // left.length 
    607         align 16; 
    608         nop; 
    609         nop; 
    610          
    611611        mov ESI, [ESP + LASTPARAM + 4*3];  // left.ptr 
    612612        lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass 
     
    680680        adc EBP, 0; 
    681681        mov [-4+EDI+4*EBX], EBP; 
    682          
    683682        add EDI, 4; 
    684683        cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]? 
     
    837836} 
    838837 
     838version(TangoPerformanceTest) { 
     839import tango.stdc.stdio; 
     840int clock() { asm { rdtsc; } } 
     841 
     842uint [2000] X1; 
     843uint [2000] Y1; 
     844uint [4000] Z1; 
     845 
     846void testPerformance() 
     847{ 
     848    // The performance results at the top of this file were obtained using 
     849    // a Windows device driver to access the CPU performance counters. 
     850    // The code below is less accurate but more widely usable. 
     851    // The value for division is quite inconsistent. 
     852    for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; } 
     853    int t, t0;     
     854    multibyteShr(Z1[0..2000], X1[0..2000], 7); 
     855    t0 = clock(); 
     856    multibyteShr(Z1[0..1000], X1[0..1000], 7); 
     857    t = clock(); 
     858    multibyteShr(Z1[0..2000], X1[0..2000], 7); 
     859    auto shrtime = (clock() - t) - (t - t0); 
     860    t0 = clock(); 
     861    multibyteAddSub!('+')(Z1[0..1000], X1[0..1000], Y1[0..1000], 0); 
     862    t = clock(); 
     863    multibyteAddSub!('+')(Z1[0..2000], X1[0..2000], Y1[0..2000], 0); 
     864    auto addtime = (clock() - t) - (t-t0); 
     865    t0 = clock(); 
     866    multibyteMul(Z1[0..1000], X1[0..1000], 7, 0); 
     867    t = clock(); 
     868    multibyteMul(Z1[0..2000], X1[0..2000], 7, 0); 
     869    auto multime = (clock() - t) - (t - t0); 
     870    multibyteMulAdd!('+')(Z1[0..2000], X1[0..2000], 217, 0); 
     871    t0 = clock(); 
     872    multibyteMulAdd!('+')(Z1[0..1000], X1[0..1000], 217, 0); 
     873    t = clock(); 
     874    multibyteMulAdd!('+')(Z1[0..2000], X1[0..2000], 217, 0); 
     875    auto muladdtime = (clock() - t) - (t - t0);         
     876    multibyteMultiplyAccumulate(Z1[0..64], X1[0..32], Y1[0..32]); 
     877    t = clock(); 
     878    multibyteMultiplyAccumulate(Z1[0..64], X1[0..32], Y1[0..32]); 
     879    auto accumtime = clock() - t; 
     880    t0 = clock(); 
     881    multibyteDivAssign(Z1[0..2000], 217, 0); 
     882    t = clock(); 
     883    multibyteDivAssign(Z1[0..1000], 37, 0); 
     884    auto divtime = (t - t0) - (clock() - t); 
     885     
     886    printf("-- BigInt asm performance (cycles/int) --\n");     
     887    printf("Add:        %.2f\n", addtime/1000.0); 
     888    printf("Shr:        %.2f\n", shrtime/1000.0); 
     889    printf("Mul:        %.2f\n", multime/1000.0); 
     890    printf("MulAdd:     %.2f\n", muladdtime/1000.0); 
     891    printf("Div:        %.2f\n", divtime/1000.0); 
     892    printf("MulAccum32: %.2f*n*n (total %d)\n\n", accumtime/(32.0*32.0), accumtime); 
     893} 
     894 
     895static this() 
     896{ 
     897    testPerformance(); 
     898} 
     899} 
     900 
     901 
    839902} // version(D_InlineAsm_X86) 
  • trunk/tango/math/internal/BiguintCore.d

    r3939 r3945  
    2121} 
    2222 
     23// private import tango.core.Cpuid; 
     24static this() 
     25{ 
     26    CACHELIMIT = 8000; // tango.core.Cpuid.datacache[0].size/2; 
     27} 
     28 
    2329private: 
     30// Limits for when to switch between multiplication algorithms. 
     31const int CACHELIMIT;   // Half the size of the data cache. 
     32 
    2433const uint [] ZERO = [0]; 
    2534const uint [] ONE = [1]; 
     
    764773} 
    765774 
    766 // Limits for when to switch between multiplication algorithms. 
    767 const int CACHELIMIT = 8000;   // Half the size of the data cache. 
    768775 
    769776/* Determine how much space is required for the temporaries