root/trunk/blade/Performance/Perform.d

Revision 177, 10.9 kB (checked in by Don Clugston, 4 years ago)

Added performance test using MSR registers; the device driver comes from Agner Fog. Only works on my system (path is hard-coded, only works on Windows, Pentium M and Pentium Core (not core2)).

  • Property svn:mime-type set to text/x-dsrc
  • Property svn:eol-style set to native
Line 
1 /**
2  * Performance tests
3  * Author: Don Clugston. Portions by Agner Fog.
4  * License: Modified BSD, (but note that it uses GPL-licensed modules).
5  *
6  * Determine the performance of a section of code, using the built-in
7  * performance counters of Pentium processors.
8  * The performance counters are set using a device driver obtained from Agner
9  * Fog's website (www.agner.org); his code is licenced under the GPL.
10  * Since, in most cases, only two performance counters can be used at once,
11  * the code is run multiple times with different performance counters enabled.
12  *
13  * REPEATABILITY
14  * It is popular to raise the priority of thread to real-time priority, to avoid
15  * interrupts, but this does not seem to be beneficial (Windows time slices are
16  * typically 20ms, even for low-priority threads). It seems better to attempt to
17  * detect corrupted measurements, and discard them. The technique used here is
18  * not infallible (fails once in every few thousand runs).
19  *
20  * TODO: Might be better to include OS time as well (just in case task switch occurs?)
21  */
22 module Blade.Performance.Perform;
23 import Blade.Performance.MSRDriver;
24 import std.stdio;
25 //import std.cpuid;
26 import win32.core;
27 import std.string;
28
29 scope class PerformanceTester
30 {
31 private:
32     CMSRDriver driver;
33 public:    
34     this() {
35         driver = new CMSRDriver;
36         driver.LoadDriver();
37         lockToSingleCPU();
38         //regsToMeasure = regsToMeasurePM;
39     }
40     ~this() {
41         driver.UnloadDriver();
42     }
43     struct AllPerfMeasurements
44     {
45         long timestamp;
46         int[38] perfCounter;
47     }
48
49 private:
50    
51  AllPerfMeasurements all;
52
53  // The results of reading the timestamp clock and the performance counters
54  struct PerfCounterMeasurement {
55     ulong timestamp;
56     uint[2] perfCounter;
57  };
58
59 // Serialize, then read time stamp clock and performance counters 0 and 1
60 static void readCounters(PerfCounterMeasurement *dest) {
61    asm {
62        naked;
63       // ENTRY: dest is in EAX (D calling convention).
64       // NOTE: The rdpmc instruction corrupts EAX,ECX and EDX.
65       // cpuid corrupts EAX,EBX, ECX, EDX
66       // rdtsc corrupts EAX, EDX.
67       push EBX;
68       push EAX;
69       xor EAX, EAX;
70       cpuid; // Serialize. Corrupts EAX, EBX, ECX, EDX
71       rdtsc;
72       pop EBX;   // EBX = dest
73       mov [EBX+0], EAX;  // time low
74       mov [EBX+4], EDX;  // time high
75       mov ECX, 0;
76       rdpmc;
77       mov [EBX+8], EAX;  // performance counter 0
78       mov ECX, 1;
79       rdpmc;
80       mov [EBX+12], EAX; // performance counter 1
81       xor EAX, EAX;
82       cpuid; // serialize again
83       pop EBX;
84       ret;
85    }
86  }
87  // low 16 bits is event number, bits 24..17 are unit mask.
88  const uint[] regsToMeasure = [0xC0, 0xD0, 0xC2, 0x03,
89                             0xA2, 0x04, 0xD2, 0x05,
90                             0x43, 0x46, 0x47, 0x48,
91                             0x86, 0x81, 0x87, 0x85,
92                             0xF28, 0xF2E, 0xF29, 0xF2A,
93                             0xF0,
94                             0x24, 0x26,
95                             0xC4, 0xC9, 0xC5, 0xCA, 0xE2, 0xE6,
96                             0xC8, 0x06,
97                             0x93, 0x90, 0xDA, 0xA0 // PM only
98                             , 0x79, 0x58, 0x1F29
99                             ];
100  const char [][] shortDesc = [
101     "Instrct", "InDecod", "Uops(F)", "LdBlks",
102     "ResStll", "Drains",  "PartRAT", "Misalgn",
103     "DataRef", "DtLnsIn", "DtLnOut", "DtMis",
104     "IFtStal", "IFtcMis", "ILdStal", "ITlbMis",
105     "L2IFtch", "L2Req",   "L2Ld",    "L2St",
106     "PrefUp",
107     "L2In",    "L2Out",
108     "Brnches", "BrTaken", "BrMisp",  "BrTkMis", "BTBMiss", "BAClrs",
109     "HwIntru", "SegRegL",
110     "CallMis", "RetMisp", "FusdUop", "UopsUF",
111     "Unhaltd", "SpdStep", "L2LdHw" // hardware prefetch
112     ];
113        
114     // Notes:
115     // 0xA0 is "unfused microoperations submitted to execution units (Undocumented counter!)"(Agner Fog).
116     // SbDrains gives time lost to CPUID, interrupts, etc
117    
118 //    uint[] regsToMeasure;
119 public:
120  void DoPerformanceTest(void delegate() testfunc, void delegate() initfunc, void delegate() nulltestfunc=null)
121  {
122     void nulltest() { }
123     if (nulltestfunc is null) nulltestfunc = &nulltest;
124    
125     SMSRInOut[4] startcmd;
126     SMSRInOut[4] stopcmd;   
127    
128     const int PerfEvtSel0 = 0x186;
129     const int PerfEvtSel1 = 0x187;
130     const int PerfCtr0 = 0xC1;
131     const int PerfCtr1 = 0xC2;
132
133     startcmd[0] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfEvtSel0, 0);
134     startcmd[1] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfCtr0, 0L);
135     startcmd[2] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfEvtSel1, 0);
136     startcmd[3] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfCtr1, 0L);
137    
138     stopcmd[0] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfEvtSel0, 0L);
139     stopcmd[1] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfCtr0, 0L);
140     stopcmd[2] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfEvtSel1, 0L);
141     stopcmd[3] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfCtr1, 0L);
142    
143     PerfCounterMeasurement[4][10] results;   
144     int standard_time_stamp = int.max;
145     int standard_null_time_stamp = int.max;
146
147     for (int countersetnum=0; countersetnum<regsToMeasure.length; countersetnum+=2) {
148         // Bit 22 on PervEvtSel0 = enable counters.
149         // Bit 16 = count non-OS events. Bit 17 = count OS events.
150         // bit 18 = edge detect. Bit 24-31 = counter mask (only count multiple events per cycle).
151         startcmd[0].value = regsToMeasure[countersetnum]   | 0x0041_0000;
152         startcmd[2].value = regsToMeasure[countersetnum+1] | 0x0001_0000;
153
154         PerfCounterMeasurement minMeas; // smallest value for the test code
155         PerfCounterMeasurement secondMeas; // candiate for minMeas
156         PerfCounterMeasurement minNullMeas; // smallest value for the null code
157         PerfCounterMeasurement secondNullMeas; // candiate for minNullMeas
158         // The lowest value is usually, but not always, the best.
159         // Occasionally, a measurement gives a spuriously low value. We guard against
160         // this by choosing the smallest measurement which occurs more than once.
161         int countMinTime = 0; // The number of measurements which have achieved the min time.
162         int secondMinTime = 0;
163         int countNullTime = 0;
164         int secondNullTime = 0;
165    
166         do {
167             driver.AccessRegisters(startcmd);   
168             for (int i=0; i<8;++i) {
169                 initfunc();
170                 readCounters(&results[i][0]);
171                 testfunc();
172                 readCounters(&results[i][1]);
173                 initfunc();
174                 readCounters(&results[i][2]);
175                 nulltestfunc();
176                 readCounters(&results[i][3]);
177             }
178             driver.AccessRegisters(stopcmd);       
179             minMeas.timestamp = uint.max;
180             minNullMeas.timestamp = uint.max;
181             countMinTime = 0;
182             countNullTime = 0;
183             secondMinTime = 0;
184             secondNullTime = 0;
185             for (int j=0; j<2;++j) {
186                 minMeas.perfCounter[j]=uint.max;
187                 secondMeas.perfCounter[j]=uint.max;
188                 minNullMeas.perfCounter[j]=uint.max;
189                 secondNullMeas.perfCounter[j]=uint.max;
190             }
191             // Start from end, since final measurements are usually most reliable.
192             for (int i=7; i>=0; --i) {
193                 uint u = (results[i][1].timestamp-results[i][0].timestamp);
194                 if (u==minMeas.timestamp) ++countMinTime;
195                 if (u==secondMeas.timestamp) ++secondMinTime;
196                 else if (u < minMeas.timestamp) {
197                     if (countMinTime>0) {
198                         secondMinTime = countMinTime;
199                         secondMeas  = minMeas;
200                     }
201                     minMeas.timestamp=u;
202                     countMinTime=1;
203                 } else if (u < secondMeas.timestamp) {
204                     secondMeas.timestamp = u; secondMinTime=1;
205                 }
206                 u = (results[i][3].timestamp-results[i][2].timestamp);             
207                 if (u==minNullMeas.timestamp) ++countNullTime;
208                 if (u==secondNullMeas.timestamp) ++secondNullTime;
209                 if (u < minNullMeas.timestamp) {
210                     if (countNullTime>0) {
211                         secondNullTime = countNullTime;
212                         secondNullMeas = minNullMeas;
213                     }
214                     minNullMeas.timestamp=u;
215                     countNullTime = 1;
216                 } else if (u<secondNullMeas.timestamp) {
217                     secondNullMeas.timestamp = u; secondNullTime=1;
218                 }
219                 for (int j=0; j<2; ++j) {
220                     u = results[i][1].perfCounter[j]-results[i][0].perfCounter[j];
221                     if (u<minMeas.perfCounter[j]) minMeas.perfCounter[j]=u;
222                     u = results[i][3].perfCounter[j]-results[i][2].perfCounter[j];
223                     if (u<minNullMeas.perfCounter[j]) minNullMeas.perfCounter[j]=u;
224                 }
225             }
226          //if (countMinTime==0 && secondMinTime==0), we have a totally random result
227          
228          // The mintime must have occured at least twice, and there must not be two candidates.
229         } while ((countMinTime<3 && secondMinTime<3) || (countMinTime>2 && secondMinTime>2) ||(countNullTime<3 && secondNullTime<3));
230        
231         if (countMinTime<3 && secondMinTime>=3) {
232             minMeas = secondMeas;
233         }
234         if (countNullTime<3 && secondNullTime>=3) {
235             minNullMeas = secondNullMeas;
236         }
237         bool unusual = false;
238    
239         if (standard_time_stamp==int.max) {
240             standard_time_stamp = minMeas.timestamp;
241             standard_null_time_stamp = minNullMeas.timestamp;
242             writefln("Standard=(%d, %d)",standard_time_stamp, standard_null_time_stamp);
243         } else {
244             if (standard_time_stamp != minMeas.timestamp || standard_null_time_stamp !=minNullMeas.timestamp) {
245                 writefln("Standard=(%d,%d), this=(%d,%d) second=(%d %d)", standard_time_stamp, standard_null_time_stamp, minMeas.timestamp,minNullMeas.timestamp, secondMeas.timestamp,secondNullMeas.timestamp);
246                 unusual = true;
247             }
248         }
249         all.timestamp = standard_time_stamp-standard_null_time_stamp; //minMeas.timestamp - minNullMeas.timestamp;
250         all.perfCounter[countersetnum] = minMeas.perfCounter[0]-minNullMeas.perfCounter[0];
251         all.perfCounter[countersetnum+1] = minMeas.perfCounter[1]-minNullMeas.perfCounter[1];
252    
253         if (unusual) {       
254             writefln("Different timestamp: %s %s %d %d ####################", shortDesc[countersetnum*2], shortDesc[countersetnum], all.timestamp, minMeas.timestamp - minNullMeas.timestamp);
255             for (int i=0; i<8; ++i) {
256                     writefln("%d --> %d,%d", i, results[i][1].timestamp-results[i][0].timestamp, results[i][3].timestamp-results[i][2].timestamp);
257             }
258         }   
259     }
260  }
261
262  void describe()
263  {
264     writefln("ALL RESULTS");
265     char [] heading = "Ticks" ~ \t;
266     char [] vals = std.string.toString(all.timestamp) ~ \t;
267     int kol=1;
268     for(int i=0; i<shortDesc.length; ++i) {
269         heading ~= shortDesc[i]~ \t;
270         vals ~= std.string.toString(all.perfCounter[i])~\t;
271         ++kol;
272         if (kol==8 || i==shortDesc.length-1) {
273             writefln(heading); writefln(vals); heading=""; vals=""; kol=0;
274         }
275     }
276  }
277 }
Note: See TracBrowser for help on using the browser.