| 1 |
/** |
|---|
| 2 |
* Performance tests |
|---|
| 3 |
* Author: Don Clugston. Portions by Agner Fog. |
|---|
| 4 |
* License: Modified BSD, (but note that it uses GPL-licensed modules). |
|---|
| 5 |
* |
|---|
| 6 |
* Determine the performance of a section of code, using the built-in |
|---|
| 7 |
* performance counters of Pentium processors. |
|---|
| 8 |
* The performance counters are set using a device driver obtained from Agner |
|---|
| 9 |
* Fog's website (www.agner.org); his code is licenced under the GPL. |
|---|
| 10 |
* Since, in most cases, only two performance counters can be used at once, |
|---|
| 11 |
* the code is run multiple times with different performance counters enabled. |
|---|
| 12 |
* |
|---|
| 13 |
* REPEATABILITY |
|---|
| 14 |
* It is popular to raise the priority of thread to real-time priority, to avoid |
|---|
| 15 |
* interrupts, but this does not seem to be beneficial (Windows time slices are |
|---|
| 16 |
* typically 20ms, even for low-priority threads). It seems better to attempt to |
|---|
| 17 |
* detect corrupted measurements, and discard them. The technique used here is |
|---|
| 18 |
* not infallible (fails once in every few thousand runs). |
|---|
| 19 |
* |
|---|
| 20 |
* TODO: Might be better to include OS time as well (just in case task switch occurs?) |
|---|
| 21 |
*/ |
|---|
| 22 |
module Blade.Performance.Perform; |
|---|
| 23 |
import Blade.Performance.MSRDriver; |
|---|
| 24 |
import std.stdio; |
|---|
| 25 |
//import std.cpuid; |
|---|
| 26 |
import win32.core; |
|---|
| 27 |
import std.string; |
|---|
| 28 |
|
|---|
| 29 |
scope class PerformanceTester |
|---|
| 30 |
{ |
|---|
| 31 |
private: |
|---|
| 32 |
CMSRDriver driver; |
|---|
| 33 |
public: |
|---|
| 34 |
this() { |
|---|
| 35 |
driver = new CMSRDriver; |
|---|
| 36 |
driver.LoadDriver(); |
|---|
| 37 |
lockToSingleCPU(); |
|---|
| 38 |
//regsToMeasure = regsToMeasurePM; |
|---|
| 39 |
} |
|---|
| 40 |
~this() { |
|---|
| 41 |
driver.UnloadDriver(); |
|---|
| 42 |
} |
|---|
| 43 |
struct AllPerfMeasurements |
|---|
| 44 |
{ |
|---|
| 45 |
long timestamp; |
|---|
| 46 |
int[38] perfCounter; |
|---|
| 47 |
} |
|---|
| 48 |
|
|---|
| 49 |
private: |
|---|
| 50 |
|
|---|
| 51 |
AllPerfMeasurements all; |
|---|
| 52 |
|
|---|
| 53 |
// The results of reading the timestamp clock and the performance counters |
|---|
| 54 |
struct PerfCounterMeasurement { |
|---|
| 55 |
ulong timestamp; |
|---|
| 56 |
uint[2] perfCounter; |
|---|
| 57 |
}; |
|---|
| 58 |
|
|---|
| 59 |
// Serialize, then read time stamp clock and performance counters 0 and 1 |
|---|
| 60 |
static void readCounters(PerfCounterMeasurement *dest) { |
|---|
| 61 |
asm { |
|---|
| 62 |
naked; |
|---|
| 63 |
// ENTRY: dest is in EAX (D calling convention). |
|---|
| 64 |
// NOTE: The rdpmc instruction corrupts EAX,ECX and EDX. |
|---|
| 65 |
// cpuid corrupts EAX,EBX, ECX, EDX |
|---|
| 66 |
// rdtsc corrupts EAX, EDX. |
|---|
| 67 |
push EBX; |
|---|
| 68 |
push EAX; |
|---|
| 69 |
xor EAX, EAX; |
|---|
| 70 |
cpuid; // Serialize. Corrupts EAX, EBX, ECX, EDX |
|---|
| 71 |
rdtsc; |
|---|
| 72 |
pop EBX; // EBX = dest |
|---|
| 73 |
mov [EBX+0], EAX; // time low |
|---|
| 74 |
mov [EBX+4], EDX; // time high |
|---|
| 75 |
mov ECX, 0; |
|---|
| 76 |
rdpmc; |
|---|
| 77 |
mov [EBX+8], EAX; // performance counter 0 |
|---|
| 78 |
mov ECX, 1; |
|---|
| 79 |
rdpmc; |
|---|
| 80 |
mov [EBX+12], EAX; // performance counter 1 |
|---|
| 81 |
xor EAX, EAX; |
|---|
| 82 |
cpuid; // serialize again |
|---|
| 83 |
pop EBX; |
|---|
| 84 |
ret; |
|---|
| 85 |
} |
|---|
| 86 |
} |
|---|
| 87 |
// low 16 bits is event number, bits 24..17 are unit mask. |
|---|
| 88 |
const uint[] regsToMeasure = [0xC0, 0xD0, 0xC2, 0x03, |
|---|
| 89 |
0xA2, 0x04, 0xD2, 0x05, |
|---|
| 90 |
0x43, 0x46, 0x47, 0x48, |
|---|
| 91 |
0x86, 0x81, 0x87, 0x85, |
|---|
| 92 |
0xF28, 0xF2E, 0xF29, 0xF2A, |
|---|
| 93 |
0xF0, |
|---|
| 94 |
0x24, 0x26, |
|---|
| 95 |
0xC4, 0xC9, 0xC5, 0xCA, 0xE2, 0xE6, |
|---|
| 96 |
0xC8, 0x06, |
|---|
| 97 |
0x93, 0x90, 0xDA, 0xA0 // PM only |
|---|
| 98 |
, 0x79, 0x58, 0x1F29 |
|---|
| 99 |
]; |
|---|
| 100 |
const char [][] shortDesc = [ |
|---|
| 101 |
"Instrct", "InDecod", "Uops(F)", "LdBlks", |
|---|
| 102 |
"ResStll", "Drains", "PartRAT", "Misalgn", |
|---|
| 103 |
"DataRef", "DtLnsIn", "DtLnOut", "DtMis", |
|---|
| 104 |
"IFtStal", "IFtcMis", "ILdStal", "ITlbMis", |
|---|
| 105 |
"L2IFtch", "L2Req", "L2Ld", "L2St", |
|---|
| 106 |
"PrefUp", |
|---|
| 107 |
"L2In", "L2Out", |
|---|
| 108 |
"Brnches", "BrTaken", "BrMisp", "BrTkMis", "BTBMiss", "BAClrs", |
|---|
| 109 |
"HwIntru", "SegRegL", |
|---|
| 110 |
"CallMis", "RetMisp", "FusdUop", "UopsUF", |
|---|
| 111 |
"Unhaltd", "SpdStep", "L2LdHw" // hardware prefetch |
|---|
| 112 |
]; |
|---|
| 113 |
|
|---|
| 114 |
// Notes: |
|---|
| 115 |
// 0xA0 is "unfused microoperations submitted to execution units (Undocumented counter!)"(Agner Fog). |
|---|
| 116 |
// SbDrains gives time lost to CPUID, interrupts, etc |
|---|
| 117 |
|
|---|
| 118 |
// uint[] regsToMeasure; |
|---|
| 119 |
public: |
|---|
| 120 |
void DoPerformanceTest(void delegate() testfunc, void delegate() initfunc, void delegate() nulltestfunc=null) |
|---|
| 121 |
{ |
|---|
| 122 |
void nulltest() { } |
|---|
| 123 |
if (nulltestfunc is null) nulltestfunc = &nulltest; |
|---|
| 124 |
|
|---|
| 125 |
SMSRInOut[4] startcmd; |
|---|
| 126 |
SMSRInOut[4] stopcmd; |
|---|
| 127 |
|
|---|
| 128 |
const int PerfEvtSel0 = 0x186; |
|---|
| 129 |
const int PerfEvtSel1 = 0x187; |
|---|
| 130 |
const int PerfCtr0 = 0xC1; |
|---|
| 131 |
const int PerfCtr1 = 0xC2; |
|---|
| 132 |
|
|---|
| 133 |
startcmd[0] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfEvtSel0, 0); |
|---|
| 134 |
startcmd[1] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfCtr0, 0L); |
|---|
| 135 |
startcmd[2] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfEvtSel1, 0); |
|---|
| 136 |
startcmd[3] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfCtr1, 0L); |
|---|
| 137 |
|
|---|
| 138 |
stopcmd[0] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfEvtSel0, 0L); |
|---|
| 139 |
stopcmd[1] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfCtr0, 0L); |
|---|
| 140 |
stopcmd[2] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfEvtSel1, 0L); |
|---|
| 141 |
stopcmd[3] = SMSRInOut(EMSR_COMMAND.MSR_WRITE, PerfCtr1, 0L); |
|---|
| 142 |
|
|---|
| 143 |
PerfCounterMeasurement[4][10] results; |
|---|
| 144 |
int standard_time_stamp = int.max; |
|---|
| 145 |
int standard_null_time_stamp = int.max; |
|---|
| 146 |
|
|---|
| 147 |
for (int countersetnum=0; countersetnum<regsToMeasure.length; countersetnum+=2) { |
|---|
| 148 |
// Bit 22 on PervEvtSel0 = enable counters. |
|---|
| 149 |
// Bit 16 = count non-OS events. Bit 17 = count OS events. |
|---|
| 150 |
// bit 18 = edge detect. Bit 24-31 = counter mask (only count multiple events per cycle). |
|---|
| 151 |
startcmd[0].value = regsToMeasure[countersetnum] | 0x0041_0000; |
|---|
| 152 |
startcmd[2].value = regsToMeasure[countersetnum+1] | 0x0001_0000; |
|---|
| 153 |
|
|---|
| 154 |
PerfCounterMeasurement minMeas; // smallest value for the test code |
|---|
| 155 |
PerfCounterMeasurement secondMeas; // candiate for minMeas |
|---|
| 156 |
PerfCounterMeasurement minNullMeas; // smallest value for the null code |
|---|
| 157 |
PerfCounterMeasurement secondNullMeas; // candiate for minNullMeas |
|---|
| 158 |
// The lowest value is usually, but not always, the best. |
|---|
| 159 |
// Occasionally, a measurement gives a spuriously low value. We guard against |
|---|
| 160 |
// this by choosing the smallest measurement which occurs more than once. |
|---|
| 161 |
int countMinTime = 0; // The number of measurements which have achieved the min time. |
|---|
| 162 |
int secondMinTime = 0; |
|---|
| 163 |
int countNullTime = 0; |
|---|
| 164 |
int secondNullTime = 0; |
|---|
| 165 |
|
|---|
| 166 |
do { |
|---|
| 167 |
driver.AccessRegisters(startcmd); |
|---|
| 168 |
for (int i=0; i<8;++i) { |
|---|
| 169 |
initfunc(); |
|---|
| 170 |
readCounters(&results[i][0]); |
|---|
| 171 |
testfunc(); |
|---|
| 172 |
readCounters(&results[i][1]); |
|---|
| 173 |
initfunc(); |
|---|
| 174 |
readCounters(&results[i][2]); |
|---|
| 175 |
nulltestfunc(); |
|---|
| 176 |
readCounters(&results[i][3]); |
|---|
| 177 |
} |
|---|
| 178 |
driver.AccessRegisters(stopcmd); |
|---|
| 179 |
minMeas.timestamp = uint.max; |
|---|
| 180 |
minNullMeas.timestamp = uint.max; |
|---|
| 181 |
countMinTime = 0; |
|---|
| 182 |
countNullTime = 0; |
|---|
| 183 |
secondMinTime = 0; |
|---|
| 184 |
secondNullTime = 0; |
|---|
| 185 |
for (int j=0; j<2;++j) { |
|---|
| 186 |
minMeas.perfCounter[j]=uint.max; |
|---|
| 187 |
secondMeas.perfCounter[j]=uint.max; |
|---|
| 188 |
minNullMeas.perfCounter[j]=uint.max; |
|---|
| 189 |
secondNullMeas.perfCounter[j]=uint.max; |
|---|
| 190 |
} |
|---|
| 191 |
// Start from end, since final measurements are usually most reliable. |
|---|
| 192 |
for (int i=7; i>=0; --i) { |
|---|
| 193 |
uint u = (results[i][1].timestamp-results[i][0].timestamp); |
|---|
| 194 |
if (u==minMeas.timestamp) ++countMinTime; |
|---|
| 195 |
if (u==secondMeas.timestamp) ++secondMinTime; |
|---|
| 196 |
else if (u < minMeas.timestamp) { |
|---|
| 197 |
if (countMinTime>0) { |
|---|
| 198 |
secondMinTime = countMinTime; |
|---|
| 199 |
secondMeas = minMeas; |
|---|
| 200 |
} |
|---|
| 201 |
minMeas.timestamp=u; |
|---|
| 202 |
countMinTime=1; |
|---|
| 203 |
} else if (u < secondMeas.timestamp) { |
|---|
| 204 |
secondMeas.timestamp = u; secondMinTime=1; |
|---|
| 205 |
} |
|---|
| 206 |
u = (results[i][3].timestamp-results[i][2].timestamp); |
|---|
| 207 |
if (u==minNullMeas.timestamp) ++countNullTime; |
|---|
| 208 |
if (u==secondNullMeas.timestamp) ++secondNullTime; |
|---|
| 209 |
if (u < minNullMeas.timestamp) { |
|---|
| 210 |
if (countNullTime>0) { |
|---|
| 211 |
secondNullTime = countNullTime; |
|---|
| 212 |
secondNullMeas = minNullMeas; |
|---|
| 213 |
} |
|---|
| 214 |
minNullMeas.timestamp=u; |
|---|
| 215 |
countNullTime = 1; |
|---|
| 216 |
} else if (u<secondNullMeas.timestamp) { |
|---|
| 217 |
secondNullMeas.timestamp = u; secondNullTime=1; |
|---|
| 218 |
} |
|---|
| 219 |
for (int j=0; j<2; ++j) { |
|---|
| 220 |
u = results[i][1].perfCounter[j]-results[i][0].perfCounter[j]; |
|---|
| 221 |
if (u<minMeas.perfCounter[j]) minMeas.perfCounter[j]=u; |
|---|
| 222 |
u = results[i][3].perfCounter[j]-results[i][2].perfCounter[j]; |
|---|
| 223 |
if (u<minNullMeas.perfCounter[j]) minNullMeas.perfCounter[j]=u; |
|---|
| 224 |
} |
|---|
| 225 |
} |
|---|
| 226 |
//if (countMinTime==0 && secondMinTime==0), we have a totally random result |
|---|
| 227 |
|
|---|
| 228 |
// The mintime must have occured at least twice, and there must not be two candidates. |
|---|
| 229 |
} while ((countMinTime<3 && secondMinTime<3) || (countMinTime>2 && secondMinTime>2) ||(countNullTime<3 && secondNullTime<3)); |
|---|
| 230 |
|
|---|
| 231 |
if (countMinTime<3 && secondMinTime>=3) { |
|---|
| 232 |
minMeas = secondMeas; |
|---|
| 233 |
} |
|---|
| 234 |
if (countNullTime<3 && secondNullTime>=3) { |
|---|
| 235 |
minNullMeas = secondNullMeas; |
|---|
| 236 |
} |
|---|
| 237 |
bool unusual = false; |
|---|
| 238 |
|
|---|
| 239 |
if (standard_time_stamp==int.max) { |
|---|
| 240 |
standard_time_stamp = minMeas.timestamp; |
|---|
| 241 |
standard_null_time_stamp = minNullMeas.timestamp; |
|---|
| 242 |
writefln("Standard=(%d, %d)",standard_time_stamp, standard_null_time_stamp); |
|---|
| 243 |
} else { |
|---|
| 244 |
if (standard_time_stamp != minMeas.timestamp || standard_null_time_stamp !=minNullMeas.timestamp) { |
|---|
| 245 |
writefln("Standard=(%d,%d), this=(%d,%d) second=(%d %d)", standard_time_stamp, standard_null_time_stamp, minMeas.timestamp,minNullMeas.timestamp, secondMeas.timestamp,secondNullMeas.timestamp); |
|---|
| 246 |
unusual = true; |
|---|
| 247 |
} |
|---|
| 248 |
} |
|---|
| 249 |
all.timestamp = standard_time_stamp-standard_null_time_stamp; //minMeas.timestamp - minNullMeas.timestamp; |
|---|
| 250 |
all.perfCounter[countersetnum] = minMeas.perfCounter[0]-minNullMeas.perfCounter[0]; |
|---|
| 251 |
all.perfCounter[countersetnum+1] = minMeas.perfCounter[1]-minNullMeas.perfCounter[1]; |
|---|
| 252 |
|
|---|
| 253 |
if (unusual) { |
|---|
| 254 |
writefln("Different timestamp: %s %s %d %d ####################", shortDesc[countersetnum*2], shortDesc[countersetnum], all.timestamp, minMeas.timestamp - minNullMeas.timestamp); |
|---|
| 255 |
for (int i=0; i<8; ++i) { |
|---|
| 256 |
writefln("%d --> %d,%d", i, results[i][1].timestamp-results[i][0].timestamp, results[i][3].timestamp-results[i][2].timestamp); |
|---|
| 257 |
} |
|---|
| 258 |
} |
|---|
| 259 |
} |
|---|
| 260 |
} |
|---|
| 261 |
|
|---|
| 262 |
void describe() |
|---|
| 263 |
{ |
|---|
| 264 |
writefln("ALL RESULTS"); |
|---|
| 265 |
char [] heading = "Ticks" ~ \t; |
|---|
| 266 |
char [] vals = std.string.toString(all.timestamp) ~ \t; |
|---|
| 267 |
int kol=1; |
|---|
| 268 |
for(int i=0; i<shortDesc.length; ++i) { |
|---|
| 269 |
heading ~= shortDesc[i]~ \t; |
|---|
| 270 |
vals ~= std.string.toString(all.perfCounter[i])~\t; |
|---|
| 271 |
++kol; |
|---|
| 272 |
if (kol==8 || i==shortDesc.length-1) { |
|---|
| 273 |
writefln(heading); writefln(vals); heading=""; vals=""; kol=0; |
|---|
| 274 |
} |
|---|
| 275 |
} |
|---|
| 276 |
} |
|---|
| 277 |
} |
|---|