 |
|
File Cpuid.patch, 47.4 kB
(added by Wazar, 2 years ago)
|
|
-
/root/buff/trunk/tango/core/tools/Cpuid.d
| old |
new |
|
| 36 | 36 | |
|---|
| 37 | 37 | module tango.core.tools.Cpuid; |
|---|
| 38 | 38 | |
|---|
| | 39 | import tango.stdc.string: memcmp; |
|---|
| 39 | 40 | // If optimizing for a particular processor, it is generally better |
|---|
| 40 | 41 | // to identify based on features rather than model. NOTE: Normally |
|---|
| 41 | 42 | // it's only worthwhile to optimise for the latest Intel and AMD CPU, |
| … | … | |
| 58 | 59 | // Cyrix 6x86 -- preferPentium1() |
|---|
| 59 | 60 | // 6x86MX -- + mmx() |
|---|
| 60 | 61 | |
|---|
| 61 | | public: |
|---|
| 62 | | |
|---|
| 63 | | /// Cache size and behaviour |
|---|
| 64 | | struct CacheInfo |
|---|
| 65 | | { |
|---|
| 66 | | /// Size of the cache, in kilobytes, per CPU. |
|---|
| 67 | | /// For L1 unified (data + code) caches, this size is half the physical size. |
|---|
| 68 | | /// (we don't halve it for larger sizes, since normally |
|---|
| 69 | | /// data size is much greater than code size for critical loops). |
|---|
| 70 | | uint size; |
|---|
| 71 | | /// Number of ways of associativity, eg: |
|---|
| 72 | | /// 1 = direct mapped |
|---|
| 73 | | /// 2 = 2-way set associative |
|---|
| 74 | | /// 3 = 3-way set associative |
|---|
| 75 | | /// ubyte.max = fully associative |
|---|
| 76 | | ubyte associativity; |
|---|
| 77 | | /// Number of bytes read into the cache when a cache miss occurs. |
|---|
| 78 | | uint lineSize; |
|---|
| 79 | | } |
|---|
| 80 | | |
|---|
| 81 | | public: |
|---|
| 82 | | /// Returns vendor string, for display purposes only. |
|---|
| 83 | | /// Do NOT use this to determine features! |
|---|
| 84 | | /// Note that some CPUs have programmable vendorIDs. |
|---|
| 85 | | char[] vendor() {return vendorID;} |
|---|
| 86 | | /// Returns processor string, for display purposes only |
|---|
| 87 | | char[] processor() {return processorName;} |
|---|
| 88 | | |
|---|
| 89 | | /// The data caches. If there are fewer than 5 physical cache levels, |
|---|
| 90 | | /// the remaining levels are set to uint.max (== entire memory space) |
|---|
| 91 | | CacheInfo[5] datacache; |
|---|
| 92 | | /// Does it have an x87 FPU on-chip? |
|---|
| 93 | | bool x87onChip() {return (features&FPU_BIT)!=0;} |
|---|
| 94 | | /// Is MMX supported? |
|---|
| 95 | | bool mmx() {return (features&MMX_BIT)!=0;} |
|---|
| 96 | | /// Is SSE supported? |
|---|
| 97 | | bool sse() {return (features&SSE_BIT)!=0;} |
|---|
| 98 | | /// Is SSE2 supported? |
|---|
| 99 | | bool sse2() {return (features&SSE2_BIT)!=0;} |
|---|
| 100 | | /// Is SSE3 supported? |
|---|
| 101 | | bool sse3() {return (miscfeatures&SSE3_BIT)!=0;} |
|---|
| 102 | | /// Is SSSE3 supported? |
|---|
| 103 | | bool ssse3() {return (miscfeatures&SSSE3_BIT)!=0;} |
|---|
| 104 | | /// Is SSE4.1 supported? |
|---|
| 105 | | bool sse41() {return (miscfeatures&SSE41_BIT)!=0;} |
|---|
| 106 | | /// Is SSE4.2 supported? |
|---|
| 107 | | bool sse42() {return (miscfeatures&SSE42_BIT)!=0;} |
|---|
| 108 | | /// Is SSE4a supported? |
|---|
| 109 | | bool sse4a() {return (amdmiscfeatures&SSE4A_BIT)!=0;} |
|---|
| 110 | | /// Is SSE5 supported? |
|---|
| 111 | | bool sse5() {return (amdmiscfeatures&SSE5_BIT)!=0;} |
|---|
| 112 | | /// Is AMD 3DNOW supported? |
|---|
| 113 | | bool amd3dnow() {return (amdfeatures&AMD_3DNOW_BIT)!=0;} |
|---|
| 114 | | /// Is AMD 3DNOW Ext supported? |
|---|
| 115 | | bool amd3dnowExt() {return (amdfeatures&AMD_3DNOW_EXT_BIT)!=0;} |
|---|
| 116 | | /// Are AMD extensions to MMX supported? |
|---|
| 117 | | bool amdMmx() {return (amdfeatures&AMD_MMX_BIT)!=0;} |
|---|
| 118 | | /// Is fxsave/fxrstor supported? |
|---|
| 119 | | bool hasFxsr() {return (features&FXSR_BIT)!=0;} |
|---|
| 120 | | /// Is cmov supported? |
|---|
| 121 | | bool hasCmov() {return (features&CMOV_BIT)!=0;} |
|---|
| 122 | | /// Is rdtsc supported? |
|---|
| 123 | | bool hasRdtsc() {return (features&TIMESTAMP_BIT)!=0;} |
|---|
| 124 | | /// Is cmpxchg8b supported? |
|---|
| 125 | | bool hasCmpxchg8b() {return (features&CMPXCHG8B_BIT)!=0;} |
|---|
| 126 | | /// Is cmpxchg16b supported? |
|---|
| 127 | | bool hasCmpxchg16b() {return (miscfeatures&CMPXCHG16B_BIT)!=0;} |
|---|
| 128 | | /// Is 3DNow prefetch supported? |
|---|
| 129 | | bool has3dnowPrefetch() |
|---|
| 130 | | {return (amdmiscfeatures&AMD_3DNOW_PREFETCH_BIT)!=0;} |
|---|
| 131 | | /// Are LAHF and SAHF supported in 64-bit mode? |
|---|
| 132 | | bool hasLahfSahf() {return (amdmiscfeatures&LAHFSAHF_BIT)!=0;} |
|---|
| 133 | | /// Is POPCNT supported? |
|---|
| 134 | | bool hasPopcnt() {return (miscfeatures&POPCNT_BIT)!=0;} |
|---|
| 135 | | /// Is LZCNT supported? |
|---|
| 136 | | bool hasLzcnt() {return (amdmiscfeatures&LZCNT_BIT)!=0;} |
|---|
| 137 | | /// Is this an Intel64 or AMD 64? |
|---|
| 138 | | bool isX86_64() {return (amdfeatures&AMD64_BIT)!=0;} |
|---|
| 139 | | |
|---|
| 140 | | /// Is this an IA64 (Itanium) processor? |
|---|
| 141 | | bool isItanium() { return (features&IA64_BIT)!=0; } |
|---|
| 142 | | |
|---|
| 143 | | /// Is hyperthreading supported? |
|---|
| 144 | | bool hyperThreading() { return maxThreads>maxCores; } |
|---|
| 145 | | /// Returns number of threads per CPU |
|---|
| 146 | | uint threadsPerCPU() {return maxThreads;} |
|---|
| 147 | | /// Returns number of cores in CPU |
|---|
| 148 | | uint coresPerCPU() {return maxCores;} |
|---|
| 149 | | |
|---|
| 150 | | /// Optimisation hints for assembly code. |
|---|
| 151 | | /// For forward compatibility, the CPU is compared against different |
|---|
| 152 | | /// microarchitectures. For 32-bit X86, comparisons are made against |
|---|
| 153 | | /// the Intel PPro/PII/PIII/PM family. |
|---|
| 154 | | /// |
|---|
| 155 | | /// The major 32-bit x86 microarchitecture 'dynasties' have been: |
|---|
| 156 | | /// (1) Intel P6 (PentiumPro, PII, PIII, PM, Core, Core2). |
|---|
| 157 | | /// (2) AMD Athlon (K7, K8, K10). |
|---|
| 158 | | /// (3) Intel NetBurst (Pentium 4, Pentium D). |
|---|
| 159 | | /// (4) In-order Pentium (Pentium1, PMMX) |
|---|
| 160 | | /// Other early CPUs (Nx586, AMD K5, K6, Centaur C3, Transmeta, |
|---|
| 161 | | /// Cyrix, Rise) were mostly in-order. |
|---|
| 162 | | /// Some new processors do not fit into the existing categories: |
|---|
| 163 | | /// Intel Atom 230/330 (family 6, model 0x1C) is an in-order core. |
|---|
| 164 | | /// Centaur Isaiah = VIA Nano (family 6, model F) is an out-of-order core. |
|---|
| 165 | | /// |
|---|
| 166 | | /// Within each dynasty, the optimisation techniques are largely |
|---|
| 167 | | /// identical (eg, use instruction pairing for group 4). Major |
|---|
| 168 | | /// instruction set improvements occur within each group. |
|---|
| 169 | | |
|---|
| 170 | | /// Does this CPU perform better on AMD K7 code than PentiumPro..Core2 code? |
|---|
| 171 | | bool preferAthlon() { return probablyAMD && family >=6; } |
|---|
| 172 | | /// Does this CPU perform better on Pentium4 code than PentiumPro..Core2 code? |
|---|
| 173 | | bool preferPentium4() { return probablyIntel && family == 0xF; } |
|---|
| 174 | | /// Does this CPU perform better on Pentium I code than Pentium Pro code? |
|---|
| 175 | | bool preferPentium1() { return family < 6 || (family==6 && model < 0xF && !probablyIntel); } |
|---|
| 176 | | |
|---|
| 177 | | public: |
|---|
| 178 | | /// Processor type (vendor-dependent). |
|---|
| 179 | | /// This should be visible ONLY for display purposes. |
|---|
| 180 | | uint stepping, model, family; |
|---|
| 181 | | uint numCacheLevels = 1; |
|---|
| 182 | | private: |
|---|
| 183 | | bool probablyIntel; // true = _probably_ an Intel processor, might be faking |
|---|
| 184 | | bool probablyAMD; // true = _probably_ an AMD processor |
|---|
| 185 | | char [12] vendorID; |
|---|
| 186 | | char [] processorName; |
|---|
| 187 | | char [48] processorNameBuffer; |
|---|
| 188 | | uint features = 0; // mmx, sse, sse2, hyperthreading, etc |
|---|
| 189 | | uint miscfeatures = 0; // sse3, etc. |
|---|
| 190 | | uint amdfeatures = 0; // 3DNow!, mmxext, etc |
|---|
| 191 | | uint amdmiscfeatures = 0; // sse4a, sse5, svm, etc |
|---|
| 192 | | uint maxCores = 1; |
|---|
| 193 | | uint maxThreads = 1; |
|---|
| 194 | | // Note that this may indicate multi-core rather than hyperthreading. |
|---|
| 195 | | bool hyperThreadingBit() { return (features&HTT_BIT)!=0;} |
|---|
| 196 | | |
|---|
| 197 | | // feature flags CPUID1_EDX |
|---|
| 198 | | enum : uint |
|---|
| 199 | | { |
|---|
| 200 | | FPU_BIT = 1, |
|---|
| 201 | | TIMESTAMP_BIT = 1<<4, // rdtsc |
|---|
| 202 | | MDSR_BIT = 1<<5, // RDMSR/WRMSR |
|---|
| 203 | | CMPXCHG8B_BIT = 1<<8, |
|---|
| 204 | | CMOV_BIT = 1<<15, |
|---|
| 205 | | MMX_BIT = 1<<23, |
|---|
| 206 | | FXSR_BIT = 1<<24, |
|---|
| 207 | | SSE_BIT = 1<<25, |
|---|
| 208 | | SSE2_BIT = 1<<26, |
|---|
| 209 | | HTT_BIT = 1<<28, |
|---|
| 210 | | IA64_BIT = 1<<30 |
|---|
| 211 | | } |
|---|
| 212 | | // feature flags misc CPUID1_ECX |
|---|
| 213 | | enum : uint |
|---|
| 214 | | { |
|---|
| 215 | | SSE3_BIT = 1, |
|---|
| 216 | | PCLMULQDQ_BIT = 1<<1, // from AVX |
|---|
| 217 | | MWAIT_BIT = 1<<3, |
|---|
| 218 | | SSSE3_BIT = 1<<9, |
|---|
| 219 | | FMA_BIT = 1<<12, // from AVX |
|---|
| 220 | | CMPXCHG16B_BIT = 1<<13, |
|---|
| 221 | | SSE41_BIT = 1<<19, |
|---|
| 222 | | SSE42_BIT = 1<<20, |
|---|
| 223 | | POPCNT_BIT = 1<<23, |
|---|
| 224 | | AES_BIT = 1<<25, // AES instructions from AVX |
|---|
| 225 | | OSXSAVE_BIT = 1<<27, // Used for AVX |
|---|
| 226 | | AVX_BIT = 1<<28 |
|---|
| 227 | | } |
|---|
| 228 | | /+ |
|---|
| 229 | | version(X86_64) { |
|---|
| 230 | | bool hasAVXinHardware() { |
|---|
| 231 | | // This only indicates hardware support, not OS support. |
|---|
| 232 | | return (miscfeatures&AVX_BIT) && (miscfeatures&OSXSAVE_BIT); |
|---|
| 233 | | } |
|---|
| 234 | | // Is AVX supported (in both hardware & OS)? |
|---|
| 235 | | bool Avx() { |
|---|
| 236 | | if (!hasAVXinHardware()) return false; |
|---|
| 237 | | // Check for OS support |
|---|
| 238 | | uint xfeatures; |
|---|
| 239 | | asm {mov ECX, 0; xgetbv; mov xfeatures, EAX; } |
|---|
| 240 | | return (xfeatures&0x6)==6; |
|---|
| 241 | | } |
|---|
| 242 | | bool hasAvxFma() { |
|---|
| 243 | | if (!AVX()) return false; |
|---|
| 244 | | return (features&FMA_BIT)!=0; |
|---|
| 245 | | } |
|---|
| 246 | | } |
|---|
| 247 | | +/ |
|---|
| 248 | | // AMD feature flags CPUID80000001_EDX |
|---|
| 249 | | enum : uint |
|---|
| 250 | | { |
|---|
| 251 | | AMD_MMX_BIT = 1<<22, |
|---|
| 252 | | // FXR_OR_CYRIXMMX_BIT = 1<<24, // Cyrix/NS: 6x86MMX instructions. |
|---|
| 253 | | FFXSR_BIT = 1<<25, |
|---|
| 254 | | PAGE1GB_BIT = 1<<26, // support for 1GB pages |
|---|
| 255 | | RDTSCP_BIT = 1<<27, |
|---|
| 256 | | AMD64_BIT = 1<<29, |
|---|
| 257 | | AMD_3DNOW_EXT_BIT = 1<<30, |
|---|
| 258 | | AMD_3DNOW_BIT = 1<<31 |
|---|
| 259 | | } |
|---|
| 260 | | // AMD misc feature flags CPUID80000001_ECX |
|---|
| 261 | | enum : uint |
|---|
| 262 | | { |
|---|
| 263 | | LAHFSAHF_BIT = 1, |
|---|
| 264 | | LZCNT_BIT = 1<<5, |
|---|
| 265 | | SSE4A_BIT = 1<<6, |
|---|
| 266 | | AMD_3DNOW_PREFETCH_BIT = 1<<8, |
|---|
| 267 | | SSE5_BIT = 1<<11 |
|---|
| 268 | | } |
|---|
| 269 | | |
|---|
| 270 | | version(GNU){ |
|---|
| 271 | | // GDC is a filthy liar. It can't actually do inline asm. |
|---|
| 272 | | } else version(D_InlineAsm_X86) { |
|---|
| 273 | | version = Really_D_InlineAsm_X86; |
|---|
| 274 | | } |
|---|
| 275 | | |
|---|
| 276 | | version(Really_D_InlineAsm_X86) { |
|---|
| 277 | | // Note that this code will also work for Itanium, after changing the |
|---|
| 278 | | // register names in the asm code. |
|---|
| 279 | | |
|---|
| 280 | | uint max_cpuid, max_extended_cpuid; |
|---|
| 281 | | |
|---|
| 282 | | // CPUID2: "cache and tlb information" |
|---|
| 283 | | void getcacheinfoCPUID2() |
|---|
| 284 | | { |
|---|
| 285 | | // CPUID2 is a dog's breakfast. What was Intel thinking??? |
|---|
| 286 | | // We are only interested in the data caches |
|---|
| 287 | | void decipherCpuid2(ubyte x) { |
|---|
| 288 | | if (x==0) return; |
|---|
| 289 | | // Values from http://www.sandpile.org/ia32/cpuid.htm. |
|---|
| 290 | | // Includes Itanium and non-Intel CPUs. |
|---|
| 291 | | // |
|---|
| 292 | | ubyte [] ids = [ |
|---|
| 293 | | 0x0A, 0x0C, 0x2C, 0x60, 0x0E, 0x66, 0x67, 0x68, |
|---|
| 294 | | // level 2 cache |
|---|
| 295 | | 0x41, 0x42, 0x43, 0x44, 0x45, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7F, |
|---|
| 296 | | 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x49, 0x4E, |
|---|
| 297 | | 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x48, 0x80, 0x81, |
|---|
| 298 | | // level 3 cache |
|---|
| 299 | | 0x22, 0x23, 0x25, 0x29, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D |
|---|
| 300 | | ]; |
|---|
| 301 | | uint [] sizes = [ |
|---|
| 302 | | 8, 16, 32, 16, 24, 8, 16, 32, |
|---|
| 303 | | 128, 256, 512, 1024, 2048, 1024, 128, 256, 512, 1024, 2048, 512, |
|---|
| 304 | | 256, 512, 1024, 2048, 512, 1024, 4096, 6*1024, |
|---|
| 305 | | 128, 192, 128, 256, 384, 512, 3072, 512, 128, |
|---|
| 306 | | 512, 1024, 2048, 4096, 4096, 8192, 6*1024, 8192, 12*1024, 16*1024 |
|---|
| 307 | | ]; |
|---|
| 308 | | // CPUBUG: Pentium M reports 0x2C but tests show it is only 4-way associative |
|---|
| 309 | | ubyte [] ways = [ |
|---|
| 310 | | 2, 4, 8, 8, 6, 4, 4, 4, |
|---|
| 311 | | 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 2, |
|---|
| 312 | | 8, 8, 8, 8, 4, 8, 16, 24, |
|---|
| 313 | | 4, 6, 2, 4, 6, 4, 12, 8, 8, |
|---|
| 314 | | 4, 8, 8, 8, 4, 8, 12, 16, 12, 16 |
|---|
| 315 | | ]; |
|---|
| 316 | | enum { FIRSTDATA2 = 8, FIRSTDATA3 = 28+9 } |
|---|
| 317 | | for (int i=0; i< ids.length; ++i) { |
|---|
| 318 | | if (x==ids[i]) { |
|---|
| 319 | | int level = i< FIRSTDATA2 ? 0: i<FIRSTDATA3 ? 1 : 2; |
|---|
| 320 | | if (x==0x49 && family==0xF && model==0x6) level=2; |
|---|
| 321 | | datacache[level].size=sizes[i]; |
|---|
| 322 | | datacache[level].associativity=ways[i]; |
|---|
| 323 | | if (level == 3 || x==0x2C || (x>=0x48 && x<=0x80) |
|---|
| 324 | | || x==0x86 || x==0x87 |
|---|
| 325 | | || (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E) ){ |
|---|
| 326 | | datacache[level].lineSize = 64; |
|---|
| 327 | | } else datacache[level].lineSize = 32; |
|---|
| 328 | | } |
|---|
| 329 | | } |
|---|
| 330 | | } |
|---|
| 331 | | |
|---|
| 332 | | uint[4] a; |
|---|
| 333 | | bool firstTime = true; |
|---|
| 334 | | // On a multi-core system, this could theoretically fail, but it's only used |
|---|
| 335 | | // for old single-core CPUs. |
|---|
| 336 | | uint numinfos = 1; |
|---|
| 337 | | do { |
|---|
| 338 | | asm { |
|---|
| 339 | | mov EAX, 2; |
|---|
| 340 | | cpuid; |
|---|
| 341 | | mov a, EAX; |
|---|
| 342 | | mov a+4, EBX; |
|---|
| 343 | | mov a+8, ECX; |
|---|
| 344 | | mov a+12, EDX; |
|---|
| 345 | | } |
|---|
| 346 | | if (firstTime) { |
|---|
| 347 | | if (a[0]==0x0000_7001 && a[3]==0x80 && a[1]==0 && a[2]==0) { |
|---|
| 348 | | // Cyrix MediaGX MMXEnhanced returns: EAX= 00007001, EDX=00000080. |
|---|
| 349 | | // These are NOT standard Intel values |
|---|
| 350 | | // (TLB = 32 entry, 4 way associative, 4K pages) |
|---|
| 351 | | // (L1 cache = 16K, 4way, linesize16) |
|---|
| 352 | | datacache[0].size=8; |
|---|
| 353 | | datacache[0].associativity=4; |
|---|
| 354 | | datacache[0].lineSize=16; |
|---|
| 355 | | return; |
|---|
| 356 | | } |
|---|
| 357 | | // The low byte of a[0] is how many times to loop. |
|---|
| 358 | | numinfos = a[0] & 0xFF; |
|---|
| 359 | | // and otherwise it should be ignored |
|---|
| 360 | | a[0] &= 0xFFFF_FF00; |
|---|
| 361 | | firstTime = false; |
|---|
| 362 | | } |
|---|
| 363 | | for (int c=0; c<4;++c) { |
|---|
| 364 | | // high bit set == no info. |
|---|
| 365 | | if (a[c] & 0x8000_0000) continue; |
|---|
| 366 | | decipherCpuid2(cast(ubyte)(a[c] & 0xFF)); |
|---|
| 367 | | decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF)); |
|---|
| 368 | | decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF)); |
|---|
| 369 | | decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF)); |
|---|
| 370 | | } |
|---|
| 371 | | } while (--numinfos); |
|---|
| 372 | | } |
|---|
| 373 | | |
|---|
| 374 | | // CPUID4: "Deterministic cache parameters" leaf |
|---|
| 375 | | void getcacheinfoCPUID4() |
|---|
| 376 | | { |
|---|
| 377 | | int cachenum = 0; |
|---|
| 378 | | for(;;) { |
|---|
| 379 | | uint a, b, number_of_sets; |
|---|
| 380 | | asm { |
|---|
| 381 | | mov EAX, 4; |
|---|
| 382 | | mov ECX, cachenum; |
|---|
| 383 | | cpuid; |
|---|
| 384 | | mov a, EAX; |
|---|
| 385 | | mov b, EBX; |
|---|
| 386 | | mov number_of_sets, ECX; |
|---|
| 387 | | } |
|---|
| 388 | | ++cachenum; |
|---|
| 389 | | if ((a&0x1F)==0) break; // no more caches |
|---|
| 390 | | uint numthreads = ((a>>14) & 0xFFF) + 1; |
|---|
| 391 | | uint numcores = ((a>>26) & 0x3F) + 1; |
|---|
| 392 | | if (numcores > maxCores) maxCores = numcores; |
|---|
| 393 | | if ((a&0x1F)!=1 && ((a&0x1F)!=3)) continue; // we only want data & unified caches |
|---|
| 394 | | |
|---|
| 395 | | ++number_of_sets; |
|---|
| 396 | | ubyte level = cast(ubyte)(((a>>5)&7)-1); |
|---|
| 397 | | if (level > datacache.length) continue; // ignore deep caches |
|---|
| 398 | | datacache[level].associativity = a & 0x200 ? ubyte.max :cast(ubyte)((b>>22)+1); |
|---|
| 399 | | datacache[level].lineSize = (b & 0xFFF)+ 1; // system coherency line size |
|---|
| 400 | | uint line_partitions = ((b >> 12)& 0x3FF) + 1; |
|---|
| 401 | | // Size = number of sets * associativity * cachelinesize * linepartitions |
|---|
| 402 | | // and must convert to Kb, also dividing by the number of cores. |
|---|
| 403 | | ulong sz = (datacache[level].associativity< ubyte.max)? number_of_sets * |
|---|
| 404 | | datacache[level].associativity : number_of_sets; |
|---|
| 405 | | datacache[level].size = cast(uint)( |
|---|
| 406 | | (sz * datacache[level].lineSize * line_partitions ) / (numcores *1024)); |
|---|
| 407 | | if (level == 0 && (a&0xF)==3) { |
|---|
| 408 | | // Halve the size for unified L1 caches |
|---|
| 409 | | datacache[level].size/=2; |
|---|
| 410 | | } |
|---|
| 411 | | } |
|---|
| 412 | | } |
|---|
| 413 | | |
|---|
| 414 | | // CPUID8000_0005 & 6 |
|---|
| 415 | | void getAMDcacheinfo() |
|---|
| 416 | | { |
|---|
| 417 | | uint c5, c6, d6; |
|---|
| 418 | | asm { |
|---|
| 419 | | mov EAX, 0x8000_0005; // L1 cache |
|---|
| 420 | | cpuid; |
|---|
| 421 | | // EAX has L1_TLB_4M. |
|---|
| 422 | | // EBX has L1_TLB_4K |
|---|
| 423 | | // EDX has L1 instruction cache |
|---|
| 424 | | mov c5, ECX; |
|---|
| 425 | | } |
|---|
| 426 | | |
|---|
| 427 | | datacache[0].size = ( (c5>>24) & 0xFF); |
|---|
| 428 | | datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF); |
|---|
| 429 | | datacache[0].lineSize = c5 & 0xFF; |
|---|
| 430 | | |
|---|
| 431 | | if (max_extended_cpuid >= 0x8000_0006) { |
|---|
| 432 | | // AMD K6-III or K6-2+ or later. |
|---|
| 433 | | ubyte numcores = 1; |
|---|
| 434 | | if (max_extended_cpuid >=0x8000_0008) { |
|---|
| 435 | | asm { |
|---|
| 436 | | mov EAX, 0x8000_0008; |
|---|
| 437 | | cpuid; |
|---|
| 438 | | mov numcores, CL; |
|---|
| 439 | | } |
|---|
| 440 | | ++numcores; |
|---|
| 441 | | if (numcores>maxCores) maxCores = numcores; |
|---|
| 442 | | } |
|---|
| 443 | | asm { |
|---|
| 444 | | mov EAX, 0x8000_0006; // L2/L3 cache |
|---|
| 445 | | cpuid; |
|---|
| 446 | | mov c6, ECX; // L2 cache info |
|---|
| 447 | | mov d6, EDX; // L3 cache info |
|---|
| 448 | | } |
|---|
| 449 | | |
|---|
| 450 | | ubyte [] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ]; |
|---|
| 451 | | datacache[1].size = (c6>>16) & 0xFFFF; |
|---|
| 452 | | datacache[1].associativity = assocmap[(c6>>12)&0xF]; |
|---|
| 453 | | datacache[1].lineSize = c6 & 0xFF; |
|---|
| 454 | | |
|---|
| 455 | | // The L3 cache value is TOTAL, not per core. |
|---|
| 456 | | datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1. |
|---|
| 457 | | datacache[2].associativity = assocmap[(d6>>12)&0xF]; |
|---|
| 458 | | datacache[2].lineSize = d6 & 0xFF; |
|---|
| 459 | | } |
|---|
| 460 | | } |
|---|
| 461 | 62 | |
|---|
| 462 | | |
|---|
| 463 | | void cpuidX86() |
|---|
| | 63 | version(GNU) |
|---|
| 464 | 64 | { |
|---|
| 465 | | char * venptr = vendorID.ptr; |
|---|
| 466 | | asm { |
|---|
| 467 | | mov EAX, 0; |
|---|
| 468 | | cpuid; |
|---|
| 469 | | mov max_cpuid, EAX; |
|---|
| 470 | | mov EAX, venptr; |
|---|
| 471 | | mov [EAX], EBX; |
|---|
| 472 | | mov [EAX + 4], EDX; |
|---|
| 473 | | mov [EAX + 8], ECX; |
|---|
| 474 | | mov EAX, 0x8000_0000; |
|---|
| 475 | | cpuid; |
|---|
| 476 | | mov max_extended_cpuid, EAX; |
|---|
| 477 | | } |
|---|
| 478 | | |
|---|
| 479 | | probablyIntel = vendorID == "GenuineIntel"; |
|---|
| 480 | | probablyAMD = vendorID == "AuthenticAMD"; |
|---|
| 481 | | uint a, b, c, d; |
|---|
| 482 | | uint apic = 0; // brand index, apic id |
|---|
| 483 | | asm { |
|---|
| 484 | | mov EAX, 1; // model, stepping |
|---|
| 485 | | cpuid; |
|---|
| 486 | | mov a, EAX; |
|---|
| 487 | | mov apic, EBX; |
|---|
| 488 | | mov miscfeatures, ECX; |
|---|
| 489 | | mov features, EDX; |
|---|
| 490 | | } |
|---|
| 491 | | amdfeatures = 0; |
|---|
| 492 | | amdmiscfeatures = 0; |
|---|
| 493 | | if (max_extended_cpuid >= 0x8000_0001) { |
|---|
| 494 | | asm { |
|---|
| 495 | | mov EAX, 0x8000_0001; |
|---|
| 496 | | cpuid; |
|---|
| 497 | | mov amdmiscfeatures, ECX; |
|---|
| 498 | | mov amdfeatures, EDX; |
|---|
| 499 | | } |
|---|
| 500 | | } |
|---|
| 501 | | // Try to detect fraudulent vendorIDs |
|---|
| 502 | | if (amd3dnow) probablyIntel = false; |
|---|
| 503 | | |
|---|
| 504 | | stepping = a & 0xF; |
|---|
| 505 | | uint fbase = (a >> 8) & 0xF; |
|---|
| 506 | | uint mbase = (a >> 4) & 0xF; |
|---|
| 507 | | family = ((fbase == 0xF) || (fbase == 0)) ? fbase + (a >> 20) & 0xFF : fbase; |
|---|
| 508 | | model = ((fbase == 0xF) || (fbase == 6 && probablyIntel) ) ? |
|---|
| 509 | | mbase + ((a >> 12) & 0xF0) : mbase; |
|---|
| 510 | | |
|---|
| 511 | | if (!probablyIntel && max_extended_cpuid >= 0x8000_0008) { |
|---|
| 512 | | // determine max number of cores for AMD |
|---|
| 513 | | asm { |
|---|
| 514 | | mov EAX, 0x8000_0008; |
|---|
| 515 | | cpuid; |
|---|
| 516 | | mov c, ECX; |
|---|
| 517 | | } |
|---|
| 518 | | uint apicsize = (c>>12) & 0xF; |
|---|
| 519 | | if (apicsize == 0) { |
|---|
| 520 | | // use legacy method |
|---|
| 521 | | if (hyperThreadingBit) maxCores = c & 0xFF; |
|---|
| 522 | | else maxCores = 1; |
|---|
| 523 | | } else { |
|---|
| 524 | | // maxcores = 2^ apicsize |
|---|
| 525 | | maxCores = 1; |
|---|
| 526 | | while (apicsize) { maxCores<<=1; --apicsize; } |
|---|
| 527 | | } |
|---|
| 528 | | } |
|---|
| 529 | | |
|---|
| 530 | | if (max_extended_cpuid >= 0x8000_0004) { |
|---|
| 531 | | char *procptr = processorNameBuffer.ptr; |
|---|
| 532 | | asm { |
|---|
| 533 | | push ESI; |
|---|
| 534 | | mov ESI, procptr; |
|---|
| 535 | | mov EAX, 0x8000_0002; |
|---|
| 536 | | cpuid; |
|---|
| 537 | | mov [ESI], EAX; |
|---|
| 538 | | mov [ESI+4], EBX; |
|---|
| 539 | | mov [ESI+8], ECX; |
|---|
| 540 | | mov [ESI+12], EDX; |
|---|
| 541 | | mov EAX, 0x8000_0003; |
|---|
| 542 | | cpuid; |
|---|
| 543 | | mov [ESI+16], EAX; |
|---|
| 544 | | mov [ESI+20], EBX; |
|---|
| 545 | | mov [ESI+24], ECX; |
|---|
| 546 | | mov [ESI+28], EDX; |
|---|
| 547 | | mov EAX, 0x8000_0004; |
|---|
| 548 | | cpuid; |
|---|
| 549 | | mov [ESI+32], EAX; |
|---|
| 550 | | mov [ESI+36], EBX; |
|---|
| 551 | | mov [ESI+40], ECX; |
|---|
| 552 | | mov [ESI+44], EDX; |
|---|
| 553 | | pop ESI; |
|---|
| 554 | | } |
|---|
| 555 | | // Intel P4 and PM pad at front with spaces. |
|---|
| 556 | | // Other CPUs pad at end with nulls. |
|---|
| 557 | | int start = 0, end = 0; |
|---|
| 558 | | while (processorNameBuffer[start] == ' ') { ++start; } |
|---|
| 559 | | while (processorNameBuffer[$-end-1] == 0) { ++end; } |
|---|
| 560 | | processorName = processorNameBuffer[start..$-end]; |
|---|
| 561 | | } else { |
|---|
| 562 | | processorName = "Unknown CPU"; |
|---|
| 563 | | } |
|---|
| 564 | | // Determine cache sizes |
|---|
| 565 | | |
|---|
| 566 | | // Intel docs specify that they return 0 for 0x8000_0005. |
|---|
| 567 | | // AMD docs do not specify the behaviour for 0004 and 0002. |
|---|
| 568 | | // Centaur/VIA and most other manufacturers use the AMD method, |
|---|
| 569 | | // except Cyrix MediaGX MMX Enhanced uses their OWN form of CPUID2! |
|---|
| 570 | | // NS Geode GX1 provides CyrixCPUID2 _and_ does the same wrong behaviour |
|---|
| 571 | | // for CPUID80000005. But Geode GX uses the AMD method |
|---|
| 572 | | |
|---|
| 573 | | // Deal with idiotic Geode GX1 - make it same as MediaGX MMX. |
|---|
| 574 | | if (max_extended_cpuid==0x8000_0005 && max_cpuid==2) { |
|---|
| 575 | | max_extended_cpuid = 0x8000_0004; |
|---|
| 576 | | } |
|---|
| 577 | | // Therefore, we try the AMD method unless it's an Intel chip. |
|---|
| 578 | | // If we still have no info, try the Intel methods. |
|---|
| 579 | | datacache[0].size = 0; |
|---|
| 580 | | if (max_cpuid<2 || !probablyIntel) { |
|---|
| 581 | | if (max_extended_cpuid >= 0x8000_0005) { |
|---|
| 582 | | getAMDcacheinfo(); |
|---|
| 583 | | } else if (probablyAMD) { |
|---|
| 584 | | // According to AMDProcRecognitionAppNote, this means CPU |
|---|
| 585 | | // K5 model 0, or Am5x86 (model 4), or Am4x86DX4 (model 4) |
|---|
| 586 | | // Am5x86 has 16Kb 4-way unified data & code cache. |
|---|
| 587 | | datacache[0].size = 8; |
|---|
| 588 | | datacache[0].associativity = 4; |
|---|
| 589 | | datacache[0].lineSize = 32; |
|---|
| 590 | | } else { |
|---|
| 591 | | // Some obscure CPU. |
|---|
| 592 | | // Values for Cyrix 6x86MX (family 6, model 0) |
|---|
| 593 | | datacache[0].size = 64; |
|---|
| 594 | | datacache[0].associativity = 4; |
|---|
| 595 | | datacache[0].lineSize = 32; |
|---|
| 596 | | } |
|---|
| 597 | | } |
|---|
| 598 | | if ((datacache[0].size == 0) && max_cpuid>=4) { |
|---|
| 599 | | getcacheinfoCPUID4(); |
|---|
| 600 | | } |
|---|
| 601 | | if ((datacache[0].size == 0) && max_cpuid>=2) { |
|---|
| 602 | | getcacheinfoCPUID2(); |
|---|
| 603 | | } |
|---|
| 604 | | if (datacache[0].size == 0) { |
|---|
| 605 | | // Pentium, PMMX, late model 486, or an obscure CPU |
|---|
| 606 | | if (mmx) { // Pentium MMX. Also has 8kB code cache. |
|---|
| 607 | | datacache[0].size = 16; |
|---|
| 608 | | datacache[0].associativity = 4; |
|---|
| 609 | | datacache[0].lineSize = 32; |
|---|
| 610 | | } else { // Pentium 1 (which also has 8kB code cache) |
|---|
| 611 | | // or 486. |
|---|
| 612 | | // Cyrix 6x86: 16, 4way, 32 linesize |
|---|
| 613 | | datacache[0].size = 8; |
|---|
| 614 | | datacache[0].associativity = 2; |
|---|
| 615 | | datacache[0].lineSize = 32; |
|---|
| 616 | | } |
|---|
| 617 | | } |
|---|
| 618 | | if (hyperThreadingBit) maxThreads = (apic>>>16) & 0xFF; |
|---|
| 619 | | else maxThreads = maxCores; |
|---|
| 620 | | } |
|---|
| 621 | | |
|---|
| 622 | | // Return true if the cpuid instruction is supported. |
|---|
| 623 | | // BUG(WONTFIX): Doesn't work for Cyrix 6x86 and 6x86L. |
|---|
| 624 | | bool hasCPUID() |
|---|
| | 65 | // GDC is a filthy liar. It can't actually do inline asm. |
|---|
| | 66 | } |
|---|
| | 67 | else version(D_InlineAsm_X86) |
|---|
| 625 | 68 | { |
|---|
| 626 | | uint flags; |
|---|
| 627 | | asm { |
|---|
| 628 | | pushfd; |
|---|
| 629 | | pop EAX; |
|---|
| 630 | | mov flags, EAX; |
|---|
| 631 | | xor EAX, 0x0020_0000; |
|---|
| 632 | | push EAX; |
|---|
| 633 | | popfd; |
|---|
| 634 | | pushfd; |
|---|
| 635 | | pop EAX; |
|---|
| 636 | | xor flags, EAX; |
|---|
| 637 | | } |
|---|
| 638 | | return (flags & 0x0020_0000) !=0; |
|---|
| 639 | | } |
|---|
| 640 | | |
|---|
| 641 | | } else { // inline asm X86 |
|---|
| 642 | | |
|---|
| 643 | | bool hasCPUID() { return false; } |
|---|
| 644 | | |
|---|
| 645 | | void cpuidX86() |
|---|
| 646 | | { |
|---|
| 647 | | datacache[0].size = 8; |
|---|
| 648 | | datacache[0].associativity = 2; |
|---|
| 649 | | datacache[0].lineSize = 32; |
|---|
| 650 | | } |
|---|
| | 69 | version = Really_D_InlineAsm_X86; |
|---|
| 651 | 70 | } |
|---|
| | 71 | |
|---|
| | 72 | public: |
|---|
| 652 | 73 | |
|---|
| 653 | | // TODO: Implement this function with OS support |
|---|
| 654 | | void cpuidPPC() |
|---|
| | 74 | private struct cpuid |
|---|
| 655 | 75 | { |
|---|
| 656 | | enum :int { PPC601, PPC603, PPC603E, PPC604, |
|---|
| 657 | | PPC604E, PPC620, PPCG3, PPCG4, PPCG5 }; |
|---|
| | 76 | // If optimizing for a particular processor, it is generally better |
|---|
| | 77 | // to identify based on features rather than model. NOTE: Normally |
|---|
| | 78 | // it's only worthwhile to optimise for the latest Intel and AMD CPU, |
|---|
| | 79 | // with a backup for other CPUs. |
|---|
| | 80 | // Pentium -- preferPentium1() |
|---|
| | 81 | // PMMX -- + mmx() |
|---|
| | 82 | // PPro -- default |
|---|
| | 83 | // PII -- + mmx() |
|---|
| | 84 | // PIII -- + mmx() + sse() |
|---|
| | 85 | // PentiumM -- + mmx() + sse() + sse2() |
|---|
| | 86 | // Pentium4 -- preferPentium4() |
|---|
| | 87 | // PentiumD -- + isX86_64() |
|---|
| | 88 | // Core2 -- default + isX86_64() |
|---|
| | 89 | // AMD K5 -- preferPentium1() |
|---|
| | 90 | // AMD K6 -- + mmx() |
|---|
| | 91 | // AMD K6-II -- + mmx() + 3dnow() |
|---|
| | 92 | // AMD K7 -- preferAthlon() |
|---|
| | 93 | // AMD K8 -- + sse2() |
|---|
| | 94 | // AMD K10 -- + isX86_64() |
|---|
| | 95 | // Cyrix 6x86 -- preferPentium1() |
|---|
| | 96 | // 6x86MX -- + mmx() |
|---|
| | 97 | |
|---|
| | 98 | public: |
|---|
| | 99 | |
|---|
| | 100 | /// Cache size and behaviour |
|---|
| | 101 | struct CacheInfo |
|---|
| | 102 | { |
|---|
| | 103 | /// Size of the cache, in kilobytes, per CPU. |
|---|
| | 104 | /// For L1 unified (data + code) caches, this size is half the physical size. |
|---|
| | 105 | /// (we don't halve it for larger sizes, since normally |
|---|
| | 106 | /// data size is much greater than code size for critical loops). |
|---|
| | 107 | uint size; |
|---|
| | 108 | /// Number of ways of associativity, eg: |
|---|
| | 109 | /// 1 = direct mapped |
|---|
| | 110 | /// 2 = 2-way set associative |
|---|
| | 111 | /// 3 = 3-way set associative |
|---|
| | 112 | /// ubyte.max = fully associative |
|---|
| | 113 | ubyte associativity; |
|---|
| | 114 | /// Number of bytes read into the cache when a cache miss occurs. |
|---|
| | 115 | uint lineSize; |
|---|
| | 116 | } |
|---|
| | 117 | |
|---|
| | 118 | public: |
|---|
| | 119 | /// Returns vendor string, for display purposes only. |
|---|
| | 120 | /// Do NOT use this to determine features! |
|---|
| | 121 | /// Note that some CPUs have programmable vendorIDs. |
|---|
| | 122 | char[] vendor() {return vendorID;} |
|---|
| | 123 | /// Returns processor string, for display purposes only |
|---|
| | 124 | char[] processor() {return processorName;} |
|---|
| | 125 | |
|---|
| | 126 | /// The data caches. If there are fewer than 5 physical cache levels, |
|---|
| | 127 | /// the remaining levels are set to uint.max (== entire memory space) |
|---|
| | 128 | CacheInfo[5] datacache; |
|---|
| | 129 | /// Does it have an x87 FPU on-chip? |
|---|
| | 130 | bool x87onChip() {return (features&FPU_BIT)!=0;} |
|---|
| | 131 | /// Is MMX supported? |
|---|
| | 132 | bool mmx() {return (features&MMX_BIT)!=0;} |
|---|
| | 133 | /// Is SSE supported? |
|---|
| | 134 | bool sse() {return (features&SSE_BIT)!=0;} |
|---|
| | 135 | /// Is SSE2 supported? |
|---|
| | 136 | bool sse2() {return (features&SSE2_BIT)!=0;} |
|---|
| | 137 | /// Is SSE3 supported? |
|---|
| | 138 | bool sse3() {return (miscfeatures&SSE3_BIT)!=0;} |
|---|
| | 139 | /// Is SSSE3 supported? |
|---|
| | 140 | bool ssse3() {return (miscfeatures&SSSE3_BIT)!=0;} |
|---|
| | 141 | /// Is SSE4.1 supported? |
|---|
| | 142 | bool sse41() {return (miscfeatures&SSE41_BIT)!=0;} |
|---|
| | 143 | /// Is SSE4.2 supported? |
|---|
| | 144 | bool sse42() {return (miscfeatures&SSE42_BIT)!=0;} |
|---|
| | 145 | /// Is SSE4a supported? |
|---|
| | 146 | bool sse4a() {return (amdmiscfeatures&SSE4A_BIT)!=0;} |
|---|
| | 147 | /// Is SSE5 supported? |
|---|
| | 148 | bool sse5() {return (amdmiscfeatures&SSE5_BIT)!=0;} |
|---|
| | 149 | /// Is AMD 3DNOW supported? |
|---|
| | 150 | bool amd3dnow() {return (amdfeatures&AMD_3DNOW_BIT)!=0;} |
|---|
| | 151 | /// Is AMD 3DNOW Ext supported? |
|---|
| | 152 | bool amd3dnowExt() {return (amdfeatures&AMD_3DNOW_EXT_BIT)!=0;} |
|---|
| | 153 | /// Are AMD extensions to MMX supported? |
|---|
| | 154 | bool amdMmx() {return (amdfeatures&AMD_MMX_BIT)!=0;} |
|---|
| | 155 | /// Is fxsave/fxrstor supported? |
|---|
| | 156 | bool hasFxsr() {return (features&FXSR_BIT)!=0;} |
|---|
| | 157 | /// Is cmov supported? |
|---|
| | 158 | bool hasCmov() {return (features&CMOV_BIT)!=0;} |
|---|
| | 159 | /// Is rdtsc supported? |
|---|
| | 160 | bool hasRdtsc() {return (features&TIMESTAMP_BIT)!=0;} |
|---|
| | 161 | /// Is cmpxchg8b supported? |
|---|
| | 162 | bool hasCmpxchg8b() {return (features&CMPXCHG8B_BIT)!=0;} |
|---|
| | 163 | /// Is cmpxchg16b supported? |
|---|
| | 164 | bool hasCmpxchg16b() {return (miscfeatures&CMPXCHG16B_BIT)!=0;} |
|---|
| | 165 | /// Is 3DNow prefetch supported? |
|---|
| | 166 | bool has3dnowPrefetch() |
|---|
| | 167 | {return (amdmiscfeatures&AMD_3DNOW_PREFETCH_BIT)!=0;} |
|---|
| | 168 | /// Are LAHF and SAHF supported in 64-bit mode? |
|---|
| | 169 | bool hasLahfSahf() {return (amdmiscfeatures&LAHFSAHF_BIT)!=0;} |
|---|
| | 170 | /// Is POPCNT supported? |
|---|
| | 171 | bool hasPopcnt() {return (miscfeatures&POPCNT_BIT)!=0;} |
|---|
| | 172 | /// Is LZCNT supported? |
|---|
| | 173 | bool hasLzcnt() {return (amdmiscfeatures&LZCNT_BIT)!=0;} |
|---|
| | 174 | /// Is this an Intel64 or AMD 64? |
|---|
| | 175 | bool isX86_64() {return (amdfeatures&AMD64_BIT)!=0;} |
|---|
| | 176 | |
|---|
| | 177 | /// Is this an IA64 (Itanium) processor? |
|---|
| | 178 | bool isItanium() { return (features&IA64_BIT)!=0; } |
|---|
| | 179 | |
|---|
| | 180 | /// Is hyperthreading supported? |
|---|
| | 181 | bool hyperThreading() { return maxThreads>maxCores; } |
|---|
| | 182 | /// Returns number of threads per CPU |
|---|
| | 183 | uint threadsPerCPU() {return maxThreads;} |
|---|
| | 184 | /// Returns number of cores in CPU |
|---|
| | 185 | uint coresPerCPU() {return maxCores;} |
|---|
| | 186 | |
|---|
| | 187 | /// Optimisation hints for assembly code. |
|---|
| | 188 | /// For forward compatibility, the CPU is compared against different |
|---|
| | 189 | /// microarchitectures. For 32-bit X86, comparisons are made against |
|---|
| | 190 | /// the Intel PPro/PII/PIII/PM family. |
|---|
| | 191 | /// |
|---|
| | 192 | /// The major 32-bit x86 microarchitecture 'dynasties' have been: |
|---|
| | 193 | /// (1) Intel P6 (PentiumPro, PII, PIII, PM, Core, Core2). |
|---|
| | 194 | /// (2) AMD Athlon (K7, K8, K10). |
|---|
| | 195 | /// (3) Intel NetBurst (Pentium 4, Pentium D). |
|---|
| | 196 | /// (4) In-order Pentium (Pentium1, PMMX) |
|---|
| | 197 | /// Other early CPUs (Nx586, AMD K5, K6, Centaur C3, Transmeta, |
|---|
| | 198 | /// Cyrix, Rise) were mostly in-order. |
|---|
| | 199 | /// Some new processors do not fit into the existing categories: |
|---|
| | 200 | /// Intel Atom 230/330 (family 6, model 0x1C) is an in-order core. |
|---|
| | 201 | /// Centaur Isaiah = VIA Nano (family 6, model F) is an out-of-order core. |
|---|
| | 202 | /// |
|---|
| | 203 | /// Within each dynasty, the optimisation techniques are largely |
|---|
| | 204 | /// identical (eg, use instruction pairing for group 4). Major |
|---|
| | 205 | /// instruction set improvements occur within each group. |
|---|
| | 206 | |
|---|
| | 207 | /// Does this CPU perform better on AMD K7 code than PentiumPro..Core2 code? |
|---|
| | 208 | bool preferAthlon() { return probablyAMD && family >=6; } |
|---|
| | 209 | /// Does this CPU perform better on Pentium4 code than PentiumPro..Core2 code? |
|---|
| | 210 | bool preferPentium4() { return probablyIntel && family == 0xF; } |
|---|
| | 211 | /// Does this CPU perform better on Pentium I code than Pentium Pro code? |
|---|
| | 212 | bool preferPentium1() { return family < 6 || (family==6 && model < 0xF && !probablyIntel); } |
|---|
| | 213 | |
|---|
| | 214 | public: |
|---|
| | 215 | /// Processor type (vendor-dependent). |
|---|
| | 216 | /// This should be visible ONLY for display purposes. |
|---|
| | 217 | uint stepping, model, family; |
|---|
| | 218 | uint numCacheLevels = 1; |
|---|
| | 219 | private: |
|---|
| | 220 | bool probablyIntel; // true = _probably_ an Intel processor, might be faking |
|---|
| | 221 | bool probablyAMD; // true = _probably_ an AMD processor |
|---|
| | 222 | char [12] vendorID; |
|---|
| | 223 | char [] processorName; |
|---|
| | 224 | char [48] processorNameBuffer; |
|---|
| | 225 | uint features = 0; // mmx, sse, sse2, hyperthreading, etc |
|---|
| | 226 | uint miscfeatures = 0; // sse3, etc. |
|---|
| | 227 | uint amdfeatures = 0; // 3DNow!, mmxext, etc |
|---|
| | 228 | uint amdmiscfeatures = 0; // sse4a, sse5, svm, etc |
|---|
| | 229 | uint maxCores = 1; |
|---|
| | 230 | uint maxThreads = 1; |
|---|
| | 231 | // Note that this may indicate multi-core rather than hyperthreading. |
|---|
| | 232 | bool hyperThreadingBit() { return (features&HTT_BIT)!=0;} |
|---|
| | 233 | |
|---|
| | 234 | // feature flags CPUID1_EDX |
|---|
| | 235 | enum : uint |
|---|
| | 236 | { |
|---|
| | 237 | FPU_BIT = 1, |
|---|
| | 238 | TIMESTAMP_BIT = 1<<4, // rdtsc |
|---|
| | 239 | MDSR_BIT = 1<<5, // RDMSR/WRMSR |
|---|
| | 240 | CMPXCHG8B_BIT = 1<<8, |
|---|
| | 241 | CMOV_BIT = 1<<15, |
|---|
| | 242 | MMX_BIT = 1<<23, |
|---|
| | 243 | FXSR_BIT = 1<<24, |
|---|
| | 244 | SSE_BIT = 1<<25, |
|---|
| | 245 | SSE2_BIT = 1<<26, |
|---|
| | 246 | HTT_BIT = 1<<28, |
|---|
| | 247 | IA64_BIT = 1<<30 |
|---|
| | 248 | } |
|---|
| | 249 | // feature flags misc CPUID1_ECX |
|---|
| | 250 | enum : uint |
|---|
| | 251 | { |
|---|
| | 252 | SSE3_BIT = 1, |
|---|
| | 253 | PCLMULQDQ_BIT = 1<<1, // from AVX |
|---|
| | 254 | MWAIT_BIT = 1<<3, |
|---|
| | 255 | SSSE3_BIT = 1<<9, |
|---|
| | 256 | FMA_BIT = 1<<12, // from AVX |
|---|
| | 257 | CMPXCHG16B_BIT = 1<<13, |
|---|
| | 258 | SSE41_BIT = 1<<19, |
|---|
| | 259 | SSE42_BIT = 1<<20, |
|---|
| | 260 | POPCNT_BIT = 1<<23, |
|---|
| | 261 | AES_BIT = 1<<25, // AES instructions from AVX |
|---|
| | 262 | OSXSAVE_BIT = 1<<27, // Used for AVX |
|---|
| | 263 | AVX_BIT = 1<<28 |
|---|
| | 264 | } |
|---|
| | 265 | /+ |
|---|
| | 266 | version(X86_64) { |
|---|
| | 267 | bool hasAVXinHardware() { |
|---|
| | 268 | // This only indicates hardware support, not OS support. |
|---|
| | 269 | return (miscfeatures&AVX_BIT) && (miscfeatures&OSXSAVE_BIT); |
|---|
| | 270 | } |
|---|
| | 271 | // Is AVX supported (in both hardware & OS)? |
|---|
| | 272 | bool Avx() { |
|---|
| | 273 | if (!hasAVXinHardware()) return false; |
|---|
| | 274 | // Check for OS support |
|---|
| | 275 | uint xfeatures; |
|---|
| | 276 | asm {mov ECX, 0; xgetbv; mov xfeatures, EAX; } |
|---|
| | 277 | return (xfeatures&0x6)==6; |
|---|
| | 278 | } |
|---|
| | 279 | bool hasAvxFma() { |
|---|
| | 280 | if (!AVX()) return false; |
|---|
| | 281 | return (features&FMA_BIT)!=0; |
|---|
| | 282 | } |
|---|
| | 283 | } |
|---|
| | 284 | +/ |
|---|
| | 285 | // AMD feature flags CPUID80000001_EDX |
|---|
| | 286 | enum : uint |
|---|
| | 287 | { |
|---|
| | 288 | AMD_MMX_BIT = 1<<22, |
|---|
| | 289 | // FXR_OR_CYRIXMMX_BIT = 1<<24, // Cyrix/NS: 6x86MMX instructions. |
|---|
| | 290 | FFXSR_BIT = 1<<25, |
|---|
| | 291 | PAGE1GB_BIT = 1<<26, // support for 1GB pages |
|---|
| | 292 | RDTSCP_BIT = 1<<27, |
|---|
| | 293 | AMD64_BIT = 1<<29, |
|---|
| | 294 | AMD_3DNOW_EXT_BIT = 1<<30, |
|---|
| | 295 | AMD_3DNOW_BIT = 1<<31 |
|---|
| | 296 | } |
|---|
| | 297 | // AMD misc feature flags CPUID80000001_ECX |
|---|
| | 298 | enum : uint |
|---|
| | 299 | { |
|---|
| | 300 | LAHFSAHF_BIT = 1, |
|---|
| | 301 | LZCNT_BIT = 1<<5, |
|---|
| | 302 | SSE4A_BIT = 1<<6, |
|---|
| | 303 | AMD_3DNOW_PREFETCH_BIT = 1<<8, |
|---|
| | 304 | SSE5_BIT = 1<<11 |
|---|
| | 305 | } |
|---|
| | 306 | |
|---|
| | 307 | |
|---|
| | 308 | |
|---|
| | 309 | version(Really_D_InlineAsm_X86) |
|---|
| | 310 | { |
|---|
| | 311 | // Note that this code will also work for Itanium, after changing the |
|---|
| | 312 | // register names in the asm code. |
|---|
| | 313 | |
|---|
| | 314 | uint max_cpuid, max_extended_cpuid; |
|---|
| | 315 | |
|---|
| | 316 | // CPUID2: "cache and tlb information" |
|---|
| | 317 | void getcacheinfoCPUID2() // fills datacache[] from the CPUID leaf 2 descriptor bytes |
|---|
| | 318 | { |
|---|
| | 319 | // CPUID2 is a dog's breakfast. What was Intel thinking??? |
|---|
| | 320 | // We are only interested in the data caches |
|---|
| | 321 | void decipherCpuid2(ubyte x) { |
|---|
| | 322 | if (x==0) return; |
|---|
| | 323 | // Values from http://www.sandpile.org/ia32/cpuid.htm. |
|---|
| | 324 | // Includes Itanium and non-Intel CPUs. |
|---|
| | 325 | // |
|---|
| | 326 | ubyte [] ids = [ |
|---|
| | 327 | 0x0A, 0x0C, 0x2C, 0x60, 0x0E, 0x66, 0x67, 0x68, |
|---|
| | 328 | // level 2 cache |
|---|
| | 329 | 0x41, 0x42, 0x43, 0x44, 0x45, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7F, |
|---|
| | 330 | 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x49, 0x4E, |
|---|
| | 331 | 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x48, 0x80, 0x81, |
|---|
| | 332 | // level 3 cache |
|---|
| | 333 | 0x22, 0x23, 0x25, 0x29, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D |
|---|
| | 334 | ]; |
|---|
| | 335 | uint [] sizes = [ |
|---|
| | 336 | 8, 16, 32, 16, 24, 8, 16, 32, |
|---|
| | 337 | 128, 256, 512, 1024, 2048, 1024, 128, 256, 512, 1024, 2048, 512, |
|---|
| | 338 | 256, 512, 1024, 2048, 512, 1024, 4096, 6*1024, |
|---|
| | 339 | 128, 192, 128, 256, 384, 512, 3072, 512, 128, |
|---|
| | 340 | 512, 1024, 2048, 4096, 4096, 8192, 6*1024, 8192, 12*1024, 16*1024 |
|---|
| | 341 | ]; |
|---|
| | 342 | // CPUBUG: Pentium M reports 0x2C but tests show it is only 4-way associative |
|---|
| | 343 | ubyte [] ways = [ |
|---|
| | 344 | 2, 4, 8, 8, 6, 4, 4, 4, |
|---|
| | 345 | 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 2, |
|---|
| | 346 | 8, 8, 8, 8, 4, 8, 16, 24, |
|---|
| | 347 | 4, 6, 2, 4, 6, 4, 12, 8, 8, |
|---|
| | 348 | 4, 8, 8, 8, 4, 8, 12, 16, 12, 16 |
|---|
| | 349 | ]; |
|---|
| | 350 | enum { FIRSTDATA2 = 8, FIRSTDATA3 = 28+9 } |
|---|
| | 351 | for (int i=0; i< ids.length; ++i) { |
|---|
| | 352 | if (x==ids[i]) { |
|---|
| | 353 | int level = i< FIRSTDATA2 ? 0: i<FIRSTDATA3 ? 1 : 2; |
|---|
| | 354 | if (x==0x49 && family==0xF && model==0x6) level=2; |
|---|
| | 355 | datacache[level].size=sizes[i]; |
|---|
| | 356 | datacache[level].associativity=ways[i]; |
|---|
| | 357 | if (level == 2 || x==0x2C || (x>=0x48 && x<=0x80) // level is 0-based: 2 == L3 cache |
|---|
| | 358 | || x==0x86 || x==0x87 |
|---|
| | 359 | || (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E) ){ |
|---|
| | 360 | datacache[level].lineSize = 64; |
|---|
| | 361 | } else datacache[level].lineSize = 32; |
|---|
| | 362 | } |
|---|
| | 363 | } |
|---|
| | 364 | } |
|---|
| | 365 | |
|---|
| | 366 | uint[4] a; |
|---|
| | 367 | bool firstTime = true; |
|---|
| | 368 | // On a multi-core system, this could theoretically fail, but it's only used |
|---|
| | 369 | // for old single-core CPUs. |
|---|
| | 370 | uint numinfos = 1; |
|---|
| | 371 | do { |
|---|
| | 372 | asm { |
|---|
| | 373 | mov EAX, 2; |
|---|
| | 374 | cpuid; |
|---|
| | 375 | mov a, EAX; |
|---|
| | 376 | mov a+4, EBX; |
|---|
| | 377 | mov a+8, ECX; |
|---|
| | 378 | mov a+12, EDX; |
|---|
| | 379 | } |
|---|
| | 380 | if (firstTime) { |
|---|
| | 381 | if (a[0]==0x0000_7001 && a[3]==0x80 && a[1]==0 && a[2]==0) { |
|---|
| | 382 | // Cyrix MediaGX MMXEnhanced returns: EAX= 00007001, EDX=00000080. |
|---|
| | 383 | // These are NOT standard Intel values |
|---|
| | 384 | // (TLB = 32 entry, 4 way associative, 4K pages) |
|---|
| | 385 | // (L1 cache = 16K, 4way, linesize16) |
|---|
| | 386 | datacache[0].size=8; |
|---|
| | 387 | datacache[0].associativity=4; |
|---|
| | 388 | datacache[0].lineSize=16; |
|---|
| | 389 | return; |
|---|
| | 390 | } |
|---|
| | 391 | // low byte of a[0] is how many times to loop. |
|---|
| | 392 | numinfos = a[0] & 0xFF; |
|---|
| | 393 | // and otherwise it should be ignored |
|---|
| | 394 | a[0] &= 0xFFFF_FF00; |
|---|
| | 395 | firstTime = false; |
|---|
| | 396 | } |
|---|
| | 397 | for (int c=0; c<4;++c) { |
|---|
| | 398 | // high bit set == no info. |
|---|
| | 399 | if (a[c] & 0x8000_0000) continue; |
|---|
| | 400 | decipherCpuid2(cast(ubyte)(a[c] & 0xFF)); |
|---|
| | 401 | decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF)); |
|---|
| | 402 | decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF)); |
|---|
| | 403 | decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF)); |
|---|
| | 404 | } |
|---|
| | 405 | } while (numinfos > 1 && --numinfos); // guard: a reported count of 0 must not wrap around |
|---|
| | 406 | } |
|---|
| | 407 | |
|---|
| | 408 | // CPUID4: "Deterministic cache parameters" leaf |
|---|
| | 409 | void getcacheinfoCPUID4() // fills datacache[] and raises maxCores from CPUID leaf 4 |
|---|
| | 410 | { |
|---|
| | 411 | int cachenum = 0; |
|---|
| | 412 | for(;;) { |
|---|
| | 413 | uint a, b, number_of_sets; |
|---|
| | 414 | asm { |
|---|
| | 415 | mov EAX, 4; |
|---|
| | 416 | mov ECX, cachenum; |
|---|
| | 417 | cpuid; |
|---|
| | 418 | mov a, EAX; |
|---|
| | 419 | mov b, EBX; |
|---|
| | 420 | mov number_of_sets, ECX; |
|---|
| | 421 | } |
|---|
| | 422 | ++cachenum; |
|---|
| | 423 | if ((a&0x1F)==0) break; // no more caches |
|---|
| | 424 | uint numthreads = ((a>>14) & 0xFFF) + 1; |
|---|
| | 425 | uint numcores = ((a>>26) & 0x3F) + 1; |
|---|
| | 426 | if (numcores > maxCores) maxCores = numcores; |
|---|
| | 427 | if ((a&0x1F)!=1 && ((a&0x1F)!=3)) continue; // we only want data & unified caches |
|---|
| | 428 | |
|---|
| | 429 | ++number_of_sets; |
|---|
| | 430 | ubyte level = cast(ubyte)(((a>>5)&7)-1); |
|---|
| | 431 | if (level >= datacache.length) continue; // ignore deep caches; a level field of 0 wraps to 255 and is skipped too |
|---|
| | 432 | datacache[level].associativity = a & 0x200 ? ubyte.max :cast(ubyte)((b>>22)+1); |
|---|
| | 433 | datacache[level].lineSize = (b & 0xFFF)+ 1; // system coherency line size |
|---|
| | 434 | uint line_partitions = ((b >> 12)& 0x3FF) + 1; |
|---|
| | 435 | // Size = number of sets * associativity * cachelinesize * linepartitions |
|---|
| | 436 | // and must convert to Kb, also dividing by the number of cores. |
|---|
| | 437 | ulong sz = (datacache[level].associativity< ubyte.max)? number_of_sets * |
|---|
| | 438 | datacache[level].associativity : number_of_sets; |
|---|
| | 439 | datacache[level].size = cast(uint)( |
|---|
| | 440 | (sz * datacache[level].lineSize * line_partitions ) / (numcores *1024)); |
|---|
| | 441 | if (level == 0 && (a&0x1F)==3) { // same 5-bit cache-type mask as the checks above |
|---|
| | 442 | // Halve the size for unified L1 caches |
|---|
| | 443 | datacache[level].size/=2; |
|---|
| | 444 | } |
|---|
| | 445 | } |
|---|
| | 446 | } |
|---|
| | 447 | |
|---|
| | 448 | // CPUID8000_0005 & 6 |
|---|
| | 449 | void getAMDcacheinfo() |
|---|
| | 450 | { |
|---|
| | 451 | uint c5, c6, d6; |
|---|
| | 452 | asm { |
|---|
| | 453 | mov EAX, 0x8000_0005; // L1 cache |
|---|
| | 454 | cpuid; |
|---|
| | 455 | // EAX has L1_TLB_4M. |
|---|
| | 456 | // EBX has L1_TLB_4K |
|---|
| | 457 | // EDX has L1 instruction cache |
|---|
| | 458 | mov c5, ECX; |
|---|
| | 459 | } |
|---|
| | 460 | |
|---|
| | 461 | datacache[0].size = ( (c5>>24) & 0xFF); |
|---|
| | 462 | datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF); |
|---|
| | 463 | datacache[0].lineSize = c5 & 0xFF; |
|---|
| | 464 | |
|---|
| | 465 | if (max_extended_cpuid >= 0x8000_0006) { |
|---|
| | 466 | // AMD K6-III or K6-2+ or later. |
|---|
| | 467 | ubyte numcores = 1; |
|---|
| | 468 | if (max_extended_cpuid >=0x8000_0008) { |
|---|
| | 469 | asm { |
|---|
| | 470 | mov EAX, 0x8000_0008; |
|---|
| | 471 | cpuid; |
|---|
| | 472 | mov numcores, CL; |
|---|
| | 473 | } |
|---|
| | 474 | ++numcores; |
|---|
| | 475 | if (numcores>maxCores) maxCores = numcores; |
|---|
| | 476 | } |
|---|
| | 477 | asm { |
|---|
| | 478 | mov EAX, 0x8000_0006; // L2/L3 cache |
|---|
| | 479 | cpuid; |
|---|
| | 480 | mov c6, ECX; // L2 cache info |
|---|
| | 481 | mov d6, EDX; // L3 cache info |
|---|
| | 482 | } |
|---|
| | 483 | |
|---|
| | 484 | ubyte [] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ]; |
|---|
| | 485 | datacache[1].size = (c6>>16) & 0xFFFF; |
|---|
| | 486 | datacache[1].associativity = assocmap[(c6>>12)&0xF]; |
|---|
| | 487 | datacache[1].lineSize = c6 & 0xFF; |
|---|
| | 488 | |
|---|
| | 489 | // The L3 cache value is TOTAL, not per core. |
|---|
| | 490 | datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1. |
|---|
| | 491 | datacache[2].associativity = assocmap[(d6>>12)&0xF]; |
|---|
| | 492 | datacache[2].lineSize = d6 & 0xFF; |
|---|
| | 493 | } |
|---|
| | 494 | } |
|---|
| | 495 | |
|---|
| | 496 | |
|---|
| | 497 | void cpuidX86() // queries CPUID and fills vendor, feature flags, family/model, core/thread counts, processor name and datacache[] |
|---|
| | 498 | { |
|---|
| | 499 | char * venptr = vendorID.ptr; |
|---|
| | 500 | void* buff = &this.max_cpuid; |
|---|
| | 501 | void* buff2 = &this.max_extended_cpuid; |
|---|
| | 502 | |
|---|
| | 503 | asm |
|---|
| | 504 | { |
|---|
| | 505 | mov EAX, 0; |
|---|
| | 506 | cpuid; |
|---|
| | 507 | mov [buff], EAX; |
|---|
| | 508 | mov EAX, venptr; |
|---|
| | 509 | mov [EAX], EBX; |
|---|
| | 510 | mov [EAX + 4], EDX; |
|---|
| | 511 | mov [EAX + 8], ECX; |
|---|
| | 512 | mov EAX, 0x8000_0000; |
|---|
| | 513 | cpuid; |
|---|
| | 514 | mov [buff2], EAX; |
|---|
| | 515 | } |
|---|
| | 516 | |
|---|
| | 517 | buff = &this.probablyIntel; |
|---|
| | 518 | buff2 = &this.probablyAMD; |
|---|
| | 519 | |
|---|
| | 520 | |
|---|
| | 521 | this.probablyIntel = !memcmp(vendorID.ptr, cast(char*)"GenuineIntel", 12); |
|---|
| | 522 | this.probablyAMD = !memcmp(vendorID.ptr, cast(char*)"AuthenticAMD", 12); |
|---|
| | 523 | |
|---|
| | 524 | uint a, b, c, d; |
|---|
| | 525 | uint apic = 0; // brand index, apic id |
|---|
| | 526 | |
|---|
| | 527 | buff = &this.miscfeatures; |
|---|
| | 528 | buff2 = &this.features; |
|---|
| | 529 | |
|---|
| | 530 | asm { |
|---|
| | 531 | mov EAX, 1; // model, stepping |
|---|
| | 532 | cpuid; |
|---|
| | 533 | mov a, EAX; |
|---|
| | 534 | mov apic, EBX; |
|---|
| | 535 | mov [buff], ECX; |
|---|
| | 536 | mov [buff2], EDX; |
|---|
| | 537 | } |
|---|
| | 538 | amdfeatures = 0; |
|---|
| | 539 | amdmiscfeatures = 0; |
|---|
| | 540 | if (max_extended_cpuid >= 0x8000_0001) { |
|---|
| | 541 | buff = &this.amdmiscfeatures; |
|---|
| | 542 | buff2 = &this.amdfeatures; |
|---|
| | 543 | asm { |
|---|
| | 544 | mov EAX, 0x8000_0001; |
|---|
| | 545 | cpuid; |
|---|
| | 546 | mov [buff], ECX; |
|---|
| | 547 | mov [buff2], EDX; |
|---|
| | 548 | } |
|---|
| | 549 | } |
|---|
| | 550 | // Try to detect fraudulent vendorIDs |
|---|
| | 551 | if (amd3dnow) probablyIntel = false; |
|---|
| | 552 | |
|---|
| | 553 | stepping = a & 0xF; |
|---|
| | 554 | uint fbase = (a >> 8) & 0xF; |
|---|
| | 555 | uint mbase = (a >> 4) & 0xF; |
|---|
| | 556 | family = ((fbase == 0xF) || (fbase == 0)) ? fbase + ((a >> 20) & 0xFF) : fbase; // mask the extended family before adding ('+' binds tighter than '&') |
|---|
| | 557 | model = ((fbase == 0xF) || (fbase == 6 && probablyIntel) ) ? |
|---|
| | 558 | mbase + ((a >> 12) & 0xF0) : mbase; |
|---|
| | 559 | |
|---|
| | 560 | if (!probablyIntel && max_extended_cpuid >= 0x8000_0008) { |
|---|
| | 561 | // determine max number of cores for AMD |
|---|
| | 562 | asm { |
|---|
| | 563 | mov EAX, 0x8000_0008; |
|---|
| | 564 | cpuid; |
|---|
| | 565 | mov c, ECX; |
|---|
| | 566 | } |
|---|
| | 567 | uint apicsize = (c>>12) & 0xF; |
|---|
| | 568 | if (apicsize == 0) { |
|---|
| | 569 | // use legacy method |
|---|
| | 570 | if (hyperThreadingBit) maxCores = (c & 0xFF) + 1; // ECX[7:0] holds cores minus one, as in getAMDcacheinfo |
|---|
| | 571 | else maxCores = 1; |
|---|
| | 572 | } else { |
|---|
| | 573 | // maxcores = 2^ apicsize |
|---|
| | 574 | maxCores = 1; |
|---|
| | 575 | while (apicsize) { maxCores<<=1; --apicsize; } |
|---|
| | 576 | } |
|---|
| | 577 | } |
|---|
| | 578 | |
|---|
| | 579 | if (max_extended_cpuid >= 0x8000_0004) { |
|---|
| | 580 | char *procptr = processorNameBuffer.ptr; |
|---|
| | 581 | asm { |
|---|
| | 582 | push ESI; |
|---|
| | 583 | mov ESI, procptr; |
|---|
| | 584 | mov EAX, 0x8000_0002; |
|---|
| | 585 | cpuid; |
|---|
| | 586 | mov [ESI], EAX; |
|---|
| | 587 | mov [ESI+4], EBX; |
|---|
| | 588 | mov [ESI+8], ECX; |
|---|
| | 589 | mov [ESI+12], EDX; |
|---|
| | 590 | mov EAX, 0x8000_0003; |
|---|
| | 591 | cpuid; |
|---|
| | 592 | mov [ESI+16], EAX; |
|---|
| | 593 | mov [ESI+20], EBX; |
|---|
| | 594 | mov [ESI+24], ECX; |
|---|
| | 595 | mov [ESI+28], EDX; |
|---|
| | 596 | mov EAX, 0x8000_0004; |
|---|
| | 597 | cpuid; |
|---|
| | 598 | mov [ESI+32], EAX; |
|---|
| | 599 | mov [ESI+36], EBX; |
|---|
| | 600 | mov [ESI+40], ECX; |
|---|
| | 601 | mov [ESI+44], EDX; |
|---|
| | 602 | pop ESI; |
|---|
| | 603 | } |
|---|
| | 604 | // Intel P4 and PM pad at front with spaces. |
|---|
| | 605 | // Other CPUs pad at end with nulls. |
|---|
| | 606 | int start = 0, end = 0; |
|---|
| | 607 | while (start < processorNameBuffer.length && processorNameBuffer[start] == ' ') { ++start; } // bounded: buffer may be all spaces |
|---|
| | 608 | while (end < processorNameBuffer.length - start && processorNameBuffer[$-end-1] == 0) { ++end; } // bounded: buffer may be all nulls |
|---|
| | 609 | processorName = processorNameBuffer[start..$-end]; |
|---|
| | 610 | } else { |
|---|
| | 611 | processorName = "Unknown CPU"; |
|---|
| | 612 | } |
|---|
| | 613 | // Determine cache sizes |
|---|
| | 614 | |
|---|
| | 615 | // Intel docs specify that they return 0 for 0x8000_0005. |
|---|
| | 616 | // AMD docs do not specify the behaviour for 0004 and 0002. |
|---|
| | 617 | // Centaur/VIA and most other manufacturers use the AMD method, |
|---|
| | 618 | // except Cyrix MediaGX MMX Enhanced uses their OWN form of CPUID2! |
|---|
| | 619 | // NS Geode GX1 provides CyrixCPUID2 _and_ does the same wrong behaviour |
|---|
| | 620 | // for CPUID80000005. But Geode GX uses the AMD method |
|---|
| | 621 | |
|---|
| | 622 | // Deal with idiotic Geode GX1 - make it same as MediaGX MMX. |
|---|
| | 623 | if (max_extended_cpuid==0x8000_0005 && max_cpuid==2) { |
|---|
| | 624 | max_extended_cpuid = 0x8000_0004; |
|---|
| | 625 | } |
|---|
| | 626 | // Therefore, we try the AMD method unless it's an Intel chip. |
|---|
| | 627 | // If we still have no info, try the Intel methods. |
|---|
| | 628 | datacache[0].size = 0; |
|---|
| | 629 | if (max_cpuid<2 || !probablyIntel) { |
|---|
| | 630 | if (max_extended_cpuid >= 0x8000_0005) { |
|---|
| | 631 | getAMDcacheinfo(); |
|---|
| | 632 | } else if (probablyAMD) { |
|---|
| | 633 | // According to AMDProcRecognitionAppNote, this means CPU |
|---|
| | 634 | // K5 model 0, or Am5x86 (model 4), or Am4x86DX4 (model 4) |
|---|
| | 635 | // Am5x86 has 16Kb 4-way unified data & code cache. |
|---|
| | 636 | datacache[0].size = 8; |
|---|
| | 637 | datacache[0].associativity = 4; |
|---|
| | 638 | datacache[0].lineSize = 32; |
|---|
| | 639 | } else { |
|---|
| | 640 | // Some obscure CPU. |
|---|
| | 641 | // Values for Cyrix 6x86MX (family 6, model 0) |
|---|
| | 642 | datacache[0].size = 64; |
|---|
| | 643 | datacache[0].associativity = 4; |
|---|
| | 644 | datacache[0].lineSize = 32; |
|---|
| | 645 | } |
|---|
| | 646 | } |
|---|
| | 647 | if ((datacache[0].size == 0) && max_cpuid>=4) { |
|---|
| | 648 | getcacheinfoCPUID4(); |
|---|
| | 649 | } |
|---|
| | 650 | if ((datacache[0].size == 0) && max_cpuid>=2) { |
|---|
| | 651 | getcacheinfoCPUID2(); |
|---|
| | 652 | } |
|---|
| | 653 | if (datacache[0].size == 0) { |
|---|
| | 654 | // Pentium, PMMX, late model 486, or an obscure CPU |
|---|
| | 655 | if (mmx) { // Pentium MMX. Also has 8kB code cache. |
|---|
| | 656 | datacache[0].size = 16; |
|---|
| | 657 | datacache[0].associativity = 4; |
|---|
| | 658 | datacache[0].lineSize = 32; |
|---|
| | 659 | } else { // Pentium 1 (which also has 8kB code cache) |
|---|
| | 660 | // or 486. |
|---|
| | 661 | // Cyrix 6x86: 16, 4way, 32 linesize |
|---|
| | 662 | datacache[0].size = 8; |
|---|
| | 663 | datacache[0].associativity = 2; |
|---|
| | 664 | datacache[0].lineSize = 32; |
|---|
| | 665 | } |
|---|
| | 666 | } |
|---|
| | 667 | if (hyperThreadingBit) maxThreads = (apic>>>16) & 0xFF; |
|---|
| | 668 | else maxThreads = maxCores; |
|---|
| | 669 | } |
|---|
| | 670 | |
|---|
| | 671 | // Return true if the cpuid instruction is supported. |
|---|
| | 672 | // BUG(WONTFIX): Doesn't work for Cyrix 6x86 and 6x86L. |
|---|
| | 673 | bool hasCPUID() |
|---|
| | 674 | { |
|---|
| | 675 | uint flags; |
|---|
| | 676 | asm { |
|---|
| | 677 | pushfd; |
|---|
| | 678 | pop EAX; |
|---|
| | 679 | mov flags, EAX; |
|---|
| | 680 | xor EAX, 0x0020_0000; |
|---|
| | 681 | push EAX; |
|---|
| | 682 | popfd; |
|---|
| | 683 | pushfd; |
|---|
| | 684 | pop EAX; |
|---|
| | 685 | xor flags, EAX; |
|---|
| | 686 | } |
|---|
| | 687 | return (flags & 0x0020_0000) !=0; |
|---|
| | 688 | } |
|---|
| | 689 | |
|---|
| | 690 | } else { // inline asm X86 |
|---|
| | 691 | |
|---|
| | 692 | bool hasCPUID() { return false; } |
|---|
| | 693 | |
|---|
| | 694 | void cpuidX86() |
|---|
| | 695 | { |
|---|
| | 696 | datacache[0].size = 8; |
|---|
| | 697 | datacache[0].associativity = 2; |
|---|
| | 698 | datacache[0].lineSize = 32; |
|---|
| | 699 | } |
|---|
| | 700 | } |
|---|
| | 701 | |
|---|
| | 702 | // TODO: Implement this function with OS support |
|---|
| | 703 | void cpuidPPC() |
|---|
| | 704 | { |
|---|
| | 705 | enum :int { PPC601, PPC603, PPC603E, PPC604, |
|---|
| | 706 | PPC604E, PPC620, PPCG3, PPCG4, PPCG5 }; |
|---|
| | 707 | |
|---|
| | 708 | // TODO: |
|---|
| | 709 | // asm { mfpvr; } returns the CPU version but unfortunately it can |
|---|
| | 710 | // only be used in kernel mode. So OS support is required. |
|---|
| | 711 | int cputype = PPC603; |
|---|
| | 712 | |
|---|
| | 713 | // 601 has a 8KB combined data & code L1 cache. |
|---|
| | 714 | uint sizes[] = [4, 8, 16, 16, 32, 32, 32, 32, 64]; |
|---|
| | 715 | ubyte ways[] = [8, 2, 4, 4, 4, 8, 8, 8, 8]; |
|---|
| | 716 | uint L2size[]= [0, 0, 0, 0, 0, 0, 0, 256, 512]; |
|---|
| | 717 | uint L3size[]= [0, 0, 0, 0, 0, 0, 0, 2048, 0]; |
|---|
| | 718 | |
|---|
| | 719 | datacache[0].size = sizes[cputype]; |
|---|
| | 720 | datacache[0].associativity = ways[cputype]; |
|---|
| | 721 | datacache[0].lineSize = (cputype==PPCG5)? 128 : |
|---|
| | 722 | (cputype == PPC620 || cputype == PPCG3)? 64 : 32; |
|---|
| | 723 | datacache[1].size = L2size[cputype]; |
|---|
| | 724 | datacache[2].size = L3size[cputype]; |
|---|
| | 725 | datacache[1].lineSize = datacache[0].lineSize; |
|---|
| | 726 | datacache[2].lineSize = datacache[0].lineSize; |
|---|
| | 727 | } |
|---|
| | 728 | |
|---|
| | 729 | // TODO: Implement this function with OS support |
|---|
| | 730 | void cpuidSparc() |
|---|
| | 731 | { |
|---|
| | 732 | // UltraSparcIIi : L1 = 16, 2way. L2 = 512, 4 way. |
|---|
| | 733 | // UltraSparcIII : L1 = 64, 4way. L2= 4096 or 8192. |
|---|
| | 734 | // UltraSparcIIIi: L1 = 64, 4way. L2= 1024, 4 way |
|---|
| | 735 | // UltraSparcIV : L1 = 64, 4way. L2 = 16*1024. |
|---|
| | 736 | // UltraSparcIV+ : L1 = 64, 4way. L2 = 2048, L3=32*1024. |
|---|
| | 737 | // Sparc64V : L1 = 128, 2way. L2 = 4096 4way. |
|---|
| | 738 | } |
|---|
| | 739 | |
|---|
| | 740 | private void Init() |
|---|
| | 741 | { |
|---|
| | 742 | if (hasCPUID()) { |
|---|
| | 743 | cpuidX86(); |
|---|
| | 744 | } else { |
|---|
| | 745 | // it's a 386 or 486, or a Cyrix 6x86. |
|---|
| | 746 | //Probably still has an external cache. |
|---|
| | 747 | } |
|---|
| | 748 | if (datacache[0].size==0) { |
|---|
| | 749 | // Guess same as Pentium 1. |
|---|
| | 750 | datacache[0].size = 8; |
|---|
| | 751 | datacache[0].associativity = 2; |
|---|
| | 752 | datacache[0].lineSize = 32; |
|---|
| | 753 | } |
|---|
| | 754 | numCacheLevels = 1; |
|---|
| | 755 | // And now fill up all the unused levels with full memory space. |
|---|
| | 756 | for (int i=1; i< datacache.length; ++i) { |
|---|
| | 757 | if (datacache[i].size==0) { |
|---|
| | 758 | // Set all remaining levels of cache equal to full address space. |
|---|
| | 759 | datacache[i].size = uint.max/1024; |
|---|
| | 760 | datacache[i].associativity = 1; |
|---|
| | 761 | datacache[i].lineSize = datacache[i-1].lineSize; |
|---|
| | 762 | } else numCacheLevels = i+1; |
|---|
| | 763 | } |
|---|
| | 764 | } |
|---|
| | 765 | |
|---|
| 658 | 766 | |
|---|
| 659 | | // TODO: |
|---|
| 660 | | // asm { mfpvr; } returns the CPU version but unfortunately it can |
|---|
| 661 | | // only be used in kernel mode. So OS support is required. |
|---|
| 662 | | int cputype = PPC603; |
|---|
| 663 | | |
|---|
| 664 | | // 601 has a 8KB combined data & code L1 cache. |
|---|
| 665 | | uint sizes[] = [4, 8, 16, 16, 32, 32, 32, 32, 64]; |
|---|
| 666 | | ubyte ways[] = [8, 2, 4, 4, 4, 8, 8, 8, 8]; |
|---|
| 667 | | uint L2size[]= [0, 0, 0, 0, 0, 0, 0, 256, 512]; |
|---|
| 668 | | uint L3size[]= [0, 0, 0, 0, 0, 0, 0, 2048, 0]; |
|---|
| 669 | | |
|---|
| 670 | | datacache[0].size = sizes[cputype]; |
|---|
| 671 | | datacache[0].associativity = ways[cputype]; |
|---|
| 672 | | datacache[0].lineSize = (cputype==PPCG5)? 128 : |
|---|
| 673 | | (cputype == PPC620 || cputype == PPCG3)? 64 : 32; |
|---|
| 674 | | datacache[1].size = L2size[cputype]; |
|---|
| 675 | | datacache[2].size = L3size[cputype]; |
|---|
| 676 | | datacache[1].lineSize = datacache[0].lineSize; |
|---|
| 677 | | datacache[2].lineSize = datacache[0].lineSize; |
|---|
| 678 | 767 | } |
|---|
| 679 | 768 | |
|---|
| 680 | | // TODO: Implement this function with OS support |
|---|
| 681 | | void cpuidSparc() |
|---|
| 682 | | { |
|---|
| 683 | | // UltaSparcIIi : L1 = 16, 2way. L2 = 512, 4 way. |
|---|
| 684 | | // UltraSparcIII : L1 = 64, 4way. L2= 4096 or 8192. |
|---|
| 685 | | // UltraSparcIIIi: L1 = 64, 4way. L2= 1024, 4 way |
|---|
| 686 | | // UltraSparcIV : L1 = 64, 4way. L2 = 16*1024. |
|---|
| 687 | | // UltraSparcIV+ : L1 = 64, 4way. L2 = 2048, L3=32*1024. |
|---|
| 688 | | // Sparc64V : L1 = 128, 2way. L2 = 4096 4way. |
|---|
| 689 | | } |
|---|
| | 769 | static cpuid Cpuid; |
|---|
| 690 | 770 | |
|---|
| 691 | 771 | |
|---|
| 692 | 772 | static this() |
|---|
| 693 | 773 | { |
|---|
| 694 | | if (hasCPUID()) { |
|---|
| 695 | | cpuidX86(); |
|---|
| 696 | | } else { |
|---|
| 697 | | // it's a 386 or 486, or a Cyrix 6x86. |
|---|
| 698 | | //Probably still has an external cache. |
|---|
| 699 | | } |
|---|
| 700 | | if (datacache[0].size==0) { |
|---|
| 701 | | // Guess same as Pentium 1. |
|---|
| 702 | | datacache[0].size = 8; |
|---|
| 703 | | datacache[0].associativity = 2; |
|---|
| 704 | | datacache[0].lineSize = 32; |
|---|
| 705 | | } |
|---|
| 706 | | numCacheLevels = 1; |
|---|
| 707 | | // And now fill up all the unused levels with full memory space. |
|---|
| 708 | | for (int i=1; i< datacache.length; ++i) { |
|---|
| 709 | | if (datacache[i].size==0) { |
|---|
| 710 | | // Set all remaining levels of cache equal to full address space. |
|---|
| 711 | | datacache[i].size = uint.max/1024; |
|---|
| 712 | | datacache[i].associativity = 1; |
|---|
| 713 | | datacache[i].lineSize = datacache[i-1].lineSize; |
|---|
| 714 | | } else numCacheLevels = i+1; |
|---|
| 715 | | } |
|---|
| | 774 | Cpuid.Init(); |
|---|
| 716 | 775 | } |
|---|
| 717 | 776 | |
|---|
| 718 | | |
|---|
| 719 | | |
|---|
| 720 | | |
|---|
| 721 | 777 | debug (Cpuid) |
|---|
| 722 | 778 | { |
|---|
| 723 | 779 | private import tango.io.Stdout; |
|---|
| 724 | 780 | |
|---|
| 725 | 781 | void main() |
|---|
| 726 | 782 | { |
|---|
| 727 | | Stdout.formatln ("{}, {} threads, {} cores", processor, threadsPerCPU, coresPerCPU); |
|---|
| | 783 | Stdout.formatln ("{}, {} threads, {} cores", Cpuid.processor, Cpuid.threadsPerCPU, Cpuid.coresPerCPU); |
|---|
| 728 | 784 | } |
|---|
| 729 | 785 | } |
Download in other formats:
|
 |