Changeset 3889
- Timestamp: 08/18/08 15:05:53 (4 months ago)
- Files:
- branches/experimental/D2.0/example/networking/selector.d (modified) (8 diffs)
- branches/experimental/D2.0/install/windows/libbz2.lib (copied) (copied from trunk/install/windows/libbz2.lib)
- branches/experimental/D2.0/install/windows/zlib.lib (copied) (copied from trunk/install/windows/zlib.lib)
- branches/experimental/D2.0/lib/compiler/dmd/arraybyte.d (copied) (copied from trunk/lib/compiler/dmd/arraybyte.d)
- branches/experimental/D2.0/lib/compiler/dmd/arraydouble.d (modified) (10 diffs)
- branches/experimental/D2.0/lib/compiler/dmd/arrayfloat.d (modified) (12 diffs)
- branches/experimental/D2.0/lib/compiler/dmd/arrayint.d (copied) (copied from trunk/lib/compiler/dmd/arrayint.d)
- branches/experimental/D2.0/lib/compiler/dmd/arrayreal.d (modified) (6 diffs)
- branches/experimental/D2.0/lib/compiler/dmd/arrayshort.d (copied) (copied from trunk/lib/compiler/dmd/arrayshort.d)
- branches/experimental/D2.0/lib/compiler/dmd/genobj.d (modified) (3 diffs)
- branches/experimental/D2.0/lib/compiler/dmd/posix.mak (modified) (1 diff)
- branches/experimental/D2.0/lib/compiler/dmd/win32.mak (modified) (1 diff)
- branches/experimental/D2.0/lib/compiler/gdc/genobj.d (modified) (3 diffs)
- branches/experimental/D2.0/lib/gc/stub/gc.d (modified) (2 diffs)
- branches/experimental/D2.0/tango/group (deleted)
- branches/experimental/D2.0/tango/io/FileConst.d (deleted)
- branches/experimental/D2.0/tango/io/FileRoots.d (deleted)
- branches/experimental/D2.0/tango/io/GrowBuffer.d (deleted)
- branches/experimental/D2.0/tango/io/Path.d (modified) (1 diff)
- branches/experimental/D2.0/tango/io/selector/AbstractSelector.d (modified) (1 diff)
- branches/experimental/D2.0/tango/math/BigInt.d (copied) (copied from trunk/tango/math/BigInt.d)
- branches/experimental/D2.0/tango/math/Math.d (modified) (1 diff)
- branches/experimental/D2.0/tango/math/internal/BignumNoAsm.d (modified) (2 diffs)
- branches/experimental/D2.0/tango/math/internal/BignumX86.d (modified) (4 diffs)
- branches/experimental/D2.0/tango/math/internal/BiguintCore.d (modified) (11 diffs)
- branches/experimental/D2.0/tango/net/Socket.d (modified) (1 diff)
- branches/experimental/D2.0/tango/net/SocketPool.d (modified) (1 diff)
- branches/experimental/D2.0/tango/net/cluster/tina/Cluster.d (modified) (1 diff)
- branches/experimental/D2.0/tango/net/ftp/Telnet.d (modified) (1 diff)
- branches/experimental/D2.0/tango/net/http/ChunkStream.d (modified) (4 diffs)
- branches/experimental/D2.0/tango/sys/SharedLib.d (modified) (1 diff)
- branches/experimental/D2.0/tango/text/Ascii.d (modified) (1 diff)
- branches/experimental/D2.0/tango/text/Util.d (modified) (2 diffs)
- branches/experimental/D2.0/tango/text/convert/Integer.d (modified) (4 diffs)
- branches/experimental/D2.0/tango/text/convert/TimeStamp.d (modified) (1 diff)
- branches/experimental/D2.0/tango/text/json (copied) (copied from trunk/tango/text/json)
- branches/experimental/D2.0/tango/text/json/Json.d (copied) (copied from trunk/tango/text/json/Json.d)
- branches/experimental/D2.0/tango/text/json/JsonEscape.d (copied) (copied from trunk/tango/text/json/JsonEscape.d)
- branches/experimental/D2.0/tango/text/json/JsonParser.d (copied) (copied from trunk/tango/text/json/JsonParser.d)
- branches/experimental/D2.0/tango/text/stream/StreamIterator.d (modified) (1 diff)
- branches/experimental/D2.0/tango/text/xml/DocEntity.d (copied) (copied from trunk/tango/text/xml/DocEntity.d)
- branches/experimental/D2.0/tango/text/xml/XmlPrinter.d (deleted)
- branches/experimental/D2.0/tango/time/Clock.d (modified) (2 diffs)
- branches/experimental/D2.0/tango/time/ISO8601.d (modified) (1 diff)
- branches/experimental/D2.0/tango/time/Time.d (modified) (10 diffs)
- branches/experimental/D2.0/tango/time/WallClock.d (modified) (4 diffs)
- branches/experimental/D2.0/tango/time/chrono/Calendar.d (modified) (2 diffs)
- branches/experimental/D2.0/tango/time/chrono/Gregorian.d (modified) (5 diffs)
- branches/experimental/D2.0/tango/time/chrono/Hebrew.d (modified) (1 diff)
- branches/experimental/D2.0/tango/time/chrono/Hijri.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/ArrayBag.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/ArraySeq.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/CircularSeq.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/HashMap.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/HashSet.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/LinkMap.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/LinkSeq.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/TreeBag.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/collection/TreeMap.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/container/HashMap.d (modified) (1 diff)
- branches/experimental/D2.0/tango/util/container/LinkedList.d (modified) (3 diffs)
- branches/experimental/D2.0/tango/util/container/SortedMap.d (modified) (5 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
branches/experimental/D2.0/example/networking/selector.d
r3888 r3889 113 113 try 114 114 { 115 TimeSpan timeout = TimeSpan. seconds(1);115 TimeSpan timeout = TimeSpan.fromSeconds(1); 116 116 InternetAddress addr = new InternetAddress(SERVER_ADDR, SERVER_PORT); 117 117 ServerSocket serverSocket = new ServerSocket(addr, 5); … … 139 139 if (eventCount > 0) 140 140 { 141 ISelectable[] removeThese; 141 142 foreach (SelectionKey selectionKey; selector.selectedSet()) 142 143 { … … 178 179 log.trace(sprint("[{0}] Received {1} from client ({2} bytes)", 179 180 i, buffer[0..count], count)); 180 selector.re register(selectionKey.conduit, Event.Write);181 selector.register(selectionKey.conduit, Event.Write); 181 182 receiveCount++; 182 183 } … … 186 187 log.trace(sprint("[{0}] Handle {1} was closed; removing it from Selector", 187 188 i, cast(int) selectionKey.conduit.fileHandle())); 188 selector.unregister(selectionKey.conduit); 189 (cast(SocketConduit) selectionKey.conduit).close(); 189 // note, we cannot unregister because we are 190 // in the middle of a foreach loop. Delay 191 // unregistering and closing until after the 192 // loop is done. 
193 //selector.unregister(selectionKey.conduit); 194 //(cast(SocketConduit) selectionKey.conduit).close(); 195 removeThese ~= selectionKey.conduit; 190 196 failedReceiveCount++; 191 197 continue; … … 205 211 log.trace(sprint("[{0}] Sent PONG to client ({1} bytes)", i, count)); 206 212 207 selector.re register(selectionKey.conduit, Event.Read);213 selector.register(selectionKey.conduit, Event.Read); 208 214 sendCount++; 209 215 } … … 213 219 log.trace(sprint("[{0}] Handle {1} was closed; removing it from Selector", 214 220 i, selectionKey.conduit.fileHandle())); 215 selector.unregister(selectionKey.conduit); 216 (cast(SocketConduit) selectionKey.conduit).close(); 221 // note, see comment above 222 //selector.unregister(selectionKey.conduit); 223 //(cast(SocketConduit) selectionKey.conduit).close(); 224 removeThese ~= selectionKey.conduit; 217 225 failedSendCount++; 218 226 continue; … … 246 254 i, cast(int) selectionKey.conduit.fileHandle())); 247 255 } 248 selector.unregister(selectionKey.conduit); 249 (cast(Conduit) selectionKey.conduit).close(); 256 // note, see comment above 257 //selector.unregister(selectionKey.conduit); 258 //(cast(Conduit) selectionKey.conduit).close(); 259 removeThese ~= selectionKey.conduit; 250 260 251 261 if (selectionKey.conduit !is serverSocket) … … 258 268 } 259 269 } 270 } 271 foreach(c; removeThese) 272 { 273 selector.unregister(c); 274 (cast(Conduit) c).close(); 260 275 } 261 276 } branches/experimental/D2.0/lib/compiler/dmd/arraydouble.d
r3886 r3889 1 2 1 /*************************** 3 2 * D programming language http://www.digitalmars.com/d/ … … 9 8 import util.cpuid; 10 9 10 version (Unittest) 11 { 12 /* This is so unit tests will test every CPU variant 13 */ 14 int cpuid; 15 const int CPUID_MAX = 5; 16 bool mmx() { return cpuid == 1 && util.cpuid.mmx(); } 17 bool sse() { return cpuid == 2 && util.cpuid.sse(); } 18 bool sse2() { return cpuid == 3 && util.cpuid.sse2(); } 19 bool amd3dnow() { return cpuid == 4 && util.cpuid.amd3dnow(); } 20 } 21 else 22 { 23 import util.cpuid; 24 alias util.cpuid.mmx mmx; 25 alias util.cpuid.sse sse; 26 alias util.cpuid.sse2 sse2; 27 alias util.cpuid.amd3dnow amd3dnow; 28 } 29 30 //version = log; 31 11 32 bool disjoint(T)(T[] a, T[] b) 12 33 { 13 34 return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); 14 35 } 36 37 /* Performance figures measured by Burton Radons 38 */ 15 39 16 40 alias double T; … … 25 49 */ 26 50 27 T[] _a dAssAddDouble(T[] a, T[] c, T[] b)51 T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b) 28 52 in 29 53 { … … 35 59 body 36 60 { 61 auto aptr = a.ptr; 62 auto aend = aptr + a.length; 63 auto bptr = b.ptr; 64 auto cptr = c.ptr; 65 37 66 version (D_InlineAsm_X86) 38 67 { 39 auto aptr = a.ptr; 40 auto aend = aptr + a.length; 41 auto bptr = b.ptr; 42 auto cptr = c.ptr; 43 T* n; 44 45 // SSE version is 333% faster 46 if (util.cpuid.sse2() && b.length >= 16) 47 { 48 n = aptr + (b.length & ~15); 68 // SSE2 version is 333% faster 69 if (sse2() && b.length >= 16) 70 { 71 auto n = aptr + (b.length & ~15); 49 72 50 73 // Unaligned case … … 85 108 } 86 109 } 87 88 // Handle remainder 89 while (aptr < aend) 90 *aptr++ = *bptr++ + *cptr++; 91 } 92 else 93 { 94 for (int i = 0; i < a.length; i++) 95 a[i] = b[i] + c[i]; 96 } 110 } 111 112 // Handle remainder 113 while (aptr < aend) 114 *aptr++ = *bptr++ + *cptr++; 115 97 116 return a; 98 117 } … … 101 120 unittest 102 121 { 103 //printf("_adAssAddDouble unittest\n"); 104 105 { 106 T[] a = [1, 
2, 3]; 107 T[] b = [4, 5, 6]; 108 T[3] c; 109 110 c[] = a[] + b[]; 111 assert(c[0] == 5); 112 assert(c[1] == 7); 113 assert(c[2] == 9); 114 } 115 { 116 T[] a = [1, 2, 3, 4, 5, 6, 7, 8, 9]; 117 T[] b = [4, 5, 6, 7, 8, 9, 10, 11, 12]; 118 T[9] c; 119 120 c[] = a[] + b[]; 121 assert(c[0] == 5); 122 assert(c[1] == 7); 123 assert(c[2] == 9); 124 assert(c[3] == 11); 125 assert(c[4] == 13); 126 assert(c[5] == 15); 127 assert(c[6] == 17); 128 assert(c[7] == 19); 129 assert(c[8] == 21); 130 } 131 { 132 const int dim = 35; 133 T[dim] a; 134 T[dim] b; 135 T[dim] c; 136 137 for (int i = 0; i < dim; i++) 138 { a[i] = i; 139 b[i] = i + 7; 140 c[i] = i * 2; 141 } 142 143 c[] = a[] + b[]; 144 145 for (int i = 0; i < dim; i++) 146 { 147 assert(c[i] == a[i] + b[i]); 148 } 149 } 122 printf("_arraySliceSliceAddSliceAssign_d unittest\n"); 123 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 124 { 125 version (log) printf(" cpuid %d\n", cpuid); 126 127 for (int j = 0; j < 2; j++) 128 { 129 const int dim = 67; 130 T[] a = new T[dim + j]; // aligned on 16 byte boundary 131 a = a[j .. dim + j]; // misalign for second iteration 132 T[] b = new T[dim + j]; 133 b = b[j .. dim + j]; 134 T[] c = new T[dim + j]; 135 c = c[j .. 
dim + j]; 136 137 for (int i = 0; i < dim; i++) 138 { a[i] = cast(T)i; 139 b[i] = cast(T)(i + 7); 140 c[i] = cast(T)(i * 2); 141 } 142 143 c[] = a[] + b[]; 144 145 for (int i = 0; i < dim; i++) 146 { 147 if (c[i] != cast(T)(a[i] + b[i])) 148 { 149 printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); 150 assert(0); 151 } 152 } 153 } 154 } 150 155 } 151 156 … … 157 162 */ 158 163 159 T[] _a dAssMinDouble(T[] a, T[] c, T[] b)164 T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b) 160 165 in 161 166 { … … 167 172 body 168 173 { 174 auto aptr = a.ptr; 175 auto aend = aptr + a.length; 176 auto bptr = b.ptr; 177 auto cptr = c.ptr; 178 169 179 version (D_InlineAsm_X86) 170 180 { 171 auto aptr = a.ptr; 172 auto aend = aptr + a.length; 173 auto bptr = b.ptr; 174 auto cptr = c.ptr; 175 T* n; 176 177 // SSE version is 324% faster 178 if (util.cpuid.sse2() && b.length >= 8) 179 { 180 n = aptr + (b.length & ~7); 181 // SSE2 version is 324% faster 182 if (sse2() && b.length >= 8) 183 { 184 auto n = aptr + (b.length & ~7); 181 185 182 186 // Unaligned case … … 217 221 } 218 222 } 219 220 // Handle remainder 221 while (aptr < aend) 222 *aptr++ = *bptr++ - *cptr++; 223 } 224 else 225 { 226 for (int i = 0; i < a.length; i++) 227 a[i] = b[i] - c[i]; 228 } 223 } 224 225 // Handle remainder 226 while (aptr < aend) 227 *aptr++ = *bptr++ - *cptr++; 228 229 229 return a; 230 230 } … … 233 233 unittest 234 234 { 235 //printf("_adAssMinDouble unittest\n"); 236 237 { 238 T[] a = [1, 2, 3]; 239 T[] b = [4, 5, 6]; 240 T[3] c; 241 242 c[] = a[] - b[]; 243 244 for (int i = 0; i < c.length; i++) 245 { 246 assert(c[i] == a[i] - b[i]); 247 } 248 } 249 { 250 T[] a = [1, 2, 3, 4, 5, 6, 7, 8, 9]; 251 T[] b = [4, 5, 6, 7, 8, 9, 10, 11, 12]; 252 T[9] c; 253 254 c[] = a[] - b[]; 255 256 for (int i = 0; i < c.length; i++) 257 { 258 assert(c[i] == a[i] - b[i]); 259 } 260 } 261 { 262 const int dim = 35; 263 T[dim] a; 264 T[dim] b; 265 T[dim] c; 266 267 for (int i = 0; i < dim; i++) 268 { a[i] = i; 269 
b[i] = i + 7; 270 c[i] = i * 2; 271 } 272 273 c[] = a[] - b[]; 274 275 for (int i = 0; i < dim; i++) 276 { 277 assert(c[i] == a[i] - b[i]); 278 } 279 } 280 } 281 282 235 printf("_arraySliceSliceMinSliceAssign_d unittest\n"); 236 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 237 { 238 version (log) printf(" cpuid %d\n", cpuid); 239 240 for (int j = 0; j < 2; j++) 241 { 242 const int dim = 67; 243 T[] a = new T[dim + j]; // aligned on 16 byte boundary 244 a = a[j .. dim + j]; // misalign for second iteration 245 T[] b = new T[dim + j]; 246 b = b[j .. dim + j]; 247 T[] c = new T[dim + j]; 248 c = c[j .. dim + j]; 249 250 for (int i = 0; i < dim; i++) 251 { a[i] = cast(T)i; 252 b[i] = cast(T)(i + 7); 253 c[i] = cast(T)(i * 2); 254 } 255 256 c[] = a[] - b[]; 257 258 for (int i = 0; i < dim; i++) 259 { 260 if (c[i] != cast(T)(a[i] - b[i])) 261 { 262 printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]); 263 assert(0); 264 } 265 } 266 } 267 } 268 } 269 270 271 /* ======================================================================== */ 272 273 /*********************** 274 * Computes: 275 * a[] = b[] + value 276 */ 277 278 T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b) 279 in 280 { 281 assert(a.length == b.length); 282 assert(disjoint(a, b)); 283 } 284 body 285 { 286 //printf("_arraySliceExpAddSliceAssign_d()\n"); 287 auto aptr = a.ptr; 288 auto aend = aptr + a.length; 289 auto bptr = b.ptr; 290 291 version (D_InlineAsm_X86) 292 { 293 // SSE2 version is 305% faster 294 if (sse2() && a.length >= 8) 295 { 296 auto n = aptr + (a.length & ~7); 297 298 // Unaligned case 299 asm 300 { 301 mov EAX, bptr; 302 mov ESI, aptr; 303 mov EDI, n; 304 movsd XMM4, value; 305 shufpd XMM4, XMM4, 0; 306 307 align 8; 308 startsseloop: 309 add ESI, 64; 310 movupd XMM0, [EAX]; 311 movupd XMM1, [EAX+16]; 312 movupd XMM2, [EAX+32]; 313 movupd XMM3, [EAX+48]; 314 add EAX, 64; 315 addpd XMM0, XMM4; 316 addpd XMM1, XMM4; 317 addpd XMM2, XMM4; 318 addpd XMM3, XMM4; 319 movupd [ESI+ 0-64], 
XMM0; 320 movupd [ESI+16-64], XMM1; 321 movupd [ESI+32-64], XMM2; 322 movupd [ESI+48-64], XMM3; 323 cmp ESI, EDI; 324 jb startsseloop; 325 326 mov aptr, ESI; 327 mov bptr, EAX; 328 } 329 } 330 } 331 332 while (aptr < aend) 333 *aptr++ = *bptr++ + value; 334 335 return a; 336 } 337 338 unittest 339 { 340 printf("_arraySliceExpAddSliceAssign_d unittest\n"); 341 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 342 { 343 version (log) printf(" cpuid %d\n", cpuid); 344 345 for (int j = 0; j < 2; j++) 346 { 347 const int dim = 67; 348 T[] a = new T[dim + j]; // aligned on 16 byte boundary 349 a = a[j .. dim + j]; // misalign for second iteration 350 T[] b = new T[dim + j]; 351 b = b[j .. dim + j]; 352 T[] c = new T[dim + j]; 353 c = c[j .. dim + j]; 354 355 for (int i = 0; i < dim; i++) 356 { a[i] = cast(T)i; 357 b[i] = cast(T)(i + 7); 358 c[i] = cast(T)(i * 2); 359 } 360 361 c[] = a[] + 6; 362 363 for (int i = 0; i < dim; i++) 364 { 365 if (c[i] != cast(T)(a[i] + 6)) 366 { 367 printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); 368 assert(0); 369 } 370 } 371 } 372 } 373 } 374 375 /* ======================================================================== */ 376 377 /*********************** 378 * Computes: 379 * a[] += value 380 */ 381 382 T[] _arrayExpSliceAddass_d(T[] a, T value) 383 { 384 //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); 385 auto aptr = a.ptr; 386 auto aend = aptr + a.length; 387 388 version (D_InlineAsm_X86) 389 { 390 // SSE2 version is 114% faster 391 if (sse2() && a.length >= 8) 392 { 393 auto n = cast(T*)((cast(uint)aend) & ~7); 394 if (aptr < n) 395 396 // Unaligned case 397 asm 398 { 399 mov ESI, aptr; 400 mov EDI, n; 401 movsd XMM4, value; 402 shufpd XMM4, XMM4, 0; 403 404 align 8; 405 startsseloopa: 406 movupd XMM0, [ESI]; 407 movupd XMM1, [ESI+16]; 408 movupd XMM2, [ESI+32]; 409 movupd XMM3, [ESI+48]; 410 add ESI, 64; 411 addpd XMM0, XMM4; 412 addpd XMM1, XMM4; 413 addpd XMM2, XMM4; 414 addpd XMM3, XMM4; 415 
movupd [ESI+ 0-64], XMM0; 416 movupd [ESI+16-64], XMM1; 417 movupd [ESI+32-64], XMM2; 418 movupd [ESI+48-64], XMM3; 419 cmp ESI, EDI; 420 jb startsseloopa; 421 422 mov aptr, ESI; 423 } 424 } 425 } 426 427 while (aptr < aend) 428 *aptr++ += value; 429 430 return a; 431 } 432 433 unittest 434 { 435 printf("_arrayExpSliceAddass_d unittest\n"); 436 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 437 { 438 version (log) printf(" cpuid %d\n", cpuid); 439 440 for (int j = 0; j < 2; j++) 441 { 442 const int dim = 67; 443 T[] a = new T[dim + j]; // aligned on 16 byte boundary 444 a = a[j .. dim + j]; // misalign for second iteration 445 T[] b = new T[dim + j]; 446 b = b[j .. dim + j]; 447 T[] c = new T[dim + j]; 448 c = c[j .. dim + j]; 449 450 for (int i = 0; i < dim; i++) 451 { a[i] = cast(T)i; 452 b[i] = cast(T)(i + 7); 453 c[i] = cast(T)(i * 2); 454 } 455 456 a[] = c[]; 457 c[] += 6; 458 459 for (int i = 0; i < dim; i++) 460 { 461 if (c[i] != cast(T)(a[i] + 6)) 462 { 463 printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); 464 assert(0); 465 } 466 } 467 } 468 } 469 } 470 471 /* ======================================================================== */ 472 473 /*********************** 474 * Computes: 475 * a[] += b[] 476 */ 477 478 T[] _arraySliceSliceAddass_d(T[] a, T[] b) 479 in 480 { 481 assert (a.length == b.length); 482 assert (disjoint(a, b)); 483 } 484 body 485 { 486 //printf("_arraySliceSliceAddass_d()\n"); 487 auto aptr = a.ptr; 488 auto aend = aptr + a.length; 489 auto bptr = b.ptr; 490 491 version (D_InlineAsm_X86) 492 { 493 // SSE2 version is 183% faster 494 if (sse2() && a.length >= 8) 495 { 496 auto n = aptr + (a.length & ~7); 497 498 // Unaligned case 499 asm 500 { 501 mov ECX, bptr; // right operand 502 mov ESI, aptr; // destination operand 503 mov EDI, n; // end comparison 504 505 align 8; 506 startsseloopb: 507 movupd XMM0, [ESI]; 508 movupd XMM1, [ESI+16]; 509 movupd XMM2, [ESI+32]; 510 movupd XMM3, [ESI+48]; 511 add ESI, 64; 512 movupd XMM4, [ECX]; 513 movupd 
XMM5, [ECX+16]; 514 movupd XMM6, [ECX+32]; 515 movupd XMM7, [ECX+48]; 516 add ECX, 64; 517 addpd XMM0, XMM4; 518 addpd XMM1, XMM5; 519 addpd XMM2, XMM6; 520 addpd XMM3, XMM7; 521 movupd [ESI+ 0-64], XMM0; 522 movupd [ESI+16-64], XMM1; 523 movupd [ESI+32-64], XMM2; 524 movupd [ESI+48-64], XMM3; 525 cmp ESI, EDI; 526 jb startsseloopb; 527 528 mov aptr, ESI; 529 mov bptr, ECX; 530 } 531 } 532 } 533 534 while (aptr < aend) 535 *aptr++ += *bptr++; 536 537 return a; 538 } 539 540 unittest 541 { 542 printf("_arraySliceSliceAddass_d unittest\n"); 543 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 544 { 545 version (log) printf(" cpuid %d\n", cpuid); 546 547 for (int j = 0; j < 2; j++) 548 { 549 const int dim = 67; 550 T[] a = new T[dim + j]; // aligned on 16 byte boundary 551 a = a[j .. dim + j]; // misalign for second iteration 552 T[] b = new T[dim + j]; 553 b = b[j .. dim + j]; 554 T[] c = new T[dim + j]; 555 c = c[j .. dim + j]; 556 557 for (int i = 0; i < dim; i++) 558 { a[i] = cast(T)i; 559 b[i] = cast(T)(i + 7); 560 c[i] = cast(T)(i * 2); 561 } 562 563 a[] = c[]; 564 c[] += b[]; 565 566 for (int i = 0; i < dim; i++) 567 { 568 if (c[i] != cast(T)(a[i] + b[i])) 569 { 570 printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); 571 assert(0); 572 } 573 } 574 } 575 } 576 } 577 578 /* ======================================================================== */ 579 580 /*********************** 581 * Computes: 582 * a[] = b[] - value 583 */ 584 585 T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b) 586 in 587 { 588 assert (a.length == b.length); 589 assert (disjoint(a, b)); 590 } 591 body 592 { 593 //printf("_arraySliceExpMinSliceAssign_d()\n"); 594 auto aptr = a.ptr; 595 auto aend = aptr + a.length; 596 auto bptr = b.ptr; 597 598 version (D_InlineAsm_X86) 599 { 600 // SSE2 version is 305% faster 601 if (sse2() && a.length >= 8) 602 { 603 auto n = aptr + (a.length & ~7); 604 605 // Unaligned case 606 asm 607 { 608 mov EAX, bptr; 609 mov ESI, aptr; 610 mov EDI, n; 611 movsd 
XMM4, value; 612 shufpd XMM4, XMM4, 0; 613 614 align 8; 615 startsseloop: 616 add ESI, 64; 617 movupd XMM0, [EAX]; 618 movupd XMM1, [EAX+16]; 619 movupd XMM2, [EAX+32]; 620 movupd XMM3, [EAX+48]; 621 add EAX, 64; 622 subpd XMM0, XMM4; 623 subpd XMM1, XMM4; 624 subpd XMM2, XMM4; 625 subpd XMM3, XMM4; 626 movupd [ESI+ 0-64], XMM0; 627 movupd [ESI+16-64], XMM1; 628 movupd [ESI+32-64], XMM2; 629 movupd [ESI+48-64], XMM3; 630 cmp ESI, EDI; 631 jb startsseloop; 632 633 mov aptr, ESI; 634 mov bptr, EAX; 635 } 636 } 637 } 638 639 while (aptr < aend) 640 *aptr++ = *bptr++ - value; 641 642 return a; 643 } 644 645 unittest 646 { 647 printf("_arraySliceExpMinSliceAssign_d unittest\n"); 648 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 649 { 650 version (log) printf(" cpuid %d\n", cpuid); 651 652 for (int j = 0; j < 2; j++) 653 { 654 const int dim = 67; 655 T[] a = new T[dim + j]; // aligned on 16 byte boundary 656 a = a[j .. dim + j]; // misalign for second iteration 657 T[] b = new T[dim + j]; 658 b = b[j .. dim + j]; 659 T[] c = new T[dim + j]; 660 c = c[j .. 
dim + j]; 661 662 for (int i = 0; i < dim; i++) 663 { a[i] = cast(T)i; 664 b[i] = cast(T)(i + 7); 665 c[i] = cast(T)(i * 2); 666 } 667 668 c[] = a[] - 6; 669 670 for (int i = 0; i < dim; i++) 671 { 672 if (c[i] != cast(T)(a[i] - 6)) 673 { 674 printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); 675 assert(0); 676 } 677 } 678 } 679 } 680 } 681 682 /* ======================================================================== */ 683 684 /*********************** 685 * Computes: 686 * a[] = value - b[] 687 */ 688 689 T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value) 690 in 691 { 692 assert (a.length == b.length); 693 assert (disjoint(a, b)); 694 } 695 body 696 { 697 //printf("_arrayExpSliceMinSliceAssign_d()\n"); 698 auto aptr = a.ptr; 699 auto aend = aptr + a.length; 700 auto bptr = b.ptr; 701 702 version (D_InlineAsm_X86) 703 { 704 // SSE2 version is 66% faster 705 if (sse2() && a.length >= 8) 706 { 707 auto n = aptr + (a.length & ~7); 708 709 // Unaligned case 710 asm 711 { 712 mov EAX, bptr; 713 mov ESI, aptr; 714 mov EDI, n; 715 movsd XMM4, value; 716 shufpd XMM4, XMM4, 0; 717 718 align 8; 719 startsseloop: 720 add ESI, 64; 721 movapd XMM5, XMM4; 722 movapd XMM6, XMM4; 723 movupd XMM0, [EAX]; 724 movupd XMM1, [EAX+16]; 725 movupd XMM2, [EAX+32]; 726 movupd XMM3, [EAX+48]; 727 add EAX, 64; 728 subpd XMM5, XMM0; 729 subpd XMM6, XMM1; 730 movupd [ESI+ 0-64], XMM5; 731 movupd [ESI+16-64], XMM6; 732 movapd XMM5, XMM4; 733 movapd XMM6, XMM4; 734 subpd XMM5, XMM2; 735 subpd XMM6, XMM3; 736 movupd [ESI+32-64], XMM5; 737 movupd [ESI+48-64], XMM6; 738 cmp ESI, EDI; 739 jb startsseloop; 740 741 mov aptr, ESI; 742 mov bptr, EAX; 743 } 744 } 745 } 746 747 while (aptr < aend) 748 *aptr++ = value - *bptr++; 749 750 return a; 751 } 752 753 unittest 754 { 755 printf("_arrayExpSliceMinSliceAssign_d unittest\n"); 756 for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) 757 { 758 version (log) printf(" cpuid %d\n", cpuid); 759 760 for (int j = 0; j < 2; j++) 761 {





