| 235 | | //printf("_adAssMinDouble unittest\n"); |
|---|
| 236 | | |
|---|
| 237 | | { |
|---|
| 238 | | T[] a = [1, 2, 3]; |
|---|
| 239 | | T[] b = [4, 5, 6]; |
|---|
| 240 | | T[3] c; |
|---|
| 241 | | |
|---|
| 242 | | c[] = a[] - b[]; |
|---|
| 243 | | |
|---|
| 244 | | for (int i = 0; i < c.length; i++) |
|---|
| 245 | | { |
|---|
| 246 | | assert(c[i] == a[i] - b[i]); |
|---|
| 247 | | } |
|---|
| 248 | | } |
|---|
| 249 | | { |
|---|
| 250 | | T[] a = [1, 2, 3, 4, 5, 6, 7, 8, 9]; |
|---|
| 251 | | T[] b = [4, 5, 6, 7, 8, 9, 10, 11, 12]; |
|---|
| 252 | | T[9] c; |
|---|
| 253 | | |
|---|
| 254 | | c[] = a[] - b[]; |
|---|
| 255 | | |
|---|
| 256 | | for (int i = 0; i < c.length; i++) |
|---|
| 257 | | { |
|---|
| 258 | | assert(c[i] == a[i] - b[i]); |
|---|
| 259 | | } |
|---|
| 260 | | } |
|---|
| 261 | | { |
|---|
| 262 | | const int dim = 35; |
|---|
| 263 | | T[dim] a; |
|---|
| 264 | | T[dim] b; |
|---|
| 265 | | T[dim] c; |
|---|
| 266 | | |
|---|
| 267 | | for (int i = 0; i < dim; i++) |
|---|
| 268 | | { a[i] = i; |
|---|
| 269 | | b[i] = i + 7; |
|---|
| 270 | | c[i] = i * 2; |
|---|
| 271 | | } |
|---|
| 272 | | |
|---|
| 273 | | c[] = a[] - b[]; |
|---|
| 274 | | |
|---|
| 275 | | for (int i = 0; i < dim; i++) |
|---|
| 276 | | { |
|---|
| 277 | | assert(c[i] == a[i] - b[i]); |
|---|
| 278 | | } |
|---|
| 279 | | } |
|---|
| 280 | | } |
|---|
| 281 | | |
|---|
| 282 | | |
|---|
| | 235 | printf("_arraySliceSliceMinSliceAssign_d unittest\n"); |
|---|
| | 236 | for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) |
|---|
| | 237 | { |
|---|
| | 238 | version (log) printf(" cpuid %d\n", cpuid); |
|---|
| | 239 | |
|---|
| | 240 | for (int j = 0; j < 2; j++) |
|---|
| | 241 | { |
|---|
| | 242 | const int dim = 67; |
|---|
| | 243 | T[] a = new T[dim + j]; // aligned on 16 byte boundary |
|---|
| | 244 | a = a[j .. dim + j]; // misalign for second iteration |
|---|
| | 245 | T[] b = new T[dim + j]; |
|---|
| | 246 | b = b[j .. dim + j]; |
|---|
| | 247 | T[] c = new T[dim + j]; |
|---|
| | 248 | c = c[j .. dim + j]; |
|---|
| | 249 | |
|---|
| | 250 | for (int i = 0; i < dim; i++) |
|---|
| | 251 | { a[i] = cast(T)i; |
|---|
| | 252 | b[i] = cast(T)(i + 7); |
|---|
| | 253 | c[i] = cast(T)(i * 2); |
|---|
| | 254 | } |
|---|
| | 255 | |
|---|
| | 256 | c[] = a[] - b[]; |
|---|
| | 257 | |
|---|
| | 258 | for (int i = 0; i < dim; i++) |
|---|
| | 259 | { |
|---|
| | 260 | if (c[i] != cast(T)(a[i] - b[i])) |
|---|
| | 261 | { |
|---|
| | 262 | printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]); |
|---|
| | 263 | assert(0); |
|---|
| | 264 | } |
|---|
| | 265 | } |
|---|
| | 266 | } |
|---|
| | 267 | } |
|---|
| | 268 | } |
|---|
| | 269 | |
|---|
| | 270 | |
|---|
| | 271 | /* ======================================================================== */ |
|---|
| | 272 | |
|---|
/***********************
 * Computes:
 *      a[] = b[] + value
 *
 * The in-contract asserts that a and b have equal length and that
 * disjoint(a, b) holds. Returns a.
 */

T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto dest    = a.ptr;
    auto destEnd = dest + a.length;
    auto source  = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            // End of the region covered by whole groups of 8 elements
            // (the asm loop moves 64 bytes per pass).
            auto simdEnd = dest + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, source;
                mov ESI, dest;
                mov EDI, simdEnd;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;       // copy value into both lanes

                align 8;
            sseAddLoop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseAddLoop;

                // Hand the advanced cursors back to the scalar tail loop.
                mov dest, ESI;
                mov source, EAX;
            }
        }
    }

    // Scalar path: everything when SSE2 was not used, otherwise the leftovers.
    for (; dest < destEnd; ++dest, ++source)
        *dest = *source + value;

    return a;
}
|---|
| | 337 | |
|---|
unittest
{
    printf("_arraySliceExpAddSliceAssign_d unittest\n");

    // cpuid is a module-level variable; presumably it steers the sse2()
    // check so each code path gets exercised -- confirm against its definition.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // shift == 0 uses the allocation start; shift == 1 starts one element in.
        for (int shift = 0; shift < 2; shift++)
        {
            const int dim = 67;     // 67 & ~7 == 64, so a 3-element tail remains
            T[] a = (new T[dim + shift])[shift .. dim + shift];
            T[] b = (new T[dim + shift])[shift .. dim + shift];
            T[] c = (new T[dim + shift])[shift .. dim + shift];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] + 6;

            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(a[k] + 6))
                    continue;

                printf("[%d]: %g != %g + 6\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
|---|
| | 374 | |
|---|
| | 375 | /* ======================================================================== */ |
|---|
| | 376 | |
|---|
/***********************
 * Computes:
 *      a[] += value
 *
 * Adds value to every element of a in place and returns a.
 */

T[] _arrayExpSliceAddass_d(T[] a, T value)
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 114% faster
        if (sse2() && a.length >= 8)
        {
            // The asm loop handles 8 doubles (64 bytes) per pass, so it must
            // stop at the last whole group of 8 elements. Rounding the end
            // *address* down to 8 bytes (as was done before) leaves it at
            // aend and lets the final pass run past the end of the slice
            // whenever a.length is not a multiple of 8.
            // a.length >= 8 guarantees n > aptr, so the loop runs at least once.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;       // copy value into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                // Resume the scalar loop where the SSE2 loop stopped.
                mov aptr, ESI;
            }
        }
    }

    // Scalar path: the whole slice without SSE2, otherwise the last
    // a.length % 8 elements.
    while (aptr < aend)
        *aptr++ += value;

    return a;
}
|---|
| | 432 | |
|---|
unittest
{
    printf("_arrayExpSliceAddass_d unittest\n");

    // cpuid is a module-level variable; presumably it steers the sse2()
    // check so each code path gets exercised -- confirm against its definition.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // shift == 0 uses the allocation start; shift == 1 starts one element in.
        for (int shift = 0; shift < 2; shift++)
        {
            const int dim = 67;     // 67 & ~7 == 64, so a 3-element tail remains
            T[] a = (new T[dim + shift])[shift .. dim + shift];
            T[] b = (new T[dim + shift])[shift .. dim + shift];
            T[] c = (new T[dim + shift])[shift .. dim + shift];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            // Keep the pre-operation values of c in a for the comparison.
            a[] = c[];
            c[] += 6;

            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(a[k] + 6))
                    continue;

                printf("[%d]: %g != %g + 6\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
|---|
| | 470 | |
|---|
| | 471 | /* ======================================================================== */ |
|---|
| | 472 | |
|---|
/***********************
 * Computes:
 *      a[] += b[]
 *
 * The in-contract asserts that a and b have equal length and that
 * disjoint(a, b) holds. Returns a.
 */

T[] _arraySliceSliceAddass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    auto dest    = a.ptr;
    auto destEnd = dest + a.length;
    auto source  = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            // End of the region covered by whole groups of 8 elements
            // (the asm loop moves 64 bytes per pass).
            auto simdEnd = dest + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, source;    // right operand
                mov ESI, dest;      // destination operand
                mov EDI, simdEnd;   // end comparison

                align 8;
            sseAddassLoop:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseAddassLoop;

                // Hand the advanced cursors back to the scalar tail loop.
                mov dest, ESI;
                mov source, ECX;
            }
        }
    }

    // Scalar path: everything when SSE2 was not used, otherwise the leftovers.
    for (; dest < destEnd; ++dest, ++source)
        *dest += *source;

    return a;
}
|---|
| | 539 | |
|---|
unittest
{
    printf("_arraySliceSliceAddass_d unittest\n");

    // cpuid is a module-level variable; presumably it steers the sse2()
    // check so each code path gets exercised -- confirm against its definition.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // shift == 0 uses the allocation start; shift == 1 starts one element in.
        for (int shift = 0; shift < 2; shift++)
        {
            const int dim = 67;     // 67 & ~7 == 64, so a 3-element tail remains
            T[] a = (new T[dim + shift])[shift .. dim + shift];
            T[] b = (new T[dim + shift])[shift .. dim + shift];
            T[] c = (new T[dim + shift])[shift .. dim + shift];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            // Keep the pre-operation values of c in a for the comparison.
            a[] = c[];
            c[] += b[];

            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(a[k] + b[k]))
                    continue;

                printf("[%d]: %g != %g + %g\n", k, c[k], a[k], b[k]);
                assert(0);
            }
        }
    }
}
|---|
| | 577 | |
|---|
| | 578 | /* ======================================================================== */ |
|---|
| | 579 | |
|---|
/***********************
 * Computes:
 *      a[] = b[] - value
 *
 * The in-contract asserts that a and b have equal length and that
 * disjoint(a, b) holds. Returns a.
 */

T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    auto dest    = a.ptr;
    auto destEnd = dest + a.length;
    auto source  = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            // End of the region covered by whole groups of 8 elements
            // (the asm loop moves 64 bytes per pass).
            auto simdEnd = dest + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, source;
                mov ESI, dest;
                mov EDI, simdEnd;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;       // copy value into both lanes

                align 8;
            sseSubLoop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseSubLoop;

                // Hand the advanced cursors back to the scalar tail loop.
                mov dest, ESI;
                mov source, EAX;
            }
        }
    }

    // Scalar path: everything when SSE2 was not used, otherwise the leftovers.
    for (; dest < destEnd; ++dest, ++source)
        *dest = *source - value;

    return a;
}
|---|
| | 644 | |
|---|
unittest
{
    printf("_arraySliceExpMinSliceAssign_d unittest\n");

    // cpuid is a module-level variable; presumably it steers the sse2()
    // check so each code path gets exercised -- confirm against its definition.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // shift == 0 uses the allocation start; shift == 1 starts one element in.
        for (int shift = 0; shift < 2; shift++)
        {
            const int dim = 67;     // 67 & ~7 == 64, so a 3-element tail remains
            T[] a = (new T[dim + shift])[shift .. dim + shift];
            T[] b = (new T[dim + shift])[shift .. dim + shift];
            T[] c = (new T[dim + shift])[shift .. dim + shift];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = a[] - 6;

            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(a[k] - 6))
                    continue;

                printf("[%d]: %g != %g - 6\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
|---|
| | 681 | |
|---|
| | 682 | /* ======================================================================== */ |
|---|
| | 683 | |
|---|
/***********************
 * Computes:
 *      a[] = value - b[]
 *
 * The in-contract asserts that a and b have equal length and that
 * disjoint(a, b) holds. Returns a.
 */

T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    auto dest    = a.ptr;
    auto destEnd = dest + a.length;
    auto source  = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 66% faster
        if (sse2() && a.length >= 8)
        {
            // End of the region covered by whole groups of 8 elements
            // (the asm loop moves 64 bytes per pass).
            auto simdEnd = dest + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, source;
                mov ESI, dest;
                mov EDI, simdEnd;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;       // copy value into both lanes

                align 8;
            sseRevSubLoop:
                add ESI, 64;
                // subpd overwrites its first operand, so value is copied
                // into scratch registers before each subtraction.
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM5, XMM0;
                subpd XMM6, XMM1;
                movupd [ESI+ 0-64], XMM5;
                movupd [ESI+16-64], XMM6;
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                subpd XMM5, XMM2;
                subpd XMM6, XMM3;
                movupd [ESI+32-64], XMM5;
                movupd [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb sseRevSubLoop;

                // Hand the advanced cursors back to the scalar tail loop.
                mov dest, ESI;
                mov source, EAX;
            }
        }
    }

    // Scalar path: everything when SSE2 was not used, otherwise the leftovers.
    for (; dest < destEnd; ++dest, ++source)
        *dest = value - *source;

    return a;
}
|---|
| | 752 | |
|---|
unittest
{
    printf("_arrayExpSliceMinSliceAssign_d unittest\n");

    // cpuid is a module-level variable; presumably it steers the sse2()
    // check so each code path gets exercised -- confirm against its definition.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // shift == 0 uses the allocation start; shift == 1 starts one element in.
        for (int shift = 0; shift < 2; shift++)
        {
            const int dim = 67;     // 67 & ~7 == 64, so a 3-element tail remains
            T[] a = (new T[dim + shift])[shift .. dim + shift];
            T[] b = (new T[dim + shift])[shift .. dim + shift];
            T[] c = (new T[dim + shift])[shift .. dim + shift];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            c[] = 6 - a[];

            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(6 - a[k]))
                    continue;

                printf("[%d]: %g != 6 - %g\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
|---|
| | 789 | |
|---|
| | 790 | /* ======================================================================== */ |
|---|
| | 791 | |
|---|
/***********************
 * Computes:
 *      a[] -= value
 *
 * Subtracts value from every element of a in place and returns a.
 */

T[] _arrayExpSliceMinass_d(T[] a, T value)
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 115% faster
        if (sse2() && a.length >= 8)
        {
            // The asm loop handles 8 doubles (64 bytes) per pass, so it must
            // stop at the last whole group of 8 elements. Rounding the end
            // *address* down to 8 bytes (as was done before) leaves it at
            // aend and lets the final pass run past the end of the slice
            // whenever a.length is not a multiple of 8.
            // a.length >= 8 guarantees n > aptr, so the loop runs at least once.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;       // copy value into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                // Resume the scalar loop where the SSE2 loop stopped.
                mov aptr, ESI;
            }
        }
    }

    // Scalar path: the whole slice without SSE2, otherwise the last
    // a.length % 8 elements.
    while (aptr < aend)
        *aptr++ -= value;

    return a;
}
|---|
| | 847 | |
|---|
unittest
{
    printf("_arrayExpSliceMinass_d unittest\n");

    // cpuid is a module-level variable; presumably it steers the sse2()
    // check so each code path gets exercised -- confirm against its definition.
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        // shift == 0 uses the allocation start; shift == 1 starts one element in.
        for (int shift = 0; shift < 2; shift++)
        {
            const int dim = 67;     // 67 & ~7 == 64, so a 3-element tail remains
            T[] a = (new T[dim + shift])[shift .. dim + shift];
            T[] b = (new T[dim + shift])[shift .. dim + shift];
            T[] c = (new T[dim + shift])[shift .. dim + shift];

            for (int k = 0; k < dim; k++)
            {
                a[k] = cast(T)k;
                b[k] = cast(T)(k + 7);
                c[k] = cast(T)(k * 2);
            }

            // Keep the pre-operation values of c in a for the comparison.
            a[] = c[];
            c[] -= 6;

            for (int k = 0; k < dim; k++)
            {
                if (c[k] == cast(T)(a[k] - 6))
                    continue;

                printf("[%d]: %g != %g - 6\n", k, c[k], a[k]);
                assert(0);
            }
        }
    }
}
|---|
| | 885 | |
|---|
| | 886 | /* ======================================================================== */ |
|---|
| | 887 | |
|---|
/***********************
 * Computes:
 *      a[] -= b[]
 *
 * The in-contract asserts that a and b have equal length and that
 * disjoint(a, b) holds. Returns a.
 */

T[] _arraySliceSliceMinass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    auto dest    = a.ptr;
    auto destEnd = dest + a.length;
    auto source  = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            // End of the region covered by whole groups of 8 elements
            // (the asm loop moves 64 bytes per pass).
            auto simdEnd = dest + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, source;    // right operand
                mov ESI, dest;      // destination operand
                mov EDI, simdEnd;   // end comparison

                align 8;
            sseMinassLoop:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseMinassLoop;

                // Hand the advanced cursors back to the scalar tail loop.
                mov dest, ESI;
                mov source, ECX;
            }
        }
    }

    // Scalar path: everything when SSE2 was not used, otherwise the leftovers.
    for (; dest < destEnd; ++dest, ++source)
        *dest -= *source;

    return a;
}
|---|
| | 954 | |
|---|
// Exercises a[] -= b[] (slice minus-assign slice). The banner previously
// named the scalar variant and the body ran `c[] -= 6`, so the slice-slice
// operation was never actually tested.
unittest
{
    printf("_arraySliceSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;     // 67 & ~7 == 64, so a 3-element tail remains
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            // Keep the pre-operation values of c in a for the comparison.
            a[] = c[];
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
|---|
| | 992 | |
|---|
| | 993 | /* ======================================================================== */ |
|---|
| | 994 | |
|---|
/***********************
 * Computes:
 *      a[] = b[] * value
 *
 * The in-contract asserts that a and b have equal length and that
 * disjoint(a, b) holds. Returns a.
 */

T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto dest    = a.ptr;
    auto destEnd = dest + a.length;
    auto source  = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 304% faster
        if (sse2() && a.length >= 8)
        {
            // End of the region covered by whole groups of 8 elements
            // (the asm loop moves 64 bytes per pass).
            auto simdEnd = dest + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, source;
                mov ESI, dest;
                mov EDI, simdEnd;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;       // copy value into both lanes

                align 8;
            sseMulLoop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb sseMulLoop;

                // Hand the advanced cursors back to the scalar tail loop.
                mov dest, ESI;
                mov source, EAX;
            }
        }
    }

    // Scalar path: everything when SSE2 was not used, otherwise the leftovers.
    for (; dest < destEnd; ++dest, ++source)
        *dest = *source * value;

    return a;
}
|---|
| | 1059 | |
|---|
| | 1060 | unittest |
|---|
| | 1061 | { |
|---|
| | 1062 | printf("_arraySliceExpMulSliceAssign_d unittest\n"); |
|---|
| | 1063 | for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) |
|---|
| | 1064 | { |
|---|
| | 1065 | version (log) printf(" cpuid %d\n", cpuid); |
|---|
| | 1066 | |
|---|
| | 1067 | for (int j = 0; j < 2; j++) |
|---|
| | 1068 | { |
|---|
| | 1069 | const int dim = 67; |
|---|
| | 1070 | T[] a = new T[dim + j]; // aligned on 16 byte boundary |
|---|
| | 1071 | a = a[j .. dim + j]; // misalign for second iteration |
|---|
| | 1072 | T[] b = new T[dim + j]; |
|---|
| | 1073 | b = b[j .. dim + j]; |
|---|
| | 1074 | T[] c = new T[dim + j]; |
|---|
| | 1075 | c = c[j .. dim + j]; |
|---|
| | 1076 | &n |
|---|