[X86][SLM] WriteVecIMul instructions only take 1uop (REAPPLIED)

The xmm variants have half the throughput (and +1cy latency) of the mmx variants, but are still 1uop.

I still need to do more thorough testing of SLM on the test-suite before fixing the obviously bad numbers for WritePMULLD.

But this helps the D103695 helper script get to more accurate numbers for vXi32 multiplies of extended operands (i.e. where we can use PMADDWD, PMULLW/PMULHW, etc.). This matches what Intel AoM / Agner / llvm-exegesis report.
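
As a rough way to reproduce these numbers (not part of the patch; the file name and snippet below are illustrative), llvm-mca with the SLM model can be run over a few of the affected instructions:

# imul-check.s -- run with: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=slm imul-check.s
# With this change the xmm integer multiplies should report 1 uop,
# 5cy latency and 2.00 reciprocal throughput, while the mmx forms stay
# at 1 uop, 4cy latency and 1.00 throughput (see the llvm-mca test updates below).
pmuludq %mm0, %mm2
pmuludq %xmm0, %xmm2
pmullw  %xmm0, %xmm2
pmaddwd %xmm0, %xmm2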
Simon Pilgrim 2021-09-04 15:03:56 +01:00
parent ac51d69208
commit 2005ae15a6
5 changed files with 38 additions and 38 deletions


@@ -372,8 +372,8 @@ defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 5, [2], 2>;
defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 5, [2], 2>;
defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 5, [2]>;
defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
// FIXME: The below is closer to correct, but caused some perf regressions.
//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
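
The WritePMULLD FIXME above can be revisited the same way; the snippet below (illustrative, not part of the patch) is one way to sanity-check the PMULLD numbers on the SLM model:

# pmulld-check.s -- run with: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=slm pmulld-check.s
# The in-tree model still reports pmulld as 1 uop / 4cy latency / 1.00 rthroughput
# (see the SSE4.1 checks below); the commented-out entry would instead model it
# as roughly 7 uops with 11cy latency, which is what still needs confirming on test-suite.
pmulld %xmm0, %xmm2
pmulld (%rax), %xmm2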


@@ -537,40 +537,40 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i16:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm0, %xmm4
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm3
; SLM32-NEXT: movdqa %xmm4, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm4
; SLM32-NEXT: movdqa %xmm0, %xmm1
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: movdqa %xmm3, %xmm4
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm3
; SLM32-NEXT: pmulhuw %xmm0, %xmm1
; SLM32-NEXT: movdqa %xmm4, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLM32-NEXT: pmulhuw %xmm0, %xmm4
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: movdqa %xmm3, %xmm2
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SLM32-NEXT: movdqa %xmm4, %xmm1
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm0, %xmm4
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm3
; SLM64-NEXT: movdqa %xmm4, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm4
; SLM64-NEXT: movdqa %xmm0, %xmm1
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: movdqa %xmm3, %xmm4
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm3
; SLM64-NEXT: pmulhuw %xmm0, %xmm1
; SLM64-NEXT: movdqa %xmm4, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLM64-NEXT: pmulhuw %xmm0, %xmm4
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: movdqa %xmm3, %xmm2
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SLM64-NEXT: movdqa %xmm4, %xmm1
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16:


@@ -563,8 +563,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 pextrw $1, %xmm0, %ecx
# CHECK-NEXT: 1 1 1.00 pinsrw $1, %eax, %xmm0
# CHECK-NEXT: 1 4 1.00 * pinsrw $1, (%rax), %xmm0
# CHECK-NEXT: 2 5 2.00 pmaddwd %xmm0, %xmm2
# CHECK-NEXT: 2 8 2.00 * pmaddwd (%rax), %xmm2
# CHECK-NEXT: 1 5 2.00 pmaddwd %xmm0, %xmm2
# CHECK-NEXT: 1 8 2.00 * pmaddwd (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 pmaxsw %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * pmaxsw (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 pmaxub %xmm0, %xmm2
@@ -574,16 +574,16 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 pminub %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * pminub (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmovmskb %xmm0, %ecx
# CHECK-NEXT: 2 5 2.00 pmulhuw %xmm0, %xmm2
# CHECK-NEXT: 2 8 2.00 * pmulhuw (%rax), %xmm2
# CHECK-NEXT: 2 5 2.00 pmulhw %xmm0, %xmm2
# CHECK-NEXT: 2 8 2.00 * pmulhw (%rax), %xmm2
# CHECK-NEXT: 2 5 2.00 pmullw %xmm0, %xmm2
# CHECK-NEXT: 2 8 2.00 * pmullw (%rax), %xmm2
# CHECK-NEXT: 1 5 2.00 pmulhuw %xmm0, %xmm2
# CHECK-NEXT: 1 8 2.00 * pmulhuw (%rax), %xmm2
# CHECK-NEXT: 1 5 2.00 pmulhw %xmm0, %xmm2
# CHECK-NEXT: 1 8 2.00 * pmulhw (%rax), %xmm2
# CHECK-NEXT: 1 5 2.00 pmullw %xmm0, %xmm2
# CHECK-NEXT: 1 8 2.00 * pmullw (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmuludq %mm0, %mm2
# CHECK-NEXT: 1 7 1.00 * pmuludq (%rax), %mm2
# CHECK-NEXT: 2 5 2.00 pmuludq %xmm0, %xmm2
# CHECK-NEXT: 2 8 2.00 * pmuludq (%rax), %xmm2
# CHECK-NEXT: 1 5 2.00 pmuludq %xmm0, %xmm2
# CHECK-NEXT: 1 8 2.00 * pmuludq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 por %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * por (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 psadbw %xmm0, %xmm2


@@ -237,8 +237,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 * pmovzxwd (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 pmovzxwq %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * pmovzxwq (%rax), %xmm2
# CHECK-NEXT: 2 5 2.00 pmuldq %xmm0, %xmm2
# CHECK-NEXT: 2 8 2.00 * pmuldq (%rax), %xmm2
# CHECK-NEXT: 1 5 2.00 pmuldq %xmm0, %xmm2
# CHECK-NEXT: 1 8 2.00 * pmuldq (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmulld %xmm0, %xmm2
# CHECK-NEXT: 1 7 1.00 * pmulld (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 ptest %xmm0, %xmm1


@@ -148,12 +148,12 @@ psignw (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 * phsubw (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmaddubsw %mm0, %mm2
# CHECK-NEXT: 1 7 1.00 * pmaddubsw (%rax), %mm2
# CHECK-NEXT: 2 5 2.00 pmaddubsw %xmm0, %xmm2
# CHECK-NEXT: 2 8 2.00 * pmaddubsw (%rax), %xmm2
# CHECK-NEXT: 1 5 2.00 pmaddubsw %xmm0, %xmm2
# CHECK-NEXT: 1 8 2.00 * pmaddubsw (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmulhrsw %mm0, %mm2
# CHECK-NEXT: 1 7 1.00 * pmulhrsw (%rax), %mm2
# CHECK-NEXT: 2 5 2.00 pmulhrsw %xmm0, %xmm2
# CHECK-NEXT: 2 8 2.00 * pmulhrsw (%rax), %xmm2
# CHECK-NEXT: 1 5 2.00 pmulhrsw %xmm0, %xmm2
# CHECK-NEXT: 1 8 2.00 * pmulhrsw (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 pshufb %mm0, %mm2
# CHECK-NEXT: 1 4 1.00 * pshufb (%rax), %mm2
# CHECK-NEXT: 4 5 5.00 pshufb %xmm0, %xmm2