[X86][SLM] WriteVecIMul instructions only take 1uop (REAPPLIED)
The xmm variants have half the throughput (and +1cy latency) of the mmx variants, but are still 1uop. I still need to do more thorough testing of SLM on test-suite before fixing the obvious bad numbers for WritePMULLD. But this helps the D103695 helper script get to more accurate numbers for vXi32 multiplies of extended operands (i.e. we can use PMADDWD, PMULLW/PMULHW etc). Matches what Intel AoM / Agner / llvm-exegesis reports.
This commit is contained in:
parent ac51d69208
commit 2005ae15a6
@@ -372,8 +372,8 @@ defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>;
 defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>;
 defm : X86WriteResPairUnsupported<WriteVecALUZ>;
 defm : SLMWriteResPair<WriteVecIMul,  [SLM_FPC_RSV0],   4>;
-defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0],   5, [2], 2>;
-defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0],   5, [2], 2>;
+defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0],   5, [2]>;
+defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0],   5, [2]>;
 defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
 // FIXME: The below is closer to correct, but caused some perf regressions.
 //defm : SLMWriteResPair<WritePMULLD,  [SLM_FPC_RSV0],  11, [11], 7>;
|
@ -537,40 +537,40 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
|
|||
define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
|
||||
; SLM32-LABEL: test_mul_v16i32_v16i16:
|
||||
; SLM32: # %bb.0:
|
||||
; SLM32-NEXT: movdqa %xmm0, %xmm4
|
||||
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
|
||||
; SLM32-NEXT: movdqa %xmm1, %xmm3
|
||||
; SLM32-NEXT: movdqa %xmm4, %xmm2
|
||||
; SLM32-NEXT: pmullw %xmm0, %xmm4
|
||||
; SLM32-NEXT: movdqa %xmm0, %xmm1
|
||||
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
|
||||
; SLM32-NEXT: movdqa %xmm1, %xmm2
|
||||
; SLM32-NEXT: movdqa %xmm3, %xmm4
|
||||
; SLM32-NEXT: pmullw %xmm0, %xmm1
|
||||
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
|
||||
; SLM32-NEXT: pmullw %xmm0, %xmm3
|
||||
; SLM32-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SLM32-NEXT: movdqa %xmm4, %xmm0
|
||||
; SLM32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
|
||||
; SLM32-NEXT: pmulhuw %xmm0, %xmm4
|
||||
; SLM32-NEXT: movdqa %xmm1, %xmm0
|
||||
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
|
||||
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; SLM32-NEXT: movdqa %xmm3, %xmm2
|
||||
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
|
||||
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
||||
; SLM32-NEXT: movdqa %xmm4, %xmm1
|
||||
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
|
||||
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
|
||||
; SLM32-NEXT: retl
|
||||
;
|
||||
; SLM64-LABEL: test_mul_v16i32_v16i16:
|
||||
; SLM64: # %bb.0:
|
||||
; SLM64-NEXT: movdqa %xmm0, %xmm4
|
||||
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
|
||||
; SLM64-NEXT: movdqa %xmm1, %xmm3
|
||||
; SLM64-NEXT: movdqa %xmm4, %xmm2
|
||||
; SLM64-NEXT: pmullw %xmm0, %xmm4
|
||||
; SLM64-NEXT: movdqa %xmm0, %xmm1
|
||||
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
|
||||
; SLM64-NEXT: movdqa %xmm1, %xmm2
|
||||
; SLM64-NEXT: movdqa %xmm3, %xmm4
|
||||
; SLM64-NEXT: pmullw %xmm0, %xmm1
|
||||
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
|
||||
; SLM64-NEXT: pmullw %xmm0, %xmm3
|
||||
; SLM64-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SLM64-NEXT: movdqa %xmm4, %xmm0
|
||||
; SLM64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
|
||||
; SLM64-NEXT: pmulhuw %xmm0, %xmm4
|
||||
; SLM64-NEXT: movdqa %xmm1, %xmm0
|
||||
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
|
||||
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; SLM64-NEXT: movdqa %xmm3, %xmm2
|
||||
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
|
||||
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
|
||||
; SLM64-NEXT: movdqa %xmm4, %xmm1
|
||||
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
|
||||
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
|
||||
; SLM64-NEXT: retq
|
||||
;
|
||||
; SLOW32-LABEL: test_mul_v16i32_v16i16:
|
||||
|
|
|
@ -563,8 +563,8 @@ xorpd (%rax), %xmm2
|
|||
# CHECK-NEXT: 1 1 1.00 pextrw $1, %xmm0, %ecx
|
||||
# CHECK-NEXT: 1 1 1.00 pinsrw $1, %eax, %xmm0
|
||||
# CHECK-NEXT: 1 4 1.00 * pinsrw $1, (%rax), %xmm0
|
||||
# CHECK-NEXT: 2 5 2.00 pmaddwd %xmm0, %xmm2
|
||||
# CHECK-NEXT: 2 8 2.00 * pmaddwd (%rax), %xmm2
|
||||
# CHECK-NEXT: 1 5 2.00 pmaddwd %xmm0, %xmm2
|
||||
# CHECK-NEXT: 1 8 2.00 * pmaddwd (%rax), %xmm2
|
||||
# CHECK-NEXT: 1 1 0.50 pmaxsw %xmm0, %xmm2
|
||||
# CHECK-NEXT: 1 4 1.00 * pmaxsw (%rax), %xmm2
|
||||
# CHECK-NEXT: 1 1 0.50 pmaxub %xmm0, %xmm2
|
||||
|
@@ -574,16 +574,16 @@ xorpd (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        pminub	%xmm0, %xmm2
 # CHECK-NEXT:  1      4     1.00    *                   pminub	(%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        pmovmskb	%xmm0, %ecx
-# CHECK-NEXT:  2      5     2.00                        pmulhuw	%xmm0, %xmm2
-# CHECK-NEXT:  2      8     2.00    *                   pmulhuw	(%rax), %xmm2
-# CHECK-NEXT:  2      5     2.00                        pmulhw	%xmm0, %xmm2
-# CHECK-NEXT:  2      8     2.00    *                   pmulhw	(%rax), %xmm2
-# CHECK-NEXT:  2      5     2.00                        pmullw	%xmm0, %xmm2
-# CHECK-NEXT:  2      8     2.00    *                   pmullw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmulhuw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pmulhuw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmulhw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pmulhw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmullw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pmullw	(%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        pmuludq	%mm0, %mm2
 # CHECK-NEXT:  1      7     1.00    *                   pmuludq	(%rax), %mm2
-# CHECK-NEXT:  2      5     2.00                        pmuludq	%xmm0, %xmm2
-# CHECK-NEXT:  2      8     2.00    *                   pmuludq	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmuludq	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pmuludq	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        por	%xmm0, %xmm2
 # CHECK-NEXT:  1      4     1.00    *                   por	(%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        psadbw	%xmm0, %xmm2
|
|
@@ -237,8 +237,8 @@ roundss $1, (%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00    *                   pmovzxwd	(%rax), %xmm2
 # CHECK-NEXT:  1      1     1.00                        pmovzxwq	%xmm0, %xmm2
 # CHECK-NEXT:  1      4     1.00    *                   pmovzxwq	(%rax), %xmm2
-# CHECK-NEXT:  2      5     2.00                        pmuldq	%xmm0, %xmm2
-# CHECK-NEXT:  2      8     2.00    *                   pmuldq	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmuldq	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pmuldq	(%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        pmulld	%xmm0, %xmm2
 # CHECK-NEXT:  1      7     1.00    *                   pmulld	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        ptest	%xmm0, %xmm1
|
@@ -148,12 +148,12 @@ psignw (%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00    *                   phsubw	(%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        pmaddubsw	%mm0, %mm2
 # CHECK-NEXT:  1      7     1.00    *                   pmaddubsw	(%rax), %mm2
-# CHECK-NEXT:  2      5     2.00                        pmaddubsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      8     2.00    *                   pmaddubsw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmaddubsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pmaddubsw	(%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        pmulhrsw	%mm0, %mm2
 # CHECK-NEXT:  1      7     1.00    *                   pmulhrsw	(%rax), %mm2
-# CHECK-NEXT:  2      5     2.00                        pmulhrsw	%xmm0, %xmm2
-# CHECK-NEXT:  2      8     2.00    *                   pmulhrsw	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmulhrsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pmulhrsw	(%rax), %xmm2
 # CHECK-NEXT:  1      1     1.00                        pshufb	%mm0, %mm2
 # CHECK-NEXT:  1      4     1.00    *                   pshufb	(%rax), %mm2
 # CHECK-NEXT:  4      5     5.00                        pshufb	%xmm0, %xmm2
|
Loading…
Reference in a new issue