DAGCombiner: Don't simplify the token factor if the node's number of operands already exceeds TokenFactorInlineLimit

Summary:
  In parallelizeChainedStores, a TokenFactor with more than 3000 operands was created.
We found that DAGCombiner::visitTokenFactor consumes a huge amount of time on
such nodes. Since the number of operands already exceeds TokenFactorInlineLimit, we propose
giving up on simplification for the sake of compile time.

Reviewers:
  @spatel, @arsenm

Differential Revision:
  https://reviews.llvm.org/D84204
This commit is contained in:
Changpeng Fang 2020-07-25 21:20:59 -07:00
parent 1df8804ce5
commit 9162b70e51
2 changed files with 62 additions and 0 deletions

View file

@ -1805,6 +1805,10 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
if (OptLevel == CodeGenOpt::None)
return SDValue();
// Don't simplify the token factor if the node itself has too many operands.
if (N->getNumOperands() > TokenFactorInlineLimit)
return SDValue();
// If the sole user is a token factor, we should make sure we have a
// chance to merge them together. This prevents TF chains from inhibiting
// optimizations.

View file

@ -0,0 +1,58 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFILD %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -combiner-tokenfactor-inline-limit=7 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFIL7 %s
; GCN-LABEL: {{^}}token_factor_inline_limit_test:
; GCN-TFILD: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}}
; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28
; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28
; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN-TFIL7: buffer_store_dword [[REG8]], {{.*$}}
; GCN: v_mov_b32_e32 v31, 7
; GCN: s_getpc
; Test driver: makes a single call to @external_void_func_8xv5i32, passing
; eight constant <5 x i32> vectors (40 i32 values in total). The first six
; vectors are splats of 0 through 5; the last two hold the distinct values
; 6 through 15, which lets the expected assembly identify individual
; arguments by their constant: the checks above expect 7 to be moved into
; v31 and the values 8 through 15 to be written with buffer_store_dword at
; offsets 0 through 28.
; NOTE(review): presumably the stored values are the arguments that do not
; fit in registers, and lowering their stores is what produces the token
; factor under test -- confirm against the AMDGPU call lowering.
define void @token_factor_inline_limit_test() {
entry:
call void @external_void_func_8xv5i32(
<5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>,
<5 x i32><i32 1, i32 1, i32 1, i32 1, i32 1>,
<5 x i32><i32 2, i32 2, i32 2, i32 2, i32 2>,
<5 x i32><i32 3, i32 3, i32 3, i32 3, i32 3>,
<5 x i32><i32 4, i32 4, i32 4, i32 4, i32 4>,
<5 x i32><i32 5, i32 5, i32 5, i32 5, i32 5>,
<5 x i32><i32 6, i32 7, i32 8, i32 9, i32 10>,
<5 x i32><i32 11, i32 12, i32 13, i32 14, i32 15>)
ret void
}
; Callee of the test function: declared only (no body in this file), with
; hidden visibility, taking eight <5 x i32> vector arguments.
declare hidden void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>,
<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>)