-
Notifications
You must be signed in to change notification settings - Fork 15.6k
[VectorCombine] Support multiple uses of shuffled selects in foldShuffleOfSelects #173166
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
472db88 to
7ff526d
Compare
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: Marcell Leleszi (mleleszi) Changes: This patch removes the single-use restriction on the selects in foldShuffleOfSelects, allowing the fold to trigger for multi-use selects as well when the cost model finds it cheaper. Full diff: https://github.com/llvm/llvm-project/pull/173166.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9239cb1b989b2..e581c225aec6f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2547,12 +2547,14 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
ArrayRef<int> Mask;
Value *C1, *T1, *F1, *C2, *T2, *F2;
- if (!match(&I, m_Shuffle(
- m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
- m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
- m_Mask(Mask))))
+ if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
+ m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
+ m_Mask(Mask))))
return false;
+ auto *Sel1 = cast<Instruction>(I.getOperand(0));
+ auto *Sel2 = cast<Instruction>(I.getOperand(1));
+
auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
@@ -2570,11 +2572,14 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
auto *DstVecTy = cast<FixedVectorType>(I.getType());
auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
auto SelOp = Instruction::Select;
- InstructionCost OldCost = TTI.getCmpSelInstrCost(
+
+ InstructionCost CostSel1 = TTI.getCmpSelInstrCost(
SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
- OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- OldCost +=
+ InstructionCost CostSel2 = TTI.getCmpSelInstrCost(
+ SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
+
+ InstructionCost OldCost =
+ CostSel1 + CostSel2 +
TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
{I.getOperand(0), I.getOperand(1)}, &I);
@@ -2590,6 +2595,11 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ if (!Sel1->hasOneUse())
+ NewCost += CostSel1;
+ if (!Sel2->hasOneUse())
+ NewCost += CostSel2;
+
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
index 7883eb42aefac..cf57a503c2197 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
@@ -637,3 +637,87 @@ define <4 x i32> @src_v2tov4_i32_change_to_other_vector(<2 x i1> %a, <2 x i1> %b
%res = shufflevector <2 x i32> %select.xz, <2 x i32> %select.yx, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
ret <4 x i32> %res
}
+
+define <4 x i32> @src_v2tov4_i32_multiuse_sel1(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z, ptr %p) {
+; CHECK-LABEL: define <4 x i32> @src_v2tov4_i32_multiuse_sel1(
+; CHECK-SAME: <2 x i1> [[A:%.*]], <2 x i1> [[B:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[SELECT_XZ:%.*]] = select <2 x i1> [[A]], <2 x i32> [[X]], <2 x i32> [[Z]]
+; CHECK-NEXT: store <2 x i32> [[SELECT_XZ]], ptr [[P]], align 8
+; CHECK-NEXT: [[SELECT_YX:%.*]] = select <2 x i1> [[B]], <2 x i32> [[Y]], <2 x i32> [[X]]
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i32> [[SELECT_XZ]], <2 x i32> [[SELECT_YX]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %select.xz = select <2 x i1> %a, <2 x i32> %x, <2 x i32> %z
+ store <2 x i32> %select.xz, ptr %p
+ %select.yx = select <2 x i1> %b, <2 x i32> %y, <2 x i32> %x
+ %res = shufflevector <2 x i32> %select.xz, <2 x i32> %select.yx, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @src_v2tov4_i32_multiuse_sel2(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z, ptr %p) {
+; CHECK-LABEL: define <4 x i32> @src_v2tov4_i32_multiuse_sel2(
+; CHECK-SAME: <2 x i1> [[A:%.*]], <2 x i1> [[B:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[SELECT_XZ:%.*]] = select <2 x i1> [[A]], <2 x i32> [[X]], <2 x i32> [[Z]]
+; CHECK-NEXT: [[SELECT_YX:%.*]] = select <2 x i1> [[B]], <2 x i32> [[Y]], <2 x i32> [[X]]
+; CHECK-NEXT: store <2 x i32> [[SELECT_YX]], ptr [[P]], align 8
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i32> [[SELECT_XZ]], <2 x i32> [[SELECT_YX]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %select.xz = select <2 x i1> %a, <2 x i32> %x, <2 x i32> %z
+ %select.yx = select <2 x i1> %b, <2 x i32> %y, <2 x i32> %x
+ store <2 x i32> %select.yx, ptr %p
+ %res = shufflevector <2 x i32> %select.xz, <2 x i32> %select.yx, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @src_v2tov4_i32_multiuse_both(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z, ptr %p1, ptr %p2) {
+; CHECK-LABEL: define <4 x i32> @src_v2tov4_i32_multiuse_both(
+; CHECK-SAME: <2 x i1> [[A:%.*]], <2 x i1> [[B:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[SELECT_XZ:%.*]] = select <2 x i1> [[A]], <2 x i32> [[X]], <2 x i32> [[Z]]
+; CHECK-NEXT: store <2 x i32> [[SELECT_XZ]], ptr [[P1]], align 8
+; CHECK-NEXT: [[SELECT_YX:%.*]] = select <2 x i1> [[B]], <2 x i32> [[Y]], <2 x i32> [[X]]
+; CHECK-NEXT: store <2 x i32> [[SELECT_YX]], ptr [[P2]], align 8
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i32> [[SELECT_XZ]], <2 x i32> [[SELECT_YX]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %select.xz = select <2 x i1> %a, <2 x i32> %x, <2 x i32> %z
+ store <2 x i32> %select.xz, ptr %p1
+ %select.yx = select <2 x i1> %b, <2 x i32> %y, <2 x i32> %x
+ store <2 x i32> %select.yx, ptr %p2
+ %res = shufflevector <2 x i32> %select.xz, <2 x i32> %select.yx, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+define <2 x i32> @src_v1024tov2_i32_multiuse_sel1(<1024 x i1> %a, <1024 x i1> %b, <1024 x i32> %x, <1024 x i32> %y, <1024 x i32> %z, ptr %p1, ptr %p2) {
+; SSE-LABEL: define <2 x i32> @src_v1024tov2_i32_multiuse_sel1(
+; SSE-SAME: <1024 x i1> [[A:%.*]], <1024 x i1> [[B:%.*]], <1024 x i32> [[X:%.*]], <1024 x i32> [[Y:%.*]], <1024 x i32> [[Z:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[SELECT_XZ:%.*]] = select <1024 x i1> [[A]], <1024 x i32> [[X]], <1024 x i32> [[Z]]
+; SSE-NEXT: store <1024 x i32> [[SELECT_XZ]], ptr [[P1]], align 4096
+; SSE-NEXT: [[SELECT_YX:%.*]] = select <1024 x i1> [[B]], <1024 x i32> [[Y]], <1024 x i32> [[X]]
+; SSE-NEXT: [[RES:%.*]] = shufflevector <1024 x i32> [[SELECT_XZ]], <1024 x i32> [[SELECT_YX]], <2 x i32> <i32 0, i32 1024>
+; SSE-NEXT: ret <2 x i32> [[RES]]
+;
+; AVX2-LABEL: define <2 x i32> @src_v1024tov2_i32_multiuse_sel1(
+; AVX2-SAME: <1024 x i1> [[A:%.*]], <1024 x i1> [[B:%.*]], <1024 x i32> [[X:%.*]], <1024 x i32> [[Y:%.*]], <1024 x i32> [[Z:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select <1024 x i1> [[A]], <1024 x i32> [[X]], <1024 x i32> [[Z]]
+; AVX2-NEXT: store <1024 x i32> [[SELECT_XZ]], ptr [[P1]], align 4096
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select <1024 x i1> [[B]], <1024 x i32> [[Y]], <1024 x i32> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <1024 x i32> [[SELECT_XZ]], <1024 x i32> [[SELECT_YX]], <2 x i32> <i32 0, i32 1024>
+; AVX2-NEXT: ret <2 x i32> [[RES]]
+;
+; AVX512-LABEL: define <2 x i32> @src_v1024tov2_i32_multiuse_sel1(
+; AVX512-SAME: <1024 x i1> [[A:%.*]], <1024 x i1> [[B:%.*]], <1024 x i32> [[X:%.*]], <1024 x i32> [[Y:%.*]], <1024 x i32> [[Z:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select <1024 x i1> [[A]], <1024 x i32> [[X]], <1024 x i32> [[Z]]
+; AVX512-NEXT: store <1024 x i32> [[SELECT_XZ]], ptr [[P1]], align 4096
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <1024 x i1> [[A]], <1024 x i1> [[B]], <2 x i32> <i32 0, i32 1024>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <1024 x i32> [[X]], <1024 x i32> [[Y]], <2 x i32> <i32 0, i32 1024>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <1024 x i32> [[Z]], <1024 x i32> [[X]], <2 x i32> <i32 0, i32 1024>
+; AVX512-NEXT: [[RES:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]]
+; AVX512-NEXT: ret <2 x i32> [[RES]]
+;
+ %select.xz = select <1024 x i1> %a, <1024 x i32> %x, <1024 x i32> %z
+ store <1024 x i32> %select.xz, ptr %p1
+ %select.yx = select <1024 x i1> %b, <1024 x i32> %y, <1024 x i32> %x
+ %res = shufflevector <1024 x i32> %select.xz, <1024 x i32> %select.yx, <2 x i32> <i32 0, i32 1024>
+ ret <2 x i32> %res
+}
|
#173036
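This patch removes the single-use restriction on the selects in foldShuffleOfSelects, allowing the fold to trigger for multi-use selects as well when the cost model finds it cheaper. A select that has other users remains after the fold, so its cost is added to the new cost before the old and new costs are compared.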