From c6045adbf8f9092acc8c273946a3398f6309b80e Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Fri, 16 Dec 2022 14:05:39 -0800
Subject: [PATCH] Ensure that `TryGetContainableHWIntrinsicOp` is non-mutating
 (#79363)

* Ensure that `TryGetContainableHWIntrinsicOp` is non-mutating

* Applying formatting patch

* Ensure BuildOperandUses handles CreateScalarUnsafe being contained

* Ensure CreateScalarUnsafe is only contained for regOptional when the op1 type is floating-point

* Directly check that op1 is contained/regOptional

* Ensure FusedMultiplyAdd rechecks CreateScalarUnsafe containment after removing NEG nodes

* Ensure BroadcastScalarToVector doesn't regress containable scenarios

* Applying formatting patch
---
 src/coreclr/jit/gentree.cpp                 |   7 +
 src/coreclr/jit/hwintrinsiccodegenxarch.cpp |  38 ++-
 src/coreclr/jit/instr.cpp                   |  37 ++-
 src/coreclr/jit/lower.h                     |   6 +-
 src/coreclr/jit/lowerxarch.cpp              | 299 ++++++++++----------
 src/coreclr/jit/lsrabuild.cpp               |   3 +-
 6 files changed, 228 insertions(+), 162 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 1999b79c70e52..dccd27a9615be 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -18817,6 +18817,13 @@ bool GenTree::isContainableHWIntrinsic() const
             return true;
         }
 
+        case NI_Vector128_CreateScalarUnsafe:
+        case NI_Vector256_CreateScalarUnsafe:
+        {
+            // These HWIntrinsic operations are contained as part of scalar ops
+            return true;
+        }
+
         case NI_Vector128_get_Zero:
         case NI_Vector256_get_Zero:
         {
diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index b2b08c8d828c8..69474169db631 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -56,10 +56,9 @@ static void assertIsContainableHWIntrinsicOp(Lowering* lowering,
     }
 
     bool supportsRegOptional = false;
-    bool isContainable = lowering->TryGetContainableHWIntrinsicOp(containingNode, &node, &supportsRegOptional);
+    bool isContainable = lowering->IsContainableHWIntrinsicOp(containingNode, node, &supportsRegOptional);
 
     assert(isContainable || supportsRegOptional);
-    assert(node == containedNode);
 #endif // DEBUG
 }
 
@@ -482,15 +481,40 @@ void CodeGen::genHWIntrinsic_R_RM(
             break;
 
         case OperandKind::Reg:
+        {
+            regNumber rmOpReg = rmOpDesc.GetReg();
+
             if (emit->IsMovInstruction(ins))
             {
-                emit->emitIns_Mov(ins, attr, reg, rmOp->GetRegNum(), /* canSkip */ false);
+                emit->emitIns_Mov(ins, attr, reg, rmOpReg, /* canSkip */ false);
             }
             else
             {
-                emit->emitIns_R_R(ins, attr, reg, rmOp->GetRegNum());
+                if (varTypeIsIntegral(rmOp) && ((node->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector128) ||
+                                                (node->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector256)))
+                {
+                    // In lowering we had the special case of BroadcastScalarToVector(CreateScalarUnsafe(op1))
+                    //
+                    // This is one of the only instructions that supports taking integer types from
+                    // a SIMD register or directly as a scalar from memory. Most other instructions, in
+                    // comparison, take such values from general-purpose registers instead.
+                    //
+                    // Because of this, we removed the CreateScalarUnsafe and tried to contain op1 directly.
+                    // That failed, and op1 either wasn't marked regOptional or was marked regOptional but
+                    // didn't get spilled.
+                    //
+                    // As such, we need to emulate the removed CreateScalarUnsafe to ensure that op1 is in a
+                    // SIMD register so the broadcast instruction can execute successfully. We'll just move
+                    // the value into the target register and then broadcast it out from that.
+
+                    emitAttr movdAttr = emitActualTypeSize(node->GetSimdBaseType());
+                    emit->emitIns_Mov(INS_movd, movdAttr, reg, rmOpReg, /* canSkip */ false);
+                    rmOpReg = reg;
+                }
+
+                emit->emitIns_R_R(ins, attr, reg, rmOpReg);
             }
             break;
+        }
 
         default:
             unreached();
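To make the fixup concrete, here is a minimal standalone sketch of the check codegen performs and the instruction sequence it ends up emitting. The enum and function names are illustrative, not the JIT's types:

    #include <cassert>

    enum class Intrinsic
    {
        BroadcastScalarToVector128,
        BroadcastScalarToVector256,
        Other
    };

    // An integer operand that ended up in a general-purpose register must first
    // be moved into the target SIMD register: AVX2's vpbroadcast[b/w/d/q] reads
    // its scalar from an XMM register or from memory, never from a GPR.
    bool needsMovdFixup(bool operandIsIntegral, Intrinsic id)
    {
        return operandIsIntegral && ((id == Intrinsic::BroadcastScalarToVector128) ||
                                     (id == Intrinsic::BroadcastScalarToVector256));
    }

    int main()
    {
        // Integer scalar left in a GPR, e.g. eax; the emulated CreateScalarUnsafe is:
        //     vmovd        xmm0, eax
        //     vpbroadcastd xmm0, xmm0
        assert(needsMovdFixup(true, Intrinsic::BroadcastScalarToVector128));

        // Floating-point scalars already live in SIMD registers, so no fixup.
        assert(!needsMovdFixup(false, Intrinsic::BroadcastScalarToVector256));
        return 0;
    }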
@@ -637,7 +661,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins,
 
         case OperandKind::Reg:
         {
-            regNumber op2Reg = op2->GetRegNum();
+            regNumber op2Reg = op2Desc.GetReg();
 
             if ((op1Reg != targetReg) && (op2Reg == targetReg) && node->isRMWHWIntrinsic(compiler))
             {
@@ -715,7 +739,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins,
             break;
 
         case OperandKind::Reg:
-            emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2->GetRegNum(), op3Reg);
+            emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Desc.GetReg(), op3Reg);
             break;
 
         default:
@@ -767,7 +791,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM(
             break;
 
         case OperandKind::Reg:
-            emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3->GetRegNum());
+            emit->emitIns_SIMD_R_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, op3Desc.GetReg());
             break;
 
         default:
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index 406becd21fb85..5911dcdf7ddaa 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -702,10 +702,37 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op)
     }
     else
     {
+        assert(op->OperIsHWIntrinsic());
+
 #if defined(FEATURE_HW_INTRINSICS)
-        assert(op->AsHWIntrinsic()->OperIsMemoryLoad());
-        assert(op->AsHWIntrinsic()->GetOperandCount() == 1);
-        addr = op->AsHWIntrinsic()->Op(1);
+        GenTreeHWIntrinsic* hwintrinsic = op->AsHWIntrinsic();
+        NamedIntrinsic      intrinsicId = hwintrinsic->GetHWIntrinsicId();
+
+        switch (intrinsicId)
+        {
+            case NI_Vector128_CreateScalarUnsafe:
+            case NI_Vector256_CreateScalarUnsafe:
+            {
+                // The hwintrinsic should be contained and its
+                // op1 should be either contained or spilled. This
+                // allows us to transparently "look through" the
+                // CreateScalarUnsafe and treat it directly like
+                // a load from memory.
+
+                assert(hwintrinsic->isContained());
+                op = hwintrinsic->Op(1);
+                return genOperandDesc(op);
+            }
+
+            default:
+            {
+                assert(hwintrinsic->OperIsMemoryLoad());
+                assert(hwintrinsic->GetOperandCount() == 1);
+
+                addr = hwintrinsic->Op(1);
+                break;
+            }
+        }
 #else
         unreached();
 #endif // FEATURE_HW_INTRINSICS
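The look-through added here can be modeled outside the JIT. A toy sketch, with a hypothetical Node shape standing in for GenTreeHWIntrinsic:

    #include <cassert>
    #include <cstdio>

    struct Node
    {
        bool        isCreateScalarUnsafe;
        bool        contained;
        Node*       op1;  // wrapped operand, when any
        const char* desc; // what the operand descriptor would name
    };

    // Mirrors the recursion above: a contained CreateScalarUnsafe is transparent,
    // so the operand descriptor is built from whatever it wraps.
    const char* describeOperand(const Node* n)
    {
        if (n->isCreateScalarUnsafe)
        {
            assert(n->contained);           // lowering only leaves it here when contained
            return describeOperand(n->op1); // look through to the scalar
        }
        return n->desc;
    }

    int main()
    {
        Node spilled = { false, true, nullptr, "[stack slot]" };
        Node wrapper = { true, true, &spilled, "CreateScalarUnsafe" };
        std::printf("%s\n", describeOperand(&wrapper)); // prints "[stack slot]"
        return 0;
    }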
@@ -960,7 +987,7 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT
             break;
 
         case OperandKind::Reg:
-            emit->emitIns_SIMD_R_R_I(ins, attr, reg1, rmOp->GetRegNum(), ival);
+            emit->emitIns_SIMD_R_R_I(ins, attr, reg1, rmOpDesc.GetReg(), ival);
             break;
 
         default:
@@ -1013,7 +1040,7 @@ void CodeGen::inst_RV_RV_TT(
 
         case OperandKind::Reg:
         {
-            regNumber op2Reg = op2->GetRegNum();
+            regNumber op2Reg = op2Desc.GetReg();
 
             if ((op1Reg != targetReg) && (op2Reg == targetReg) && isRMW)
             {
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index ace595ff7219f..41d3265452c67 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -478,11 +478,7 @@ class Lowering final : public Phase
 #endif // TARGET_ARM64
 
 #if defined(FEATURE_HW_INTRINSICS)
-    // Tries to get a containable node for a given HWIntrinsic
-    bool TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode,
-                                        GenTree**           pNode,
-                                        bool*               supportsRegOptional,
-                                        GenTreeHWIntrinsic* transparentParentNode = nullptr);
+    bool IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTree* childNode, bool* supportsRegOptional);
 #endif // FEATURE_HW_INTRINSICS
 
     static void TransformUnusedIndirection(GenTreeIndir* ind, Compiler* comp, BasicBlock* block);
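Reduced to its API pattern, the signature change above swaps a query-and-mutate helper for a pure predicate; the caller now performs any mutation itself. A toy sketch under that reading (none of these are real JIT types):

    #include <cassert>

    struct Op
    {
        bool containable;
        bool contained;
        bool regOptional;
    };

    // New shape: inspects, never rewrites.
    bool isContainableOp(const Op& op, bool* supportsRegOptional)
    {
        *supportsRegOptional = true; // stand-in for the supportsGeneralLoads logic
        return op.containable;
    }

    // Caller mirrors ContainCheckHWIntrinsic: mutation happens here, not inside the query.
    void containCheck(Op& op)
    {
        bool supportsRegOptional = false;
        if (isContainableOp(op, &supportsRegOptional))
        {
            op.contained = true;   // MakeSrcContained
        }
        else if (supportsRegOptional)
        {
            op.regOptional = true; // MakeSrcRegOptional
        }
    }

    int main()
    {
        Op op = { true, false, false };
        containCheck(op);
        assert(op.contained && !op.regOptional);
        return 0;
    }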
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 255f9b968caf6..5150266e972f5 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -914,8 +914,8 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn
     bool op1SupportsRegOptional = false;
     bool op2SupportsRegOptional = false;
 
-    if (!TryGetContainableHWIntrinsicOp(node, &node->Op(2), &op2SupportsRegOptional) &&
-        TryGetContainableHWIntrinsicOp(node, &node->Op(1), &op1SupportsRegOptional))
+    if (!IsContainableHWIntrinsicOp(node, node->Op(2), &op2SupportsRegOptional) &&
+        IsContainableHWIntrinsicOp(node, node->Op(1), &op1SupportsRegOptional))
     {
         // Swap operands if op2 cannot be contained but op1 can.
         swapOperands = true;
@@ -979,16 +979,26 @@ void Lowering::LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node)
     {
         createScalarOps[0]->Op(1) = argX->gtGetOp1();
         BlockRange().Remove(argX);
+
+        createScalarOps[0]->Op(1)->ClearContained();
+        ContainCheckHWIntrinsic(createScalarOps[0]);
     }
     if (argY->OperIs(GT_NEG))
     {
         createScalarOps[1]->Op(1) = argY->gtGetOp1();
         BlockRange().Remove(argY);
+
+        createScalarOps[1]->Op(1)->ClearContained();
+        ContainCheckHWIntrinsic(createScalarOps[1]);
     }
     if (argZ->OperIs(GT_NEG))
     {
         createScalarOps[2]->Op(1) = argZ->gtGetOp1();
         BlockRange().Remove(argZ);
+
+        createScalarOps[2]->Op(1)->ClearContained();
+        ContainCheckHWIntrinsic(createScalarOps[2]);
+
         node->ChangeHWIntrinsicId(negMul ? NI_FMA_MultiplySubtractNegatedScalar : NI_FMA_MultiplySubtractScalar);
     }
     else
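A sketch of the intrinsic selection once the NEG nodes are folded away. This assumes negMul means the product's sign flipped (exactly one multiplicand was negated) and that the unshown else branch picks the MultiplyAdd forms symmetrically; only the subtract cases are confirmed by the hunk above:

    #include <cassert>
    #include <cstring>

    // (a * b) + c, with folded negations steering the intrinsic choice.
    const char* selectFmaScalar(bool negMul, bool negAcc)
    {
        if (negAcc) // argZ was a NEG: (a * b) - c
        {
            return negMul ? "MultiplySubtractNegatedScalar" : "MultiplySubtractScalar";
        }
        return negMul ? "MultiplyAddNegatedScalar" : "MultiplyAddScalar";
    }

    int main()
    {
        // Matches the ChangeHWIntrinsicId call above: Fma(a, b, Neg(c))
        assert(std::strcmp(selectFmaScalar(false, true), "MultiplySubtractScalar") == 0);
        // Fma(Neg(a), b, Neg(c)): both the product and the accumulator flip
        assert(std::strcmp(selectFmaScalar(true, true), "MultiplySubtractNegatedScalar") == 0);
        return 0;
    }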
@@ -5948,41 +5958,30 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
 
 #ifdef FEATURE_HW_INTRINSICS
 //----------------------------------------------------------------------------------------------
-// TryGetContainableHWIntrinsicOp: Tries to get a containable node for a given HWIntrinsic
+// IsContainableHWIntrinsicOp: Determines whether a child node is containable for a given HWIntrinsic
 //
 //  Arguments:
-//     [In]     containingNode      - The hardware intrinsic node which contains 'node'
-//     [In/Out] pNode               - The node to check and potentially replace with the containable node
-//     [Out]    supportsRegOptional - On return, this will be true if 'containingNode' supports regOptional operands
+//     [In]  parentNode          - The hardware intrinsic node which is the parent of 'childNode'
+//     [In]  childNode           - The node to check if it can be contained by 'parentNode'
+//     [Out] supportsRegOptional - On return, this will be true if 'parentNode' supports 'childNode' being regOptional;
 //                                  otherwise, false.
-//     [In]     transparentParentNode - optional "transparent" intrinsic parent like CreateScalarUnsafe
 //
 // Return Value:
-//    true if 'node' is a containable by containingNode; otherwise, false.
+//    true if 'childNode' is containable by 'parentNode'; otherwise, false.
 //
-// When true is returned 'node' (and by extension the relevant op of 'containingNode') may be modified
-// to handle special scenarios such as CreateScalarUnsafe which exist to bridge the type system with
-// the actual registers.
-//
-// When false is returned 'node' is not modified.
-//
-bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode,
-                                              GenTree**           pNode,
-                                              bool*               supportsRegOptional,
-                                              GenTreeHWIntrinsic* transparentParentNode)
+bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTree* childNode, bool* supportsRegOptional)
 {
-    assert(containingNode != nullptr);
-    assert((pNode != nullptr) && (*pNode != nullptr));
+    assert(parentNode != nullptr);
+    assert(childNode != nullptr);
     assert(supportsRegOptional != nullptr);
 
-    NamedIntrinsic      containingIntrinsicId = containingNode->GetHWIntrinsicId();
-    HWIntrinsicCategory category              = HWIntrinsicInfo::lookupCategory(containingIntrinsicId);
-    GenTree*&           node                  = *pNode;
+    NamedIntrinsic      parentIntrinsicId = parentNode->GetHWIntrinsicId();
+    HWIntrinsicCategory category          = HWIntrinsicInfo::lookupCategory(parentIntrinsicId);
 
-    // We shouldn't have called in here if containingNode doesn't support containment
-    assert(HWIntrinsicInfo::SupportsContainment(containingIntrinsicId));
+    // We shouldn't have called in here if parentNode doesn't support containment
+    assert(HWIntrinsicInfo::SupportsContainment(parentIntrinsicId));
 
-    // containingNode supports nodes that read from an aligned memory address
+    // parentNode supports nodes that read from an aligned memory address
     //
     // This will generally be an explicit LoadAligned instruction and is false for
     // machines with VEX support when minOpts is enabled. This is because there is
@@ -5993,22 +5992,22 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
     // coding patterns on the managed side.
     bool supportsAlignedSIMDLoads = false;
 
-    // containingNode supports nodes that read from general memory
+    // parentNode supports nodes that read from general memory
     //
     // We currently have to assume all "general" loads are unaligned. As such, this is
     // generally used to determine if we can mark the node as `regOptional` in the case
-    // where `node` is not containable. However, this can also be used to determine whether
+    // where `childNode` is not containable. However, this can also be used to determine whether
     // we can mark other types of reads as contained (such as when directly reading a local).
     bool supportsGeneralLoads = false;
 
-    // containingNode supports nodes that read from a scalar memory address
+    // parentNode supports nodes that read from a scalar memory address
     //
     // This will generally be an explicit LoadScalar instruction but is also used to determine
     // whether we can read an address of type T (we don't support this when the load would
     // read more than sizeof(T) bytes).
     bool supportsSIMDScalarLoads = false;
 
-    // containingNode supports nodes that read from an unaligned memory address
+    // parentNode supports nodes that read from an unaligned memory address
    //
     // This will generally be an explicit Load instruction and is generally false for machines
     // without VEX support. This is because older hardware required that the SIMD operand always
     bool supportsUnalignedSIMDLoads = false;
 
@@ -6019,13 +6018,13 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
     switch (category)
     {
         case HW_Category_MemoryLoad:
         {
-            supportsGeneralLoads = !node->OperIsHWIntrinsic();
+            supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
             break;
         }
 
         case HW_Category_SimpleSIMD:
         {
-            switch (containingIntrinsicId)
+            switch (parentIntrinsicId)
             {
                 case NI_SSE41_ConvertToVector128Int16:
                 case NI_SSE41_ConvertToVector128Int32:
@@ -6036,7 +6035,7 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                 {
                     assert(!supportsSIMDScalarLoads);
 
-                    if (!containingNode->OperIsMemoryLoad())
+                    if (!parentNode->OperIsMemoryLoad())
                     {
                         // The containable form is the one that takes a SIMD value, that may be in memory.
@@ -6065,14 +6064,14 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                         // * ConvertToVector256Int64 - sizeof(simdType) = 32; sizeof(baseType) = 1 | 2 | 4;
                         //                             expectedSize = 16 | 8 | 4
 
-                        const unsigned sizeof_simdType = genTypeSize(containingNode->TypeGet());
-                        const unsigned sizeof_baseType = genTypeSize(containingNode->GetSimdBaseType());
+                        const unsigned sizeof_simdType = genTypeSize(parentNode->TypeGet());
+                        const unsigned sizeof_baseType = genTypeSize(parentNode->GetSimdBaseType());
 
                         assert((sizeof_simdType == 16) || (sizeof_simdType == 32));
                         assert((sizeof_baseType == 1) || (sizeof_baseType == 2) || (sizeof_baseType == 4));
 
                         const unsigned expectedSize = sizeof_simdType / (sizeof_baseType * 2);
-                        const unsigned operandSize  = genTypeSize(node->TypeGet());
+                        const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
                         assert((sizeof_simdType != 16) || (expectedSize == 8) || (expectedSize == 4) ||
                                (expectedSize == 2));
@@ -6084,7 +6083,7 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                     else
                     {
                         // The memory form of this already takes a pointer and should be treated like a MemoryLoad
-                        supportsGeneralLoads = !node->OperIsHWIntrinsic();
+                        supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
                     }
                     break;
                 }
@@ -6103,8 +6102,8 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                     supportsAlignedSIMDLoads   = !comp->opts.MinOpts();
                     supportsUnalignedSIMDLoads = true;
 
-                    const unsigned expectedSize = genTypeSize(containingNode->TypeGet()) / 2;
-                    const unsigned operandSize  = genTypeSize(node->TypeGet());
+                    const unsigned expectedSize = genTypeSize(parentNode->TypeGet()) / 2;
+                    const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
                     supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
                     break;
@@ -6125,8 +6124,8 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                         supportsUnalignedSIMDLoads = true;
                     }
 
-                    const unsigned expectedSize = genTypeSize(containingNode->TypeGet());
-                    const unsigned operandSize  = genTypeSize(node->TypeGet());
+                    const unsigned expectedSize = genTypeSize(parentNode->TypeGet());
+                    const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
                     supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
                     break;
@@ -6139,7 +6138,7 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
 
         case HW_Category_IMM:
         {
-            switch (containingIntrinsicId)
+            switch (parentIntrinsicId)
             {
                 case NI_SSE_Shuffle:
                 case NI_SSE2_ShiftLeftLogical:
@@ -6171,8 +6170,8 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                 {
                     assert(!supportsSIMDScalarLoads);
 
-                    const unsigned expectedSize = genTypeSize(containingNode->GetSimdBaseType());
-                    const unsigned operandSize  = genTypeSize(node->TypeGet());
+                    const unsigned expectedSize = genTypeSize(parentNode->GetSimdBaseType());
+                    const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
                     supportsAlignedSIMDLoads   = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
                     supportsUnalignedSIMDLoads = comp->canUseVexEncoding();
@@ -6187,7 +6186,7 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                     assert(!supportsSIMDScalarLoads);
 
                     const unsigned expectedSize = 16;
-                    const unsigned operandSize  = genTypeSize(node->TypeGet());
+                    const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
                     supportsAlignedSIMDLoads   = !comp->canUseVexEncoding() || !comp->opts.MinOpts();
                     supportsUnalignedSIMDLoads = comp->canUseVexEncoding();
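The size check that recurs throughout this function reduces to one rule: a general load may be contained (or made regOptional) only when the operand supplies at least as many bytes as the instruction consumes; reading wider is harmless, reading narrower could fault. A standalone sketch using the values from the ConvertToVector table above:

    #include <cassert>

    bool supportsGeneralLoad(unsigned operandSize, unsigned expectedSize)
    {
        return operandSize >= expectedSize;
    }

    int main()
    {
        // ConvertToVector128Int16 widens 8 bytes of input (16 / (1 * 2), per
        // the table above) into a 16-byte result.
        assert(supportsGeneralLoad(16, 8)); // full vector read: containable
        assert(supportsGeneralLoad(8, 8));  // exact-size read:  containable
        assert(!supportsGeneralLoad(4, 8)); // too narrow:       not containable
        return 0;
    }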
@@ -6202,9 +6201,9 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
             assert(supportsAlignedSIMDLoads == false);
             assert(supportsUnalignedSIMDLoads == false);
 
-            if (containingNode->GetSimdBaseType() == TYP_FLOAT)
+            if (parentNode->GetSimdBaseType() == TYP_FLOAT)
             {
-                assert(containingIntrinsicId == NI_SSE41_Insert);
+                assert(parentIntrinsicId == NI_SSE41_Insert);
 
                 // Sse41.Insert(V128, V128, byte) is a bit special
                 // in that it has different behavior depending on whether the
@@ -6217,9 +6216,9 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                 assert(supportsGeneralLoads == false);
                 assert(supportsSIMDScalarLoads == false);
 
-                GenTree* op1 = containingNode->Op(1);
-                GenTree* op2 = containingNode->Op(2);
-                GenTree* op3 = containingNode->Op(3);
+                GenTree* op1 = parentNode->Op(1);
+                GenTree* op2 = parentNode->Op(2);
+                GenTree* op3 = parentNode->Op(3);
 
                 // The upper two bits of the immediate value are ignored if
                 // op2 comes from memory. In order to support using the upper
@@ -6239,11 +6238,11 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
             }
 
             // We should only get here for integral nodes.
-            assert(varTypeIsIntegral(node->TypeGet()));
+            assert(varTypeIsIntegral(childNode->TypeGet()));
 
             assert(supportsSIMDScalarLoads == false);
 
-            const unsigned expectedSize = genTypeSize(containingNode->GetSimdBaseType());
-            const unsigned operandSize  = genTypeSize(node->TypeGet());
+            const unsigned expectedSize = genTypeSize(parentNode->GetSimdBaseType());
+            const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
             supportsGeneralLoads = (operandSize >= expectedSize);
             break;
@@ -6276,12 +6275,12 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
             assert(supportsAlignedSIMDLoads == false);
             assert(supportsUnalignedSIMDLoads == false);
 
-            switch (containingIntrinsicId)
+            switch (parentIntrinsicId)
             {
                 case NI_Vector128_CreateScalarUnsafe:
                 case NI_Vector256_CreateScalarUnsafe:
                 {
-                    if (!varTypeIsIntegral(node->TypeGet()))
+                    if (!varTypeIsIntegral(childNode->TypeGet()))
                     {
                         // The floating-point overload doesn't require any special semantics
                         supportsSIMDScalarLoads = true;
@@ -6292,8 +6291,8 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                     // The integral overloads only take GPR/mem
                     assert(supportsSIMDScalarLoads == false);
 
-                    const unsigned expectedSize = genTypeSize(genActualType(containingNode->GetSimdBaseType()));
-                    const unsigned operandSize  = genTypeSize(node->TypeGet());
+                    const unsigned expectedSize = genTypeSize(genActualType(parentNode->GetSimdBaseType()));
+                    const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
                     supportsGeneralLoads = (operandSize >= expectedSize);
                     break;
@@ -6302,7 +6301,7 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                 case NI_AVX2_BroadcastScalarToVector128:
                 case NI_AVX2_BroadcastScalarToVector256:
                 {
-                    if (!containingNode->OperIsMemoryLoad())
+                    if (!parentNode->OperIsMemoryLoad())
                     {
                         // The containable form is the one that takes a SIMD value, that may be in memory.
                        supportsSIMDScalarLoads = true;
                         supportsGeneralLoads    = supportsSIMDScalarLoads;
                     }
@@ -6311,7 +6310,7 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                     else
                     {
                         // The memory form of this already takes a pointer and should be treated like a MemoryLoad
-                        supportsGeneralLoads = !node->OperIsHWIntrinsic();
+                        supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
                     }
                     break;
                 }
@@ -6325,10 +6324,10 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                 case NI_SSE2_X64_ConvertScalarToVector128Int64:
                 case NI_SSE2_X64_ConvertScalarToVector128UInt64:
                 {
-                    if (!varTypeIsIntegral(node->TypeGet()))
+                    if (!varTypeIsIntegral(childNode->TypeGet()))
                     {
                         // The floating-point overload doesn't require any special semantics
-                        assert(containingIntrinsicId == NI_SSE2_ConvertScalarToVector128Double);
+                        assert(parentIntrinsicId == NI_SSE2_ConvertScalarToVector128Double);
                         supportsSIMDScalarLoads = true;
                         supportsGeneralLoads    = supportsSIMDScalarLoads;
                         break;
@@ -6337,8 +6336,8 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
                     // The integral overloads only take GPR/mem
                     assert(supportsSIMDScalarLoads == false);
 
-                    const unsigned expectedSize = genTypeSize(genActualType(containingNode->GetSimdBaseType()));
-                    const unsigned operandSize  = genTypeSize(node->TypeGet());
+                    const unsigned expectedSize = genTypeSize(genActualType(parentNode->GetSimdBaseType()));
+                    const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
                     supportsGeneralLoads = (operandSize >= expectedSize);
                     break;
@@ -6357,20 +6356,20 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
         case HW_Category_Scalar:
         {
             // We should only get here for integral nodes.
-            assert(varTypeIsIntegral(node->TypeGet()));
+            assert(varTypeIsIntegral(childNode->TypeGet()));
 
             assert(supportsAlignedSIMDLoads == false);
             assert(supportsUnalignedSIMDLoads == false);
             assert(supportsSIMDScalarLoads == false);
 
-            unsigned       expectedSize = genTypeSize(containingNode->TypeGet());
-            const unsigned operandSize  = genTypeSize(node->TypeGet());
+            unsigned       expectedSize = genTypeSize(parentNode->TypeGet());
+            const unsigned operandSize  = genTypeSize(childNode->TypeGet());
 
             // CRC32 codegen depends on its second oprand's type.
             // Currently, we are using SIMDBaseType to store the op2Type info.
-            if (containingIntrinsicId == NI_SSE42_Crc32)
+            if (parentIntrinsicId == NI_SSE42_Crc32)
             {
-                var_types op2Type = containingNode->GetSimdBaseType();
+                var_types op2Type = parentNode->GetSimdBaseType();
                 expectedSize      = genTypeSize(op2Type);
             }
 
@@ -6388,40 +6387,28 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
         }
     }
 
-    bool isSafeToContainMem;
-
-    // Code motion safety checks
-    //
-    if (transparentParentNode != nullptr)
-    {
-        isSafeToContainMem = IsSafeToContainMem(containingNode, transparentParentNode, node);
-    }
-    else
-    {
-        isSafeToContainMem = IsSafeToContainMem(containingNode, node);
-    }
+    bool isSafeToContainMem = IsSafeToContainMem(parentNode, childNode);
 
     *supportsRegOptional = isSafeToContainMem && supportsGeneralLoads;
 
-    if (!node->OperIsHWIntrinsic())
+    if (!childNode->OperIsHWIntrinsic())
     {
         bool canBeContained = false;
 
         if (supportsGeneralLoads)
         {
-            if (IsContainableMemoryOp(node))
+            if (IsContainableMemoryOp(childNode))
             {
                 canBeContained = isSafeToContainMem;
             }
-            else if (node->IsCnsNonZeroFltOrDbl())
+            else if (childNode->IsCnsNonZeroFltOrDbl())
             {
                 // Always safe.
-
                 canBeContained = true;
             }
-            else if (node->IsCnsVec())
+            else if (childNode->IsCnsVec())
             {
-                GenTreeVecCon* vecCon = node->AsVecCon();
+                GenTreeVecCon* vecCon = childNode->AsVecCon();
                 canBeContained = !vecCon->IsAllBitsSet() && !vecCon->IsZero();
             }
         }
@@ -6431,7 +6418,7 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
 
         // TODO-XArch: Update this to be table driven, if possible.
 
-        GenTreeHWIntrinsic* hwintrinsic = node->AsHWIntrinsic();
+        GenTreeHWIntrinsic* hwintrinsic = childNode->AsHWIntrinsic();
         NamedIntrinsic      intrinsicId = hwintrinsic->GetHWIntrinsicId();
 
         switch (intrinsicId)
@@ -6441,34 +6428,37 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
             {
                 if (!supportsSIMDScalarLoads)
                 {
+                    // Nothing to do if the intrinsic doesn't support scalar loads
                     return false;
                 }
 
-                GenTree* op1                    = hwintrinsic->Op(1);
-                bool     op1SupportsRegOptional = false;
+                GenTree* op1 = hwintrinsic->Op(1);
 
-                if (!TryGetContainableHWIntrinsicOp(containingNode, &op1, &op1SupportsRegOptional, hwintrinsic))
+                if (IsSafeToContainMem(parentNode, hwintrinsic, op1))
                 {
-                    return false;
-                }
-
-                LIR::Use use;
-                if (!BlockRange().TryGetUse(node, &use) || (use.User() != containingNode))
-                {
-                    return false;
-                }
+                    if (op1->isContained())
+                    {
+                        // We have CreateScalarUnsafe where the underlying scalar is contained.
+                        // As such, we can contain the CreateScalarUnsafe and consume the value
+                        // directly in codegen.
 
-                // We have CreateScalarUnsafe where the underlying scalar is directly containable
-                // by containingNode. As such, we'll just remove CreateScalarUnsafe and consume
-                // the value directly.
+                        return true;
+                    }
 
-                use.ReplaceWith(op1);
-                BlockRange().Remove(node);
+                    if (op1->IsRegOptional() && varTypeIsFloating(op1))
+                    {
+                        // We have CreateScalarUnsafe where the underlying scalar was marked reg
+                        // optional. As such, we can contain the CreateScalarUnsafe and consume
+                        // the value directly in codegen.
+                        //
+                        // We only want to do this when op1 produces a floating-point value since that means
+                        // it will already be in a SIMD register in the scenario where it isn't spilled.
 
-                node = op1;
-                node->ClearContained();
+                        return true;
+                    }
+                }
 
-                return true;
+                return false;
             }
 
             case NI_SSE_LoadAlignedVector128:
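Flattened into one predicate, the CreateScalarUnsafe case above makes the following decision (toy shapes, not GenTree): the wrapper is containable only when the parent accepts SIMD scalar loads, code motion is safe, and op1 is either already contained or is a regOptional floating-point value, which lives in a SIMD register unless spilled.

    #include <cassert>

    struct ScalarOp
    {
        bool contained;
        bool regOptional;
        bool isFloating;
    };

    bool canContainCreateScalarUnsafe(bool supportsSIMDScalarLoads, bool isSafeToContainMem, ScalarOp op1)
    {
        if (!supportsSIMDScalarLoads || !isSafeToContainMem)
        {
            return false;
        }
        return op1.contained || (op1.regOptional && op1.isFloating);
    }

    int main()
    {
        assert(canContainCreateScalarUnsafe(true, true, { true, false, false }));  // contained op1
        assert(canContainCreateScalarUnsafe(true, true, { false, true, true }));   // regOptional float
        assert(!canContainCreateScalarUnsafe(true, true, { false, true, false })); // regOptional integer
        return 0;
    }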
@@ -6499,7 +6489,7 @@ bool Lowering::TryGetContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode
 
             default:
             {
-                assert(!node->isContainableHWIntrinsic());
+                assert(!childNode->isContainableHWIntrinsic());
                 return false;
             }
         }
@@ -6566,7 +6556,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
     // We want to handle GetElement still for Vector2/3
     if ((intrinsicId != NI_Vector128_GetElement) && (intrinsicId != NI_Vector256_GetElement))
     {
-        // TODO-XArch-CQ: Ideally we would key this off of the size containingNode
+        // TODO-XArch-CQ: Ideally we would key this off of the size the containing node
         // expects vs the size node actually is or would be if spilled to the stack
         return;
     }
@@ -6633,7 +6623,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                 // contained by the relevant store operation instead.
                return;
                 }
-
                 break;
             }
 
@@ -6643,12 +6632,51 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_AVX2_ConvertToVector256Int16:
         case NI_AVX2_ConvertToVector256Int32:
         case NI_AVX2_ConvertToVector256Int64:
-            if (!varTypeIsSIMD(op1))
+        {
+            if (node->OperIsMemoryLoad())
             {
                 ContainCheckHWIntrinsicAddr(node, op1);
                 return;
             }
             break;
+        }
+
+        case NI_AVX2_BroadcastScalarToVector128:
+        case NI_AVX2_BroadcastScalarToVector256:
+        {
+            if (node->OperIsMemoryLoad())
+            {
+                ContainCheckHWIntrinsicAddr(node, op1);
+                return;
+            }
+
+            if (varTypeIsIntegral(simdBaseType) && op1->OperIsHWIntrinsic())
+            {
+                GenTreeHWIntrinsic* childNode   = op1->AsHWIntrinsic();
+                NamedIntrinsic      childNodeId = childNode->GetHWIntrinsicId();
+
+                if ((childNodeId == NI_Vector128_CreateScalarUnsafe) ||
+                    (childNodeId == NI_Vector256_CreateScalarUnsafe))
+                {
+                    // We have a very special case of BroadcastScalarToVector(CreateScalarUnsafe(op1))
+                    //
+                    // This is one of the only instructions that supports taking integer types from
+                    // a SIMD register or directly as a scalar from memory. Most other instructions, in
+                    // comparison, take such values from general-purpose registers instead.
+                    //
+                    // Because of this, we're going to remove the CreateScalarUnsafe and try to contain
+                    // op1 directly. We'll then special-case the codegen to materialize the value into a
+                    // SIMD register in the case it is marked regOptional and doesn't get spilled.
+
+                    node->Op(1) = childNode->Op(1);
+                    BlockRange().Remove(op1);
+
+                    op1 = node->Op(1);
+                    op1->ClearContained();
+                }
+            }
+            break;
+        }
 
         default:
         {
@@ -6656,24 +6684,10 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
         }
     }
 
+    assert(!node->OperIsMemoryLoad());
     bool supportsRegOptional = false;
 
-    if (node->OperIsMemoryLoad())
-    {
-        // We have a few cases that can be potential memory loads
-
-        assert((intrinsicId == NI_SSE41_ConvertToVector128Int16) ||
-               (intrinsicId == NI_SSE41_ConvertToVector128Int32) ||
-               (intrinsicId == NI_SSE41_ConvertToVector128Int64) ||
-               (intrinsicId == NI_AVX2_BroadcastScalarToVector128) ||
-               (intrinsicId == NI_AVX2_BroadcastScalarToVector256) ||
-               (intrinsicId == NI_AVX2_ConvertToVector256Int16) ||
-               (intrinsicId == NI_AVX2_ConvertToVector256Int32) ||
-               (intrinsicId == NI_AVX2_ConvertToVector256Int64));
-
-        ContainCheckHWIntrinsicAddr(node, op1);
-    }
-    else if (TryGetContainableHWIntrinsicOp(node, &op1, &supportsRegOptional))
+    if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
     {
         MakeSrcContained(node, op1);
     }
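A toy model of the rewrite this case performs on the node tree; a strcmp on a name field stands in for the NamedIntrinsic check, and the real code also unlinks the removed node via BlockRange().Remove:

    #include <cassert>
    #include <cstring>

    struct N
    {
        const char* name;
        N*          op1;
        bool        contained;
    };

    // BroadcastScalarToVector(CreateScalarUnsafe(x)) becomes BroadcastScalarToVector(x),
    // with x's containment state cleared so the normal contain check can re-evaluate
    // it against the broadcast node itself.
    void lowerBroadcast(N& broadcast)
    {
        N* child = broadcast.op1;
        if ((child != nullptr) && (std::strcmp(child->name, "CreateScalarUnsafe") == 0))
        {
            broadcast.op1 = child->op1;       // node->Op(1) = childNode->Op(1)
            broadcast.op1->contained = false; // op1->ClearContained()
        }
    }

    int main()
    {
        N scalar    = { "LCL_VAR", nullptr, true };
        N wrapper   = { "CreateScalarUnsafe", &scalar, false };
        N broadcast = { "BroadcastScalarToVector128", &wrapper, false };

        lowerBroadcast(broadcast);

        assert(broadcast.op1 == &scalar);
        assert(!scalar.contained);
        return 0;
    }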
@@ -6729,13 +6743,13 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                 {
                     bool supportsRegOptional = false;
 
-                    if (TryGetContainableHWIntrinsicOp(node, &op2, &supportsRegOptional))
+                    if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                     {
                         MakeSrcContained(node, op2);
                     }
                     else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) ||
                               (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) &&
-                             TryGetContainableHWIntrinsicOp(node, &op1, &supportsRegOptional))
+                             IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                     {
                         MakeSrcContained(node, op1);
 
@@ -6781,7 +6795,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
                     if (!HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                     {
-                        if (TryGetContainableHWIntrinsicOp(node, &op2, &supportsRegOptional))
+                        if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                         {
                             MakeSrcContained(node, op2);
                         }
@@ -6803,7 +6817,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     {
                         // These intrinsics have op2 as an imm and op1 as a reg/mem
 
-                        if (TryGetContainableHWIntrinsicOp(node, &op1, &supportsRegOptional))
+                        if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                         {
                             MakeSrcContained(node, op1);
                         }
@@ -6831,7 +6845,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
                     if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                     {
-                        if (TryGetContainableHWIntrinsicOp(node, &op1, &supportsRegOptional))
+                        if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                         {
                             MakeSrcContained(node, op1);
                         }
@@ -6840,7 +6854,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                             MakeSrcRegOptional(node, op1);
                         }
                     }
-                    else if (TryGetContainableHWIntrinsicOp(node, &op2, &supportsRegOptional))
+                    else if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                     {
                         MakeSrcContained(node, op2);
                     }
@@ -6853,7 +6867,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
                 case NI_AES_KeygenAssist:
                 {
-                    if (TryGetContainableHWIntrinsicOp(node, &op1, &supportsRegOptional))
+                    if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                     {
                         MakeSrcContained(node, op1);
                     }
@@ -6981,19 +6995,18 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
                         // Set op regOptional only if none of them is containable.
 
                         // Prefer to make op3 contained,
-                        if (resultOpNum != 3 && TryGetContainableHWIntrinsicOp(node, &op3, &supportsOp3RegOptional))
+                        if (resultOpNum != 3 && IsContainableHWIntrinsicOp(node, op3, &supportsOp3RegOptional))
                         {
                             // result = (op1 * op2) + [op3]
                             MakeSrcContained(node, op3);
                         }
-                        else if (resultOpNum != 2 &&
-                                 TryGetContainableHWIntrinsicOp(node, &op2, &supportsOp2RegOptional))
+                        else if (resultOpNum != 2 && IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional))
                         {
                             // result = (op1 * [op2]) + op3
                             MakeSrcContained(node, op2);
                         }
                         else if (resultOpNum != 1 && !HWIntrinsicInfo::CopiesUpperBits(intrinsicId) &&
-                                 TryGetContainableHWIntrinsicOp(node, &op1, &supportsOp1RegOptional))
+                                 IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional))
                         {
                             // result = ([op1] * op2) + op3
                             MakeSrcContained(node, op1);
@@ -7023,7 +7036,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     case NI_AVX_BlendVariable:
                     case NI_AVX2_BlendVariable:
                     {
-                        if (TryGetContainableHWIntrinsicOp(node, &op2, &supportsRegOptional))
+                        if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                         {
                             MakeSrcContained(node, op2);
                         }
@@ -7036,7 +7049,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     case NI_AVXVNNI_MultiplyWideningAndAdd:
                     case NI_AVXVNNI_MultiplyWideningAndAddSaturate:
                    {
-                        if (TryGetContainableHWIntrinsicOp(node, &op3, &supportsRegOptional))
+                        if (IsContainableHWIntrinsicOp(node, op3, &supportsRegOptional))
                        {
                             MakeSrcContained(node, op3);
                         }
@@ -7049,11 +7062,11 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     case NI_BMI2_MultiplyNoFlags:
                     case NI_BMI2_X64_MultiplyNoFlags:
                     {
-                        if (TryGetContainableHWIntrinsicOp(node, &op2, &supportsRegOptional))
+                        if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                         {
                             MakeSrcContained(node, op2);
                         }
-                        else if (TryGetContainableHWIntrinsicOp(node, &op1, &supportsRegOptional))
+                        else if (IsContainableHWIntrinsicOp(node, op1, &supportsRegOptional))
                         {
                             MakeSrcContained(node, op1);
                             // MultiplyNoFlags is a Commutative operation, so swap the first two operands here
@@ -7106,7 +7119,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     case NI_AVX2_Permute2x128:
                     case NI_PCLMULQDQ_CarrylessMultiply:
                     {
-                        if (TryGetContainableHWIntrinsicOp(node, &op2, &supportsRegOptional))
+                        if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                        {
                             MakeSrcContained(node, op2);
                         }
@@ -7153,7 +7166,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     }
                 }
 
-                if (TryGetContainableHWIntrinsicOp(node, &op2, &supportsRegOptional))
+                if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
                 {
                     MakeSrcContained(node, op2);
                 }
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index d5510858a2b9f..b5eb68292ffc5 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -3221,8 +3221,7 @@ int LinearScan::BuildOperandUses(GenTree* node, regMaskTP candidates)
             assert(hwintrinsic->Op(2)->IsCnsIntOrI());
         }
 
-        BuildUse(hwintrinsic->Op(1), candidates);
-        return 1;
+        return BuildOperandUses(hwintrinsic->Op(1), candidates);
     }
 #endif // FEATURE_HW_INTRINSICS
 #ifdef TARGET_ARM64
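Finally, the lsrabuild.cpp change replaces a direct BuildUse with recursion because, after this patch, Op(1) can itself be a contained CreateScalarUnsafe wrapping a memory operand, and a contained node consumes no register of its own. A toy model of that behavior (hypothetical shapes, not the LSRA types):

    #include <cassert>
    #include <vector>

    struct T
    {
        const char*     name;
        bool            contained;
        std::vector<T*> ops;
    };

    // Contained nodes produce no register use themselves; recurse and build
    // uses for whatever underneath actually needs a register.
    int buildOperandUses(T* node, std::vector<const char*>& uses)
    {
        if (!node->contained)
        {
            uses.push_back(node->name); // BuildUse: this node occupies a register
            return 1;
        }
        int count = 0;
        for (T* op : node->ops)
        {
            count += buildOperandUses(op, uses);
        }
        return count;
    }

    int main()
    {
        T base = { "addr_base", false, {} };
        T load = { "IND", true, { &base } };                // contained load
        T csu  = { "CreateScalarUnsafe", true, { &load } }; // contained wrapper

        std::vector<const char*> uses;
        assert(buildOperandUses(&csu, uses) == 1); // only the address base needs a register
        assert(uses.size() == 1);
        return 0;
    }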