Diffstat (limited to 'client/asmjit/x86/x86internal.cpp')
-rw-r--r--   client/asmjit/x86/x86internal.cpp   1733
1 file changed, 1733 insertions, 0 deletions
diff --git a/client/asmjit/x86/x86internal.cpp b/client/asmjit/x86/x86internal.cpp new file mode 100644 index 0000000..062525f --- /dev/null +++ b/client/asmjit/x86/x86internal.cpp @@ -0,0 +1,1733 @@ +// AsmJit - Machine code generation for C++ +// +// * Official AsmJit Home Page: https://asmjit.com +// * Official Github Repository: https://github.com/asmjit/asmjit +// +// Copyright (c) 2008-2020 The AsmJit Authors +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgment in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#include "../core/api-build_p.h" +#ifdef ASMJIT_BUILD_X86 + +#include "../core/formatter.h" +#include "../core/string.h" +#include "../core/support.h" +#include "../core/type.h" +#include "../x86/x86internal_p.h" + +// Can be used for debugging... +// #define ASMJIT_DUMP_ARGS_ASSIGNMENT + +ASMJIT_BEGIN_SUB_NAMESPACE(x86) + +// ============================================================================ +// [asmjit::X86Internal - Helpers] +// ============================================================================ + +static ASMJIT_INLINE uint32_t x86GetXmmMovInst(const FuncFrame& frame) { + bool avx = frame.isAvxEnabled(); + bool aligned = frame.hasAlignedVecSR(); + + return aligned ? (avx ? Inst::kIdVmovaps : Inst::kIdMovaps) + : (avx ? Inst::kIdVmovups : Inst::kIdMovups); +} + +static ASMJIT_INLINE uint32_t x86VecTypeIdToRegType(uint32_t typeId) noexcept { + return typeId <= Type::_kIdVec128End ? Reg::kTypeXmm : + typeId <= Type::_kIdVec256End ? Reg::kTypeYmm : Reg::kTypeZmm; +} + +//! Converts `size` to a 'kmov?' instructio. +static inline uint32_t x86KmovFromSize(uint32_t size) noexcept { + switch (size) { + case 1: return Inst::kIdKmovb; + case 2: return Inst::kIdKmovw; + case 4: return Inst::kIdKmovd; + case 8: return Inst::kIdKmovq; + default: return Inst::kIdNone; + } +} + +// ============================================================================ +// [asmjit::X86Internal - FuncDetail] +// ============================================================================ + +ASMJIT_FAVOR_SIZE Error X86Internal::initFuncDetail(FuncDetail& func, const FuncSignature& signature, uint32_t registerSize) noexcept { + const CallConv& cc = func.callConv(); + uint32_t arch = cc.arch(); + uint32_t stackOffset = cc._spillZoneSize; + + uint32_t i; + uint32_t argCount = func.argCount(); + + if (func.retCount() != 0) { + uint32_t typeId = func._rets[0].typeId(); + switch (typeId) { + case Type::kIdI64: + case Type::kIdU64: { + if (Environment::is32Bit(arch)) { + // Convert a 64-bit return value to two 32-bit return values. + func._retCount = 2; + typeId -= 2; + + // 64-bit value is returned in EDX:EAX on X86. 
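+ // Lowering the TypeId by two turns kIdI64/kIdU64 into kIdI32/kIdU32, so each
+ // half is described as a 32-bit value: EAX receives the low 32 bits and EDX
+ // the high 32 bits.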
+ func._rets[0].initReg(Reg::kTypeGpd, Gp::kIdAx, typeId); + func._rets[1].initReg(Reg::kTypeGpd, Gp::kIdDx, typeId); + break; + } + else { + func._rets[0].initReg(Reg::kTypeGpq, Gp::kIdAx, typeId); + } + break; + } + + case Type::kIdI8: + case Type::kIdI16: + case Type::kIdI32: { + func._rets[0].initReg(Reg::kTypeGpd, Gp::kIdAx, Type::kIdI32); + break; + } + + case Type::kIdU8: + case Type::kIdU16: + case Type::kIdU32: { + func._rets[0].initReg(Reg::kTypeGpd, Gp::kIdAx, Type::kIdU32); + break; + } + + case Type::kIdF32: + case Type::kIdF64: { + uint32_t regType = Environment::is32Bit(arch) ? Reg::kTypeSt : Reg::kTypeXmm; + func._rets[0].initReg(regType, 0, typeId); + break; + } + + case Type::kIdF80: { + // 80-bit floats are always returned by FP0. + func._rets[0].initReg(Reg::kTypeSt, 0, typeId); + break; + } + + case Type::kIdMmx32: + case Type::kIdMmx64: { + // MM registers are returned through XMM (SystemV) or GPQ (Win64). + uint32_t regType = Reg::kTypeMm; + if (Environment::is64Bit(arch)) + regType = cc.strategy() == CallConv::kStrategyDefault ? Reg::kTypeXmm : Reg::kTypeGpq; + + func._rets[0].initReg(regType, 0, typeId); + break; + } + + default: { + func._rets[0].initReg(x86VecTypeIdToRegType(typeId), 0, typeId); + break; + } + } + } + + switch (cc.strategy()) { + case CallConv::kStrategyDefault: { + uint32_t gpzPos = 0; + uint32_t vecPos = 0; + + for (i = 0; i < argCount; i++) { + FuncValue& arg = func._args[i]; + uint32_t typeId = arg.typeId(); + + if (Type::isInt(typeId)) { + uint32_t regId = BaseReg::kIdBad; + + if (gpzPos < CallConv::kMaxRegArgsPerGroup) + regId = cc._passedOrder[Reg::kGroupGp].id[gpzPos]; + + if (regId != BaseReg::kIdBad) { + uint32_t regType = (typeId <= Type::kIdU32) ? Reg::kTypeGpd : Reg::kTypeGpq; + arg.assignRegData(regType, regId); + func.addUsedRegs(Reg::kGroupGp, Support::bitMask(regId)); + gpzPos++; + } + else { + uint32_t size = Support::max<uint32_t>(Type::sizeOf(typeId), registerSize); + arg.assignStackOffset(int32_t(stackOffset)); + stackOffset += size; + } + continue; + } + + if (Type::isFloat(typeId) || Type::isVec(typeId)) { + uint32_t regId = BaseReg::kIdBad; + + if (vecPos < CallConv::kMaxRegArgsPerGroup) + regId = cc._passedOrder[Reg::kGroupVec].id[vecPos]; + + if (Type::isFloat(typeId)) { + // If this is a float, but `kFlagPassFloatsByVec` is false, we have + // to use stack instead. This should be only used by 32-bit calling + // conventions. + if (!cc.hasFlag(CallConv::kFlagPassFloatsByVec)) + regId = BaseReg::kIdBad; + } + else { + // Pass vector registers via stack if this is a variable arguments + // function. This should be only used by 32-bit calling conventions. + if (signature.hasVarArgs() && cc.hasFlag(CallConv::kFlagPassVecByStackIfVA)) + regId = BaseReg::kIdBad; + } + + if (regId != BaseReg::kIdBad) { + arg.initTypeId(typeId); + arg.assignRegData(x86VecTypeIdToRegType(typeId), regId); + func.addUsedRegs(Reg::kGroupVec, Support::bitMask(regId)); + vecPos++; + } + else { + uint32_t size = Type::sizeOf(typeId); + arg.assignStackOffset(int32_t(stackOffset)); + stackOffset += size; + } + continue; + } + } + break; + } + + case CallConv::kStrategyX64Windows: + case CallConv::kStrategyX64VectorCall: { + // Both X64 and VectorCall behave similarly - arguments are indexed + // from left to right. The position of the argument determines in + // which register the argument is allocated, so it's either GP or + // one of XMM/YMM/ZMM registers. 
+ // + // [ X64 ] [VecCall] + // Index: #0 #1 #2 #3 #4 #5 + // + // GP : RCX RDX R8 R9 + // VEC : XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 + // + // For example function `f(int a, double b, int c, double d)` will be: + // + // (a) (b) (c) (d) + // RCX XMM1 R8 XMM3 + // + // Unused vector registers are used by HVA. + + bool isVectorCall = (cc.strategy() == CallConv::kStrategyX64VectorCall); + + for (i = 0; i < argCount; i++) { + FuncValue& arg = func._args[i]; + + uint32_t typeId = arg.typeId(); + uint32_t size = Type::sizeOf(typeId); + + if (Type::isInt(typeId) || Type::isMmx(typeId)) { + uint32_t regId = BaseReg::kIdBad; + + if (i < CallConv::kMaxRegArgsPerGroup) + regId = cc._passedOrder[Reg::kGroupGp].id[i]; + + if (regId != BaseReg::kIdBad) { + uint32_t regType = (size <= 4 && !Type::isMmx(typeId)) ? Reg::kTypeGpd : Reg::kTypeGpq; + arg.assignRegData(regType, regId); + func.addUsedRegs(Reg::kGroupGp, Support::bitMask(regId)); + } + else { + arg.assignStackOffset(int32_t(stackOffset)); + stackOffset += 8; + } + continue; + } + + if (Type::isFloat(typeId) || Type::isVec(typeId)) { + uint32_t regId = BaseReg::kIdBad; + + if (i < CallConv::kMaxRegArgsPerGroup) + regId = cc._passedOrder[Reg::kGroupVec].id[i]; + + if (regId != BaseReg::kIdBad) { + // X64-ABI doesn't allow vector types (XMM|YMM|ZMM) to be passed + // via registers, however, VectorCall was designed for that purpose. + if (Type::isFloat(typeId) || isVectorCall) { + uint32_t regType = x86VecTypeIdToRegType(typeId); + arg.assignRegData(regType, regId); + func.addUsedRegs(Reg::kGroupVec, Support::bitMask(regId)); + continue; + } + } + + // Passed via stack if the argument is float/double or indirectly. + // The trap is - if the argument is passed indirectly, the address + // can be passed via register, if the argument's index has GP one. + if (Type::isFloat(typeId)) { + arg.assignStackOffset(int32_t(stackOffset)); + } + else { + uint32_t gpRegId = cc._passedOrder[Reg::kGroupGp].id[i]; + if (gpRegId != BaseReg::kIdBad) + arg.assignRegData(Reg::kTypeGpq, gpRegId); + else + arg.assignStackOffset(int32_t(stackOffset)); + arg.addFlags(FuncValue::kFlagIsIndirect); + } + + // Always 8 bytes (float/double/pointer). + stackOffset += 8; + continue; + } + } + break; + } + } + + func._argStackSize = stackOffset; + return kErrorOk; +} + +// ============================================================================ +// [asmjit::X86FuncArgsContext] +// ============================================================================ + +static RegInfo x86GetRegForMemToMemMove(uint32_t arch, uint32_t dstTypeId, uint32_t srcTypeId) noexcept { + uint32_t dstSize = Type::sizeOf(dstTypeId); + uint32_t srcSize = Type::sizeOf(srcTypeId); + uint32_t maxSize = Support::max<uint32_t>(dstSize, srcSize); + uint32_t regSize = Environment::registerSizeFromArch(arch); + + uint32_t signature = 0; + if (maxSize <= regSize || (Type::isInt(dstTypeId) && Type::isInt(srcTypeId))) + signature = maxSize <= 4 ? Gpd::kSignature : Gpq::kSignature; + else if (maxSize <= 16) + signature = Xmm::kSignature; + else if (maxSize <= 32) + signature = Ymm::kSignature; + else if (maxSize <= 64) + signature = Zmm::kSignature; + + return RegInfo { signature }; +} + +// Used by both `argsToFuncFrame()` and `emitArgsAssignment()`. +class X86FuncArgsContext { +public: + enum VarId : uint32_t { + kVarIdNone = 0xFF + }; + + //! Contains information about a single argument or SA register that may need shuffling. 
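+ //!
+ //! `cur` describes where the value currently lives (initially the location
+ //! assigned by the calling convention) and `out` describes where the
+ //! assignment wants it; the var is marked done once the two match.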
+ struct Var { + inline void init(const FuncValue& cur_, const FuncValue& out_) noexcept { + cur = cur_; + out = out_; + } + + //! Reset the value to its unassigned state. + inline void reset() noexcept { + cur.reset(); + out.reset(); + } + + inline bool isDone() const noexcept { return cur.isDone(); } + inline void markDone() noexcept { cur.addFlags(FuncValue::kFlagIsDone); } + + FuncValue cur; + FuncValue out; + }; + + struct WorkData { + inline void reset() noexcept { + _archRegs = 0; + _workRegs = 0; + _usedRegs = 0; + _assignedRegs = 0; + _dstRegs = 0; + _dstShuf = 0; + _numSwaps = 0; + _numStackArgs = 0; + memset(_reserved, 0, sizeof(_reserved)); + memset(_physToVarId, kVarIdNone, 32); + } + + inline bool isAssigned(uint32_t regId) const noexcept { + ASMJIT_ASSERT(regId < 32); + return Support::bitTest(_assignedRegs, regId); + } + + inline void assign(uint32_t varId, uint32_t regId) noexcept { + ASMJIT_ASSERT(!isAssigned(regId)); + ASMJIT_ASSERT(_physToVarId[regId] == kVarIdNone); + + _physToVarId[regId] = uint8_t(varId); + _assignedRegs ^= Support::bitMask(regId); + } + + inline void reassign(uint32_t varId, uint32_t newId, uint32_t oldId) noexcept { + ASMJIT_ASSERT( isAssigned(oldId)); + ASMJIT_ASSERT(!isAssigned(newId)); + ASMJIT_ASSERT(_physToVarId[oldId] == varId); + ASMJIT_ASSERT(_physToVarId[newId] == kVarIdNone); + + _physToVarId[oldId] = uint8_t(kVarIdNone); + _physToVarId[newId] = uint8_t(varId); + _assignedRegs ^= Support::bitMask(newId) ^ Support::bitMask(oldId); + } + + inline void swap(uint32_t aVarId, uint32_t aRegId, uint32_t bVarId, uint32_t bRegId) noexcept { + ASMJIT_ASSERT(isAssigned(aRegId)); + ASMJIT_ASSERT(isAssigned(bRegId)); + ASMJIT_ASSERT(_physToVarId[aRegId] == aVarId); + ASMJIT_ASSERT(_physToVarId[bRegId] == bVarId); + + _physToVarId[aRegId] = uint8_t(bVarId); + _physToVarId[bRegId] = uint8_t(aVarId); + } + + inline void unassign(uint32_t varId, uint32_t regId) noexcept { + ASMJIT_ASSERT(isAssigned(regId)); + ASMJIT_ASSERT(_physToVarId[regId] == varId); + + DebugUtils::unused(varId); + _physToVarId[regId] = uint8_t(kVarIdNone); + _assignedRegs ^= Support::bitMask(regId); + } + + inline uint32_t archRegs() const noexcept { return _archRegs; } + inline uint32_t workRegs() const noexcept { return _workRegs; } + inline uint32_t usedRegs() const noexcept { return _usedRegs; } + inline uint32_t assignedRegs() const noexcept { return _assignedRegs; } + inline uint32_t dstRegs() const noexcept { return _dstRegs; } + inline uint32_t availableRegs() const noexcept { return _workRegs & ~_assignedRegs; } + + uint32_t _archRegs; //!< All allocable registers provided by the architecture. + uint32_t _workRegs; //!< All registers that can be used by the shuffler. + uint32_t _usedRegs; //!< Registers used by the shuffler (all). + uint32_t _assignedRegs; //!< Assigned registers. + uint32_t _dstRegs; //!< Destination registers assigned to arguments or SA. + uint32_t _dstShuf; //!< Destination registers that require shuffling. + uint8_t _numSwaps; //!< Number of register swaps. + uint8_t _numStackArgs; //!< Number of stack loads. + uint8_t _reserved[6]; //!< Reserved (only used as padding). + uint8_t _physToVarId[32]; //!< Physical ID to variable ID mapping. + }; + + uint8_t _arch; + bool _hasStackSrc; //!< Has arguments passed via stack (SRC). + bool _hasPreservedFP; //!< Has preserved frame-pointer (FP). + uint8_t _stackDstMask; //!< Has arguments assigned to stack (DST). + uint8_t _regSwapsMask; //!< Register swap groups (bit-mask). 
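+ //! Var ID of the variable created for the stack-arguments base (SA) register,
+ //! or `kVarIdNone` if no such variable was needed.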
+ uint8_t _saVarId; + uint32_t _varCount; + WorkData _workData[BaseReg::kGroupVirt]; + Var _vars[kFuncArgCountLoHi + 1]; + + X86FuncArgsContext() noexcept; + + inline uint32_t arch() const noexcept { return _arch; } + inline uint32_t varCount() const noexcept { return _varCount; } + + inline Var& var(size_t varId) noexcept { return _vars[varId]; } + inline const Var& var(size_t varId) const noexcept { return _vars[varId]; } + inline size_t indexOf(const Var* var) const noexcept { return (size_t)(var - _vars); } + + Error initWorkData(const FuncFrame& frame, const FuncArgsAssignment& args) noexcept; + Error markScratchRegs(FuncFrame& frame) noexcept; + Error markDstRegsDirty(FuncFrame& frame) noexcept; + Error markStackArgsReg(FuncFrame& frame) noexcept; +}; + +X86FuncArgsContext::X86FuncArgsContext() noexcept { + _arch = Environment::kArchUnknown; + _varCount = 0; + _hasStackSrc = false; + _hasPreservedFP = false; + _stackDstMask = 0; + _regSwapsMask = 0; + _saVarId = kVarIdNone; + + for (uint32_t group = 0; group < BaseReg::kGroupVirt; group++) + _workData[group].reset(); +} + +ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::initWorkData(const FuncFrame& frame, const FuncArgsAssignment& args) noexcept { + // The code has to be updated if this changes. + ASMJIT_ASSERT(BaseReg::kGroupVirt == 4); + + uint32_t i; + const FuncDetail& func = *args.funcDetail(); + + // Initialize Architecture. + uint32_t arch = func.callConv().arch(); + uint32_t archRegCount = Environment::is32Bit(arch) ? 8 : 16; + + _arch = uint8_t(arch); + + // Initialize `_archRegs`. + _workData[Reg::kGroupGp ]._archRegs = Support::lsbMask<uint32_t>(archRegCount) & ~Support::bitMask(Gp::kIdSp); + _workData[Reg::kGroupVec ]._archRegs = Support::lsbMask<uint32_t>(archRegCount); + _workData[Reg::kGroupMm ]._archRegs = Support::lsbMask<uint32_t>(8); + _workData[Reg::kGroupKReg]._archRegs = Support::lsbMask<uint32_t>(8); + + if (frame.hasPreservedFP()) + _workData[Reg::kGroupGp]._archRegs &= ~Support::bitMask(Gp::kIdBp); + + // Extract information from all function arguments/assignments and build Var[] array. + uint32_t varId = 0; + for (i = 0; i < kFuncArgCountLoHi; i++) { + const FuncValue& dst_ = args.arg(i); + if (!dst_.isAssigned()) + continue; + + const FuncValue& src_ = func.arg(i); + if (ASMJIT_UNLIKELY(!src_.isAssigned())) + return DebugUtils::errored(kErrorInvalidState); + + Var& var = _vars[varId]; + var.init(src_, dst_); + + FuncValue& src = var.cur; + FuncValue& dst = var.out; + + uint32_t dstGroup = 0xFFFFFFFFu; + uint32_t dstId = BaseReg::kIdBad; + WorkData* dstWd = nullptr; + + // Not supported. + if (src.isIndirect()) + return DebugUtils::errored(kErrorInvalidAssignment); + + if (dst.isReg()) { + uint32_t dstType = dst.regType(); + if (ASMJIT_UNLIKELY(dstType >= Reg::kTypeCount)) + return DebugUtils::errored(kErrorInvalidRegType); + + // Copy TypeId from source if the destination doesn't have it. The RA + // used by BaseCompiler would never leave TypeId undefined, but users + // of FuncAPI can just assign phys regs without specifying the type. 
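+ // For example, an argument assigned to `eax` with no TypeId set gets a 32-bit
+ // integer TypeId inferred from the GPD register type.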
+ if (!dst.hasTypeId()) + dst.setTypeId(Reg::typeIdOf(dst.regType())); + + dstGroup = Reg::groupOf(dstType); + if (ASMJIT_UNLIKELY(dstGroup >= BaseReg::kGroupVirt)) + return DebugUtils::errored(kErrorInvalidRegGroup); + + dstWd = &_workData[dstGroup]; + dstId = dst.regId(); + if (ASMJIT_UNLIKELY(dstId >= 32 || !Support::bitTest(dstWd->archRegs(), dstId))) + return DebugUtils::errored(kErrorInvalidPhysId); + + if (ASMJIT_UNLIKELY(Support::bitTest(dstWd->dstRegs(), dstId))) + return DebugUtils::errored(kErrorOverlappedRegs); + + dstWd->_dstRegs |= Support::bitMask(dstId); + dstWd->_dstShuf |= Support::bitMask(dstId); + dstWd->_usedRegs |= Support::bitMask(dstId); + } + else { + if (!dst.hasTypeId()) + dst.setTypeId(src.typeId()); + + RegInfo regInfo = x86GetRegForMemToMemMove(arch, dst.typeId(), src.typeId()); + if (ASMJIT_UNLIKELY(!regInfo.isValid())) + return DebugUtils::errored(kErrorInvalidState); + _stackDstMask = uint8_t(_stackDstMask | Support::bitMask(regInfo.group())); + } + + if (src.isReg()) { + uint32_t srcId = src.regId(); + uint32_t srcGroup = Reg::groupOf(src.regType()); + + if (dstGroup == srcGroup) { + dstWd->assign(varId, srcId); + + // The best case, register is allocated where it is expected to be. + if (dstId == srcId) + var.markDone(); + } + else { + if (ASMJIT_UNLIKELY(srcGroup >= BaseReg::kGroupVirt)) + return DebugUtils::errored(kErrorInvalidState); + + WorkData& srcData = _workData[srcGroup]; + srcData.assign(varId, srcId); + } + } + else { + if (dstWd) + dstWd->_numStackArgs++; + _hasStackSrc = true; + } + + varId++; + } + + // Initialize WorkData::workRegs. + for (i = 0; i < BaseReg::kGroupVirt; i++) + _workData[i]._workRegs = (_workData[i].archRegs() & (frame.dirtyRegs(i) | ~frame.preservedRegs(i))) | _workData[i].dstRegs() | _workData[i].assignedRegs(); + + // Create a variable that represents `SARegId` if necessary. + bool saRegRequired = _hasStackSrc && frame.hasDynamicAlignment() && !frame.hasPreservedFP(); + + WorkData& gpRegs = _workData[BaseReg::kGroupGp]; + uint32_t saCurRegId = frame.saRegId(); + uint32_t saOutRegId = args.saRegId(); + + if (saCurRegId != BaseReg::kIdBad) { + // Check if the provided `SARegId` doesn't collide with input registers. + if (ASMJIT_UNLIKELY(gpRegs.isAssigned(saCurRegId))) + return DebugUtils::errored(kErrorOverlappedRegs); + } + + if (saOutRegId != BaseReg::kIdBad) { + // Check if the provided `SARegId` doesn't collide with argument assignments. + if (ASMJIT_UNLIKELY(Support::bitTest(gpRegs.dstRegs(), saOutRegId))) + return DebugUtils::errored(kErrorOverlappedRegs); + saRegRequired = true; + } + + if (saRegRequired) { + uint32_t ptrTypeId = Environment::is32Bit(arch) ? Type::kIdU32 : Type::kIdU64; + uint32_t ptrRegType = Environment::is32Bit(arch) ? 
BaseReg::kTypeGp32 : BaseReg::kTypeGp64; + + _saVarId = uint8_t(varId); + _hasPreservedFP = frame.hasPreservedFP(); + + Var& var = _vars[varId]; + var.reset(); + + if (saCurRegId == BaseReg::kIdBad) { + if (saOutRegId != BaseReg::kIdBad && !gpRegs.isAssigned(saOutRegId)) { + saCurRegId = saOutRegId; + } + else { + uint32_t availableRegs = gpRegs.availableRegs(); + if (!availableRegs) + availableRegs = gpRegs.archRegs() & ~gpRegs.workRegs(); + + if (ASMJIT_UNLIKELY(!availableRegs)) + return DebugUtils::errored(kErrorNoMorePhysRegs); + + saCurRegId = Support::ctz(availableRegs); + } + } + + var.cur.initReg(ptrRegType, saCurRegId, ptrTypeId); + gpRegs.assign(varId, saCurRegId); + gpRegs._workRegs |= Support::bitMask(saCurRegId); + + if (saOutRegId != BaseReg::kIdBad) { + var.out.initReg(ptrRegType, saOutRegId, ptrTypeId); + gpRegs._dstRegs |= Support::bitMask(saOutRegId); + gpRegs._workRegs |= Support::bitMask(saOutRegId); + } + else { + var.markDone(); + } + + varId++; + } + + _varCount = varId; + + // Detect register swaps. + for (varId = 0; varId < _varCount; varId++) { + Var& var = _vars[varId]; + if (var.cur.isReg() && var.out.isReg()) { + uint32_t srcId = var.cur.regId(); + uint32_t dstId = var.out.regId(); + + uint32_t group = Reg::groupOf(var.cur.regType()); + if (group != Reg::groupOf(var.out.regType())) + continue; + + WorkData& wd = _workData[group]; + if (wd.isAssigned(dstId)) { + Var& other = _vars[wd._physToVarId[dstId]]; + if (Reg::groupOf(other.out.regType()) == group && other.out.regId() == srcId) { + wd._numSwaps++; + _regSwapsMask = uint8_t(_regSwapsMask | Support::bitMask(group)); + } + } + } + } + + return kErrorOk; +} + +ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markDstRegsDirty(FuncFrame& frame) noexcept { + for (uint32_t i = 0; i < BaseReg::kGroupVirt; i++) { + WorkData& wd = _workData[i]; + uint32_t regs = wd.usedRegs() | wd._dstShuf; + + wd._workRegs |= regs; + frame.addDirtyRegs(i, regs); + } + + return kErrorOk; +} + +ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markScratchRegs(FuncFrame& frame) noexcept { + uint32_t groupMask = 0; + + // Handle stack to stack moves. + groupMask |= _stackDstMask; + + // Handle register swaps. + groupMask |= _regSwapsMask & ~Support::bitMask(BaseReg::kGroupGp); + + if (!groupMask) + return kErrorOk; + + // Selects one dirty register per affected group that can be used as a scratch register. + for (uint32_t group = 0; group < BaseReg::kGroupVirt; group++) { + if (Support::bitTest(groupMask, group)) { + WorkData& wd = _workData[group]; + + // Initially, pick some clobbered or dirty register. + uint32_t workRegs = wd.workRegs(); + uint32_t regs = workRegs & ~(wd.usedRegs() | wd._dstShuf); + + // If that didn't work out pick some register which is not in 'used'. + if (!regs) + regs = workRegs & ~wd.usedRegs(); + + // If that didn't work out pick any other register that is allocable. + // This last resort case will, however, result in marking one more + // register dirty. + if (!regs) + regs = wd.archRegs() & ~workRegs; + + // If that didn't work out we will have to use XORs instead of MOVs. 
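+ // (no scratch register is reserved for the group in that case).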
+ if (!regs) + continue; + + uint32_t regMask = Support::blsi(regs); + wd._workRegs |= regMask; + frame.addDirtyRegs(group, regMask); + } + } + + return kErrorOk; +} + +ASMJIT_FAVOR_SIZE Error X86FuncArgsContext::markStackArgsReg(FuncFrame& frame) noexcept { + if (_saVarId != kVarIdNone) { + const Var& var = _vars[_saVarId]; + frame.setSARegId(var.cur.regId()); + } + else if (frame.hasPreservedFP()) { + // Always EBP|RBP if the frame-pointer isn't omitted. + frame.setSARegId(Gp::kIdBp); + } + + return kErrorOk; +} + +// ============================================================================ +// [asmjit::X86Internal - FrameLayout] +// ============================================================================ + +ASMJIT_FAVOR_SIZE Error X86Internal::initFuncFrame(FuncFrame& frame, const FuncDetail& func) noexcept { + uint32_t arch = func.callConv().arch(); + + // Initializing FuncFrame means making a copy of some properties of `func`. + // Properties like `_localStackSize` will be set by the user before the frame + // is finalized. + frame.reset(); + + frame._arch = uint8_t(arch); + frame._spRegId = Gp::kIdSp; + frame._saRegId = Gp::kIdBad; + + uint32_t naturalStackAlignment = func.callConv().naturalStackAlignment(); + uint32_t minDynamicAlignment = Support::max<uint32_t>(naturalStackAlignment, 16); + + if (minDynamicAlignment == naturalStackAlignment) + minDynamicAlignment <<= 1; + + frame._naturalStackAlignment = uint8_t(naturalStackAlignment); + frame._minDynamicAlignment = uint8_t(minDynamicAlignment); + frame._redZoneSize = uint8_t(func.redZoneSize()); + frame._spillZoneSize = uint8_t(func.spillZoneSize()); + frame._finalStackAlignment = uint8_t(frame._naturalStackAlignment); + + if (func.hasFlag(CallConv::kFlagCalleePopsStack)) { + frame._calleeStackCleanup = uint16_t(func.argStackSize()); + } + + // Initial masks of dirty and preserved registers. + for (uint32_t group = 0; group < BaseReg::kGroupVirt; group++) { + frame._dirtyRegs[group] = func.usedRegs(group); + frame._preservedRegs[group] = func.preservedRegs(group); + } + + // Exclude ESP/RSP - this register is never included in saved GP regs. + frame._preservedRegs[BaseReg::kGroupGp] &= ~Support::bitMask(Gp::kIdSp); + + return kErrorOk; +} + +ASMJIT_FAVOR_SIZE Error X86Internal::finalizeFuncFrame(FuncFrame& frame) noexcept { + uint32_t registerSize = Environment::registerSizeFromArch(frame.arch()); + + // The final stack alignment must be updated accordingly to call and local stack alignments. + uint32_t stackAlignment = frame._finalStackAlignment; + ASMJIT_ASSERT(stackAlignment == Support::max(frame._naturalStackAlignment, + frame._callStackAlignment, + frame._localStackAlignment)); + + // TODO: Must be configurable. + uint32_t vecSize = 16; + + bool hasFP = frame.hasPreservedFP(); + bool hasDA = frame.hasDynamicAlignment(); + + // Include EBP|RBP if the function preserves the frame-pointer. + if (hasFP) + frame._dirtyRegs[Reg::kGroupGp] |= Support::bitMask(Gp::kIdBp); + + // These two are identical if the function doesn't align its stack dynamically. + uint32_t saRegId = frame.saRegId(); + if (saRegId == BaseReg::kIdBad) + saRegId = Gp::kIdSp; + + // Fix stack arguments base-register from ESP|RSP to EBP|RBP in case it was + // not picked before and the function performs dynamic stack alignment. + if (hasDA && saRegId == Gp::kIdSp) + saRegId = Gp::kIdBp; + + // Mark as dirty any register but ESP|RSP if used as SA pointer. 
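+ // For example, if EBP|RBP is picked as the SA base here, it becomes part of
+ // the save/restore sequence even when the function body never clobbers it.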
+ if (saRegId != Gp::kIdSp) + frame._dirtyRegs[Reg::kGroupGp] |= Support::bitMask(saRegId); + + frame._spRegId = uint8_t(Gp::kIdSp); + frame._saRegId = uint8_t(saRegId); + + // Setup stack size used to save preserved registers. + frame._gpSaveSize = uint16_t(Support::popcnt(frame.savedRegs(Reg::kGroupGp )) * registerSize); + frame._nonGpSaveSize = uint16_t(Support::popcnt(frame.savedRegs(Reg::kGroupVec )) * vecSize + + Support::popcnt(frame.savedRegs(Reg::kGroupMm )) * 8 + + Support::popcnt(frame.savedRegs(Reg::kGroupKReg)) * 8); + + uint32_t v = 0; // The beginning of the stack frame relative to SP after prolog. + v += frame.callStackSize(); // Count 'callStackSize' <- This is used to call functions. + v = Support::alignUp(v, stackAlignment); // Align to function's stack alignment. + + frame._localStackOffset = v; // Store 'localStackOffset' <- Function's local stack starts here. + v += frame.localStackSize(); // Count 'localStackSize' <- Function's local stack ends here. + + // If the function's stack must be aligned, calculate the alignment necessary + // to store vector registers, and set `FuncFrame::kAttrAlignedVecSR` to inform + // PEI that it can use instructions that perform aligned stores/loads. + if (stackAlignment >= vecSize && frame._nonGpSaveSize) { + frame.addAttributes(FuncFrame::kAttrAlignedVecSR); + v = Support::alignUp(v, vecSize); // Align '_nonGpSaveOffset'. + } + + frame._nonGpSaveOffset = v; // Store '_nonGpSaveOffset' <- Non-GP Save/Restore starts here. + v += frame._nonGpSaveSize; // Count '_nonGpSaveSize' <- Non-GP Save/Restore ends here. + + // Calculate if dynamic alignment (DA) slot (stored as offset relative to SP) is required and its offset. + if (hasDA && !hasFP) { + frame._daOffset = v; // Store 'daOffset' <- DA pointer would be stored here. + v += registerSize; // Count 'daOffset'. + } + else { + frame._daOffset = FuncFrame::kTagInvalidOffset; + } + + // The return address should be stored after GP save/restore regs. It has + // the same size as `registerSize` (basically the native register/pointer + // size). We don't adjust it now as `v` now contains the exact size that the + // function requires to adjust (call frame + stack frame, vec stack size). + // The stack (if we consider this size) is misaligned now, as it's always + // aligned before the function call - when `call()` is executed it pushes + // the current EIP|RIP onto the stack, and misaligns it by 12 or 8 bytes + // (depending on the architecture). So count number of bytes needed to align + // it up to the function's CallFrame (the beginning). + if (v || frame.hasFuncCalls()) + v += Support::alignUpDiff(v + frame.gpSaveSize() + registerSize, stackAlignment); + + frame._gpSaveOffset = v; // Store 'gpSaveOffset' <- Function's GP Save/Restore starts here. + frame._stackAdjustment = v; // Store 'stackAdjustment' <- SA used by 'add zsp, SA' and 'sub zsp, SA'. + + v += frame._gpSaveSize; // Count 'gpSaveSize' <- Function's GP Save/Restore ends here. + v += registerSize; // Count 'ReturnAddress' <- As CALL pushes onto stack. + + // If the function performs dynamic stack alignment then the stack-adjustment must be aligned. + if (hasDA) + frame._stackAdjustment = Support::alignUp(frame._stackAdjustment, stackAlignment); + + uint32_t saInvOff = FuncFrame::kTagInvalidOffset; + uint32_t saTmpOff = registerSize + frame._gpSaveSize; + + // Calculate where the function arguments start relative to SP. + frame._saOffsetFromSP = hasDA ? 
saInvOff : v; + + // Calculate where the function arguments start relative to FP or user-provided register. + frame._saOffsetFromSA = hasFP ? registerSize * 2 // Return address + frame pointer. + : saTmpOff; // Return address + all saved GP regs. + + return kErrorOk; +} + +// ============================================================================ +// [asmjit::X86Internal - ArgsToFrameInfo] +// ============================================================================ + +ASMJIT_FAVOR_SIZE Error X86Internal::argsToFuncFrame(const FuncArgsAssignment& args, FuncFrame& frame) noexcept { + X86FuncArgsContext ctx; + ASMJIT_PROPAGATE(ctx.initWorkData(frame, args)); + ASMJIT_PROPAGATE(ctx.markDstRegsDirty(frame)); + ASMJIT_PROPAGATE(ctx.markScratchRegs(frame)); + ASMJIT_PROPAGATE(ctx.markStackArgsReg(frame)); + return kErrorOk; +} + +// ============================================================================ +// [asmjit::X86Internal - Emit Helpers] +// ============================================================================ + +ASMJIT_FAVOR_SIZE Error X86Internal::emitRegMove(Emitter* emitter, + const Operand_& dst_, + const Operand_& src_, uint32_t typeId, bool avxEnabled, const char* comment) { + + // Invalid or abstract TypeIds are not allowed. + ASMJIT_ASSERT(Type::isValid(typeId) && !Type::isAbstract(typeId)); + + Operand dst(dst_); + Operand src(src_); + + uint32_t instId = Inst::kIdNone; + uint32_t memFlags = 0; + uint32_t overrideMemSize = 0; + + enum MemFlags : uint32_t { + kDstMem = 0x1, + kSrcMem = 0x2 + }; + + // Detect memory operands and patch them to have the same size as the register. + // BaseCompiler always sets memory size of allocs and spills, so it shouldn't + // be really necessary, however, after this function was separated from Compiler + // it's better to make sure that the size is always specified, as we can use + // 'movzx' and 'movsx' that rely on it. + if (dst.isMem()) { memFlags |= kDstMem; dst.as<Mem>().setSize(src.size()); } + if (src.isMem()) { memFlags |= kSrcMem; src.as<Mem>().setSize(dst.size()); } + + switch (typeId) { + case Type::kIdI8: + case Type::kIdU8: + case Type::kIdI16: + case Type::kIdU16: + // Special case - 'movzx' load. + if (memFlags & kSrcMem) { + instId = Inst::kIdMovzx; + dst.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + } + else if (!memFlags) { + // Change both destination and source registers to GPD (safer, no dependencies). + dst.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + src.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + } + ASMJIT_FALLTHROUGH; + + case Type::kIdI32: + case Type::kIdU32: + case Type::kIdI64: + case Type::kIdU64: + instId = Inst::kIdMov; + break; + + case Type::kIdMmx32: + instId = Inst::kIdMovd; + if (memFlags) break; + ASMJIT_FALLTHROUGH; + + case Type::kIdMmx64 : instId = Inst::kIdMovq ; break; + case Type::kIdMask8 : instId = Inst::kIdKmovb; break; + case Type::kIdMask16: instId = Inst::kIdKmovw; break; + case Type::kIdMask32: instId = Inst::kIdKmovd; break; + case Type::kIdMask64: instId = Inst::kIdKmovq; break; + + default: { + uint32_t elementTypeId = Type::baseOf(typeId); + if (Type::isVec32(typeId) && memFlags) { + overrideMemSize = 4; + if (elementTypeId == Type::kIdF32) + instId = avxEnabled ? Inst::kIdVmovss : Inst::kIdMovss; + else + instId = avxEnabled ? Inst::kIdVmovd : Inst::kIdMovd; + break; + } + + if (Type::isVec64(typeId) && memFlags) { + overrideMemSize = 8; + if (elementTypeId == Type::kIdF64) + instId = avxEnabled ? 
Inst::kIdVmovsd : Inst::kIdMovsd; + else + instId = avxEnabled ? Inst::kIdVmovq : Inst::kIdMovq; + break; + } + + if (elementTypeId == Type::kIdF32) + instId = avxEnabled ? Inst::kIdVmovaps : Inst::kIdMovaps; + else if (elementTypeId == Type::kIdF64) + instId = avxEnabled ? Inst::kIdVmovapd : Inst::kIdMovapd; + else if (typeId <= Type::_kIdVec256End) + instId = avxEnabled ? Inst::kIdVmovdqa : Inst::kIdMovdqa; + else if (elementTypeId <= Type::kIdU32) + instId = Inst::kIdVmovdqa32; + else + instId = Inst::kIdVmovdqa64; + break; + } + } + + if (!instId) + return DebugUtils::errored(kErrorInvalidState); + + if (overrideMemSize) { + if (dst.isMem()) dst.as<Mem>().setSize(overrideMemSize); + if (src.isMem()) src.as<Mem>().setSize(overrideMemSize); + } + + emitter->setInlineComment(comment); + return emitter->emit(instId, dst, src); +} + +ASMJIT_FAVOR_SIZE Error X86Internal::emitArgMove(Emitter* emitter, + const Reg& dst_, uint32_t dstTypeId, + const Operand_& src_, uint32_t srcTypeId, bool avxEnabled, const char* comment) { + + // Deduce optional `dstTypeId`, which may be `Type::kIdVoid` in some cases. + if (!dstTypeId) + dstTypeId = opData.archRegs.regTypeToTypeId[dst_.type()]; + + // Invalid or abstract TypeIds are not allowed. + ASMJIT_ASSERT(Type::isValid(dstTypeId) && !Type::isAbstract(dstTypeId)); + ASMJIT_ASSERT(Type::isValid(srcTypeId) && !Type::isAbstract(srcTypeId)); + + Reg dst(dst_); + Operand src(src_); + + uint32_t dstSize = Type::sizeOf(dstTypeId); + uint32_t srcSize = Type::sizeOf(srcTypeId); + + uint32_t instId = Inst::kIdNone; + + // Not a real loop, just 'break' is nicer than 'goto'. + for (;;) { + if (Type::isInt(dstTypeId)) { + if (Type::isInt(srcTypeId)) { + instId = Inst::kIdMovsx; + uint32_t typeOp = (dstTypeId << 8) | srcTypeId; + + // Sign extend by using 'movsx'. + if (typeOp == ((Type::kIdI16 << 8) | Type::kIdI8 ) || + typeOp == ((Type::kIdI32 << 8) | Type::kIdI8 ) || + typeOp == ((Type::kIdI32 << 8) | Type::kIdI16) || + typeOp == ((Type::kIdI64 << 8) | Type::kIdI8 ) || + typeOp == ((Type::kIdI64 << 8) | Type::kIdI16)) + break; + + // Sign extend by using 'movsxd'. + instId = Inst::kIdMovsxd; + if (typeOp == ((Type::kIdI64 << 8) | Type::kIdI32)) + break; + } + + if (Type::isInt(srcTypeId) || src_.isMem()) { + // Zero extend by using 'movzx' or 'mov'. + if (dstSize <= 4 && srcSize < 4) { + instId = Inst::kIdMovzx; + dst.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + } + else { + // We should have caught all possibilities where `srcSize` is less + // than 4, so we don't have to worry about 'movzx' anymore. Minimum + // size is enough to determine if we want 32-bit or 64-bit move. + instId = Inst::kIdMov; + srcSize = Support::min(srcSize, dstSize); + + dst.setSignature(srcSize == 4 ? Reg::signatureOfT<Reg::kTypeGpd>() + : Reg::signatureOfT<Reg::kTypeGpq>()); + if (src.isReg()) + src.setSignature(dst.signature()); + } + break; + } + + // NOTE: The previous branch caught all memory sources, from here it's + // always register to register conversion, so catch the remaining cases. + srcSize = Support::min(srcSize, dstSize); + + if (Type::isMmx(srcTypeId)) { + // 64-bit move. + instId = Inst::kIdMovq; + if (srcSize == 8) + break; + + // 32-bit move. + instId = Inst::kIdMovd; + dst.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + break; + } + + if (Type::isMask(srcTypeId)) { + instId = x86KmovFromSize(srcSize); + dst.setSignature(srcSize <= 4 ? 
Reg::signatureOfT<Reg::kTypeGpd>() + : Reg::signatureOfT<Reg::kTypeGpq>()); + break; + } + + if (Type::isVec(srcTypeId)) { + // 64-bit move. + instId = avxEnabled ? Inst::kIdVmovq : Inst::kIdMovq; + if (srcSize == 8) + break; + + // 32-bit move. + instId = avxEnabled ? Inst::kIdVmovd : Inst::kIdMovd; + dst.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + break; + } + } + + if (Type::isMmx(dstTypeId)) { + instId = Inst::kIdMovq; + srcSize = Support::min(srcSize, dstSize); + + if (Type::isInt(srcTypeId) || src.isMem()) { + // 64-bit move. + if (srcSize == 8) + break; + + // 32-bit move. + instId = Inst::kIdMovd; + if (src.isReg()) + src.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + break; + } + + if (Type::isMmx(srcTypeId)) + break; + + // This will hurt if `avxEnabled`. + instId = Inst::kIdMovdq2q; + if (Type::isVec(srcTypeId)) +break; + } + + if (Type::isMask(dstTypeId)) { + srcSize = Support::min(srcSize, dstSize); + + if (Type::isInt(srcTypeId) || Type::isMask(srcTypeId) || src.isMem()) { + instId = x86KmovFromSize(srcSize); + if (Reg::isGp(src) && srcSize <= 4) + src.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + break; + } + } + + if (Type::isVec(dstTypeId)) { + // By default set destination to XMM, will be set to YMM|ZMM if needed. + dst.setSignature(Reg::signatureOfT<Reg::kTypeXmm>()); + + // This will hurt if `avxEnabled`. + if (Reg::isMm(src)) { + // 64-bit move. + instId = Inst::kIdMovq2dq; + break; + } + + // Argument conversion. + uint32_t dstElement = Type::baseOf(dstTypeId); + uint32_t srcElement = Type::baseOf(srcTypeId); + + if (dstElement == Type::kIdF32 && srcElement == Type::kIdF64) { + srcSize = Support::min(dstSize * 2, srcSize); + dstSize = srcSize / 2; + + if (srcSize <= 8) + instId = avxEnabled ? Inst::kIdVcvtss2sd : Inst::kIdCvtss2sd; + else + instId = avxEnabled ? Inst::kIdVcvtps2pd : Inst::kIdCvtps2pd; + + if (dstSize == 32) + dst.setSignature(Reg::signatureOfT<Reg::kTypeYmm>()); + if (src.isReg()) + src.setSignature(Reg::signatureOfVecBySize(srcSize)); + break; + } + + if (dstElement == Type::kIdF64 && srcElement == Type::kIdF32) { + srcSize = Support::min(dstSize, srcSize * 2) / 2; + dstSize = srcSize * 2; + + if (srcSize <= 4) + instId = avxEnabled ? Inst::kIdVcvtsd2ss : Inst::kIdCvtsd2ss; + else + instId = avxEnabled ? Inst::kIdVcvtpd2ps : Inst::kIdCvtpd2ps; + + dst.setSignature(Reg::signatureOfVecBySize(dstSize)); + if (src.isReg() && srcSize >= 32) + src.setSignature(Reg::signatureOfT<Reg::kTypeYmm>()); + break; + } + + srcSize = Support::min(srcSize, dstSize); + if (Reg::isGp(src) || src.isMem()) { + // 32-bit move. + if (srcSize <= 4) { + instId = avxEnabled ? Inst::kIdVmovd : Inst::kIdMovd; + if (src.isReg()) + src.setSignature(Reg::signatureOfT<Reg::kTypeGpd>()); + break; + } + + // 64-bit move. + if (srcSize == 8) { + instId = avxEnabled ? Inst::kIdVmovq : Inst::kIdMovq; + break; + } + } + + if (Reg::isVec(src) || src.isMem()) { + instId = avxEnabled ? Inst::kIdVmovaps : Inst::kIdMovaps; + + if (src.isMem() && srcSize < emitter->environment().stackAlignment()) + instId = avxEnabled ? 
Inst::kIdVmovups : Inst::kIdMovups; + + uint32_t signature = Reg::signatureOfVecBySize(srcSize); + dst.setSignature(signature); + if (src.isReg()) + src.setSignature(signature); + break; + } + } + + return DebugUtils::errored(kErrorInvalidState); + } + + if (src.isMem()) + src.as<Mem>().setSize(srcSize); + + emitter->setInlineComment(comment); + return emitter->emit(instId, dst, src); +} + +// ============================================================================ +// [asmjit::X86Internal - Emit Prolog & Epilog] +// ============================================================================ + +static ASMJIT_INLINE void X86Internal_setupSaveRestoreInfo(uint32_t group, const FuncFrame& frame, Reg& xReg, uint32_t& xInst, uint32_t& xSize) noexcept { + switch (group) { + case Reg::kGroupVec: + xReg = xmm(0); + xInst = x86GetXmmMovInst(frame); + xSize = xReg.size(); + break; + case Reg::kGroupMm: + xReg = mm(0); + xInst = Inst::kIdMovq; + xSize = xReg.size(); + break; + case Reg::kGroupKReg: + xReg = k(0); + xInst = Inst::kIdKmovq; + xSize = xReg.size(); + break; + } +} + +ASMJIT_FAVOR_SIZE Error X86Internal::emitProlog(Emitter* emitter, const FuncFrame& frame) { + uint32_t gpSaved = frame.savedRegs(Reg::kGroupGp); + + Gp zsp = emitter->zsp(); // ESP|RSP register. + Gp zbp = emitter->zbp(); // EBP|RBP register. + Gp gpReg = zsp; // General purpose register (temporary). + Gp saReg = zsp; // Stack-arguments base pointer. + + // Emit: 'push zbp' + // 'mov zbp, zsp'. + if (frame.hasPreservedFP()) { + gpSaved &= ~Support::bitMask(Gp::kIdBp); + ASMJIT_PROPAGATE(emitter->push(zbp)); + ASMJIT_PROPAGATE(emitter->mov(zbp, zsp)); + } + + // Emit: 'push gp' sequence. + { + Support::BitWordIterator<uint32_t> it(gpSaved); + while (it.hasNext()) { + gpReg.setId(it.next()); + ASMJIT_PROPAGATE(emitter->push(gpReg)); + } + } + + // Emit: 'mov saReg, zsp'. + uint32_t saRegId = frame.saRegId(); + if (saRegId != BaseReg::kIdBad && saRegId != Gp::kIdSp) { + saReg.setId(saRegId); + if (frame.hasPreservedFP()) { + if (saRegId != Gp::kIdBp) + ASMJIT_PROPAGATE(emitter->mov(saReg, zbp)); + } + else { + ASMJIT_PROPAGATE(emitter->mov(saReg, zsp)); + } + } + + // Emit: 'and zsp, StackAlignment'. + if (frame.hasDynamicAlignment()) { + ASMJIT_PROPAGATE(emitter->and_(zsp, -int32_t(frame.finalStackAlignment()))); + } + + // Emit: 'sub zsp, StackAdjustment'. + if (frame.hasStackAdjustment()) { + ASMJIT_PROPAGATE(emitter->sub(zsp, frame.stackAdjustment())); + } + + // Emit: 'mov [zsp + DAOffset], saReg'. + if (frame.hasDynamicAlignment() && frame.hasDAOffset()) { + Mem saMem = ptr(zsp, int32_t(frame.daOffset())); + ASMJIT_PROPAGATE(emitter->mov(saMem, saReg)); + } + + // Emit 'movxxx [zsp + X], {[x|y|z]mm, k}'. + { + Reg xReg; + Mem xBase = ptr(zsp, int32_t(frame.nonGpSaveOffset())); + + uint32_t xInst; + uint32_t xSize; + + for (uint32_t group = 1; group < BaseReg::kGroupVirt; group++) { + Support::BitWordIterator<uint32_t> it(frame.savedRegs(group)); + if (it.hasNext()) { + X86Internal_setupSaveRestoreInfo(group, frame, xReg, xInst, xSize); + do { + xReg.setId(it.next()); + ASMJIT_PROPAGATE(emitter->emit(xInst, xBase, xReg)); + xBase.addOffsetLo32(int32_t(xSize)); + } while (it.hasNext()); + } + } + } + + return kErrorOk; +} + +ASMJIT_FAVOR_SIZE Error X86Internal::emitEpilog(Emitter* emitter, const FuncFrame& frame) { + uint32_t i; + uint32_t regId; + + uint32_t registerSize = emitter->registerSize(); + uint32_t gpSaved = frame.savedRegs(Reg::kGroupGp); + + Gp zsp = emitter->zsp(); // ESP|RSP register. 
+ Gp zbp = emitter->zbp(); // EBP|RBP register. + Gp gpReg = emitter->zsp(); // General purpose register (temporary). + + // Don't emit 'pop zbp' in the pop sequence, this case is handled separately. + if (frame.hasPreservedFP()) + gpSaved &= ~Support::bitMask(Gp::kIdBp); + + // Emit 'movxxx {[x|y|z]mm, k}, [zsp + X]'. + { + Reg xReg; + Mem xBase = ptr(zsp, int32_t(frame.nonGpSaveOffset())); + + uint32_t xInst; + uint32_t xSize; + + for (uint32_t group = 1; group < BaseReg::kGroupVirt; group++) { + Support::BitWordIterator<uint32_t> it(frame.savedRegs(group)); + if (it.hasNext()) { + X86Internal_setupSaveRestoreInfo(group, frame, xReg, xInst, xSize); + do { + xReg.setId(it.next()); + ASMJIT_PROPAGATE(emitter->emit(xInst, xReg, xBase)); + xBase.addOffsetLo32(int32_t(xSize)); + } while (it.hasNext()); + } + } + } + + // Emit 'emms' and/or 'vzeroupper'. + if (frame.hasMmxCleanup()) ASMJIT_PROPAGATE(emitter->emms()); + if (frame.hasAvxCleanup()) ASMJIT_PROPAGATE(emitter->vzeroupper()); + + if (frame.hasPreservedFP()) { + // Emit 'mov zsp, zbp' or 'lea zsp, [zbp - x]' + int32_t count = int32_t(frame.gpSaveSize() - registerSize); + if (!count) + ASMJIT_PROPAGATE(emitter->mov(zsp, zbp)); + else + ASMJIT_PROPAGATE(emitter->lea(zsp, ptr(zbp, -count))); + } + else { + if (frame.hasDynamicAlignment() && frame.hasDAOffset()) { + // Emit 'mov zsp, [zsp + DsaSlot]'. + Mem saMem = ptr(zsp, int32_t(frame.daOffset())); + ASMJIT_PROPAGATE(emitter->mov(zsp, saMem)); + } + else if (frame.hasStackAdjustment()) { + // Emit 'add zsp, StackAdjustment'. + ASMJIT_PROPAGATE(emitter->add(zsp, int32_t(frame.stackAdjustment()))); + } + } + + // Emit 'pop gp' sequence. + if (gpSaved) { + i = gpSaved; + regId = 16; + + do { + regId--; + if (i & 0x8000) { + gpReg.setId(regId); + ASMJIT_PROPAGATE(emitter->pop(gpReg)); + } + i <<= 1; + } while (regId != 0); + } + + // Emit 'pop zbp'. + if (frame.hasPreservedFP()) + ASMJIT_PROPAGATE(emitter->pop(zbp)); + + // Emit 'ret' or 'ret x'. 
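+ // A non-zero cleanup count emits 'ret imm16', which pops the argument area on
+ // return as required by callee-pops conventions such as 32-bit stdcall.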
+ if (frame.hasCalleeStackCleanup()) + ASMJIT_PROPAGATE(emitter->emit(Inst::kIdRet, int(frame.calleeStackCleanup()))); + else + ASMJIT_PROPAGATE(emitter->emit(Inst::kIdRet)); + + return kErrorOk; +} + +// ============================================================================ +// [asmjit::X86Internal - Emit Arguments Assignment] +// ============================================================================ + +#ifdef ASMJIT_DUMP_ARGS_ASSIGNMENT +static void dumpFuncValue(String& sb, uint32_t arch, const FuncValue& value) noexcept { + Formatter::formatTypeId(sb, value.typeId()); + sb.append('@'); + + if (value.isIndirect()) + sb.append('['); + + if (value.isReg()) + Formatter::formatRegister(sb, 0, nullptr, arch, value.regType(), value.regId()); + else if (value.isStack()) + sb.appendFormat("[%d]", value.stackOffset()); + else + sb.append("<none>"); + + if (value.isIndirect()) + sb.append(']'); +} + +static void dumpAssignment(String& sb, const X86FuncArgsContext& ctx) noexcept { + typedef X86FuncArgsContext::Var Var; + + uint32_t arch = ctx.arch(); + uint32_t varCount = ctx.varCount(); + + for (uint32_t i = 0; i < varCount; i++) { + const Var& var = ctx.var(i); + const FuncValue& dst = var.out; + const FuncValue& cur = var.cur; + + sb.appendFormat("Var%u: ", i); + dumpFuncValue(sb, arch, dst); + sb.append(" <- "); + dumpFuncValue(sb, arch, cur); + + if (var.isDone()) + sb.append(" {Done}"); + + sb.append('\n'); + } +} +#endif + +ASMJIT_FAVOR_SIZE Error X86Internal::emitArgsAssignment(Emitter* emitter, const FuncFrame& frame, const FuncArgsAssignment& args) { + typedef X86FuncArgsContext::Var Var; + typedef X86FuncArgsContext::WorkData WorkData; + + enum WorkFlags : uint32_t { + kWorkNone = 0x00, + kWorkDidSome = 0x01, + kWorkPending = 0x02, + kWorkPostponed = 0x04 + }; + + X86FuncArgsContext ctx; + ASMJIT_PROPAGATE(ctx.initWorkData(frame, args)); + +#ifdef ASMJIT_DUMP_ARGS_ASSIGNMENT + { + String sb; + dumpAssignment(sb, ctx); + printf("%s\n", sb.data()); + } +#endif + + uint32_t arch = ctx.arch(); + uint32_t varCount = ctx._varCount; + WorkData* workData = ctx._workData; + + // Use AVX if it's enabled. + bool avxEnabled = frame.isAvxEnabled(); + + uint32_t saVarId = ctx._saVarId; + uint32_t saRegId = Gp::kIdSp; + + if (frame.hasDynamicAlignment()) { + if (frame.hasPreservedFP()) + saRegId = Gp::kIdBp; + else + saRegId = saVarId < varCount ? ctx._vars[saVarId].cur.regId() : frame.saRegId(); + } + + RegInfo gpRegInfo = emitter->_gpRegInfo; + + // -------------------------------------------------------------------------- + // Register to stack and stack to stack moves must be first as now we have + // the biggest chance of having as many as possible unassigned registers. + // -------------------------------------------------------------------------- + + if (ctx._stackDstMask) { + // Base address of all arguments passed by stack. + Mem baseArgPtr = ptr(emitter->gpz(saRegId), int32_t(frame.saOffset(saRegId))); + Mem baseStackPtr = ptr(emitter->gpz(Gp::kIdSp), int32_t(0)); + + for (uint32_t varId = 0; varId < varCount; varId++) { + Var& var = ctx._vars[varId]; + + if (!var.out.isStack()) + continue; + + FuncValue& cur = var.cur; + FuncValue& out = var.out; + + ASMJIT_ASSERT(cur.isReg() || cur.isStack()); + Reg reg; + + Mem dstStackPtr = baseStackPtr.cloneAdjusted(out.stackOffset()); + Mem srcStackPtr = baseArgPtr.cloneAdjusted(cur.stackOffset()); + + if (cur.isIndirect()) { + if (cur.isStack()) { + // TODO: Indirect stack. 
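+ // An indirect argument whose pointer itself lives on the stack would need an
+ // extra load through a temporary register, which is not implemented yet.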
+ return DebugUtils::errored(kErrorInvalidAssignment); + } + else { + srcStackPtr = ptr(Gp(gpRegInfo.signature(), cur.regId())); + } + } + + if (cur.isReg() && !cur.isIndirect()) { + WorkData& wd = workData[Reg::groupOf(cur.regType())]; + uint32_t rId = cur.regId(); + + reg.setSignatureAndId(Reg::signatureOf(cur.regType()), rId); + wd.unassign(varId, rId); + } + else { + // Stack to reg move - tricky since we move stack to stack we can decide which + // register to use. In general we follow the rule that IntToInt moves will use + // GP regs with possibility to signature or zero extend, and all other moves will + // either use GP or VEC regs depending on the size of the move. + RegInfo rInfo = x86GetRegForMemToMemMove(arch, out.typeId(), cur.typeId()); + if (ASMJIT_UNLIKELY(!rInfo.isValid())) + return DebugUtils::errored(kErrorInvalidState); + + WorkData& wd = workData[rInfo.group()]; + uint32_t availableRegs = wd.availableRegs(); + if (ASMJIT_UNLIKELY(!availableRegs)) + return DebugUtils::errored(kErrorInvalidState); + + uint32_t rId = Support::ctz(availableRegs); + reg.setSignatureAndId(rInfo.signature(), rId); + + ASMJIT_PROPAGATE(emitArgMove(emitter, reg, out.typeId(), srcStackPtr, cur.typeId(), avxEnabled)); + } + + if (cur.isIndirect() && cur.isReg()) + workData[BaseReg::kGroupGp].unassign(varId, cur.regId()); + + // Register to stack move. + ASMJIT_PROPAGATE(emitRegMove(emitter, dstStackPtr, reg, cur.typeId(), avxEnabled)); + var.markDone(); + } + } + + // -------------------------------------------------------------------------- + // Shuffle all registers that are currently assigned accordingly to target + // assignment. + // -------------------------------------------------------------------------- + + uint32_t workFlags = kWorkNone; + for (;;) { + for (uint32_t varId = 0; varId < varCount; varId++) { + Var& var = ctx._vars[varId]; + if (var.isDone() || !var.cur.isReg()) + continue; + + FuncValue& cur = var.cur; + FuncValue& out = var.out; + + uint32_t curGroup = Reg::groupOf(cur.regType()); + uint32_t outGroup = Reg::groupOf(out.regType()); + + uint32_t curId = cur.regId(); + uint32_t outId = out.regId(); + + if (curGroup != outGroup) { + // TODO: Conversion is not supported. + return DebugUtils::errored(kErrorInvalidAssignment); + } + else { + WorkData& wd = workData[outGroup]; + if (!wd.isAssigned(outId)) { +EmitMove: + ASMJIT_PROPAGATE( + emitArgMove(emitter, + Reg::fromTypeAndId(out.regType(), outId), out.typeId(), + Reg::fromTypeAndId(cur.regType(), curId), cur.typeId(), avxEnabled)); + + wd.reassign(varId, outId, curId); + cur.initReg(out.regType(), outId, out.typeId()); + + if (outId == out.regId()) + var.markDone(); + workFlags |= kWorkDidSome | kWorkPending; + } + else { + uint32_t altId = wd._physToVarId[outId]; + Var& altVar = ctx._vars[altId]; + + if (!altVar.out.isInitialized() || (altVar.out.isReg() && altVar.out.regId() == curId)) { + // Swap operation is possible only between two GP registers. + if (curGroup == Reg::kGroupGp) { + uint32_t highestType = Support::max(cur.regType(), altVar.cur.regType()); + uint32_t signature = highestType == Reg::kTypeGpq ? 
Reg::signatureOfT<Reg::kTypeGpq>() + : Reg::signatureOfT<Reg::kTypeGpd>(); + + ASMJIT_PROPAGATE(emitter->emit(Inst::kIdXchg, Reg(signature, outId), Reg(signature, curId))); + wd.swap(varId, curId, altId, outId); + cur.setRegId(outId); + var.markDone(); + altVar.cur.setRegId(curId); + + if (altVar.out.isInitialized()) + altVar.markDone(); + workFlags |= kWorkDidSome; + } + else { + // If there is a scratch register it can be used to perform the swap. + uint32_t availableRegs = wd.availableRegs(); + if (availableRegs) { + uint32_t inOutRegs = wd.dstRegs(); + if (availableRegs & ~inOutRegs) + availableRegs &= ~inOutRegs; + outId = Support::ctz(availableRegs); + goto EmitMove; + } + else { + workFlags |= kWorkPending; + } + } + } + else { + workFlags |= kWorkPending; + } + } + } + } + + if (!(workFlags & kWorkPending)) + break; + + // If we did nothing twice it means that something is really broken. + if ((workFlags & (kWorkDidSome | kWorkPostponed)) == kWorkPostponed) + return DebugUtils::errored(kErrorInvalidState); + + workFlags = (workFlags & kWorkDidSome) ? kWorkNone : kWorkPostponed; + } + + // -------------------------------------------------------------------------- + // Load arguments passed by stack into registers. This is pretty simple and + // it never requires multiple iterations like the previous phase. + // -------------------------------------------------------------------------- + + if (ctx._hasStackSrc) { + uint32_t iterCount = 1; + if (frame.hasDynamicAlignment() && !frame.hasPreservedFP()) + saRegId = saVarId < varCount ? ctx._vars[saVarId].cur.regId() : frame.saRegId(); + + // Base address of all arguments passed by stack. + Mem baseArgPtr = ptr(emitter->gpz(saRegId), int32_t(frame.saOffset(saRegId))); + + for (uint32_t iter = 0; iter < iterCount; iter++) { + for (uint32_t varId = 0; varId < varCount; varId++) { + Var& var = ctx._vars[varId]; + if (var.isDone()) + continue; + + if (var.cur.isStack()) { + ASMJIT_ASSERT(var.out.isReg()); + + uint32_t outId = var.out.regId(); + uint32_t outType = var.out.regType(); + + uint32_t group = Reg::groupOf(outType); + WorkData& wd = ctx._workData[group]; + + if (outId == saRegId && group == BaseReg::kGroupGp) { + // This register will be processed last as we still need `saRegId`. + if (iterCount == 1) { + iterCount++; + continue; + } + wd.unassign(wd._physToVarId[outId], outId); + } + + Reg dstReg = Reg::fromTypeAndId(outType, outId); + Mem srcMem = baseArgPtr.cloneAdjusted(var.cur.stackOffset()); + + ASMJIT_PROPAGATE( + emitArgMove(emitter, + dstReg, var.out.typeId(), + srcMem, var.cur.typeId(), avxEnabled)); + + wd.assign(varId, outId); + var.cur.initReg(outType, outId, var.cur.typeId(), FuncValue::kFlagIsDone); + } + } + } + } + + return kErrorOk; +} + +ASMJIT_END_SUB_NAMESPACE + +#endif // ASMJIT_BUILD_X86 |
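These internals back asmjit's public function API rather than being called directly. A minimal sketch of the typical flow follows, assuming the func-api of this asmjit vintage (names such as FuncSignatureT and CallConv::kIdHost come from the public headers of this era and init() signatures have shifted between releases): FuncDetail::init() drives initFuncDetail(), FuncFrame::finalize() drives finalizeFuncFrame(), and the emitter's emitProlog()/emitEpilog() land in the functions above.

    #include <asmjit/x86.h>
    using namespace asmjit;

    // Wrap a body for `int f(int, int)` in a prolog/epilog computed by FuncFrame.
    Error emitWrapped(x86::Assembler& a) {
      FuncDetail func;
      ASMJIT_PROPAGATE(func.init(FuncSignatureT<int, int, int>(CallConv::kIdHost),
                                 a.environment()));

      FuncFrame frame;
      ASMJIT_PROPAGATE(frame.init(func));     // Copies convention properties (initFuncFrame).
      frame.addDirtyRegs(x86::eax, x86::ecx); // Registers the body will clobber.
      ASMJIT_PROPAGATE(frame.finalize());     // Computes offsets and stack adjustment.

      ASMJIT_PROPAGATE(a.emitProlog(frame));  // push/mov/and/sub sequence from above.
      // ... function body ...
      ASMJIT_PROPAGATE(a.emitEpilog(frame));  // Restores registers and emits 'ret'.
      return kErrorOk;
    }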