Diffstat (limited to 'client/asmjit/core/ralocal.cpp')
-rw-r--r--   client/asmjit/core/ralocal.cpp | 1039
1 file changed, 1039 insertions, 0 deletions
diff --git a/client/asmjit/core/ralocal.cpp b/client/asmjit/core/ralocal.cpp new file mode 100644 index 0000000..e3a8a97 --- /dev/null +++ b/client/asmjit/core/ralocal.cpp @@ -0,0 +1,1039 @@ +// AsmJit - Machine code generation for C++ +// +// * Official AsmJit Home Page: https://asmjit.com +// * Official Github Repository: https://github.com/asmjit/asmjit +// +// Copyright (c) 2008-2020 The AsmJit Authors +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgment in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#include "../core/api-build_p.h" +#ifndef ASMJIT_NO_COMPILER + +#include "../core/ralocal_p.h" +#include "../core/support.h" + +ASMJIT_BEGIN_NAMESPACE + +// ============================================================================ +// [asmjit::RALocalAllocator - Utilities] +// ============================================================================ + +static ASMJIT_INLINE RATiedReg* RALocal_findTiedRegByWorkId(RATiedReg* tiedRegs, size_t count, uint32_t workId) noexcept { + for (size_t i = 0; i < count; i++) + if (tiedRegs[i].workId() == workId) + return &tiedRegs[i]; + return nullptr; +} + +// ============================================================================ +// [asmjit::RALocalAllocator - Init / Reset] +// ============================================================================ + +Error RALocalAllocator::init() noexcept { + PhysToWorkMap* physToWorkMap; + WorkToPhysMap* workToPhysMap; + + physToWorkMap = _pass->newPhysToWorkMap(); + workToPhysMap = _pass->newWorkToPhysMap(); + if (!physToWorkMap || !workToPhysMap) + return DebugUtils::errored(kErrorOutOfMemory); + + _curAssignment.initLayout(_pass->_physRegCount, _pass->workRegs()); + _curAssignment.initMaps(physToWorkMap, workToPhysMap); + + physToWorkMap = _pass->newPhysToWorkMap(); + workToPhysMap = _pass->newWorkToPhysMap(); + if (!physToWorkMap || !workToPhysMap) + return DebugUtils::errored(kErrorOutOfMemory); + + _tmpAssignment.initLayout(_pass->_physRegCount, _pass->workRegs()); + _tmpAssignment.initMaps(physToWorkMap, workToPhysMap); + + return kErrorOk; +} + +// ============================================================================ +// [asmjit::RALocalAllocator - Assignment] +// ============================================================================ + +Error RALocalAllocator::makeInitialAssignment() noexcept { + FuncNode* func = _pass->func(); + RABlock* entry = _pass->entryBlock(); + + ZoneBitVector& liveIn = entry->liveIn(); + uint32_t argCount = func->argCount(); + uint32_t numIter = 1; + + for (uint32_t iter = 0; iter < numIter; iter++) { + for (uint32_t i = 0; i < argCount; i++) { + // Unassigned argument. + VirtReg* virtReg = func->arg(i); + if (!virtReg) continue; + + // Unreferenced argument. 
+      RAWorkReg* workReg = virtReg->workReg();
+      if (!workReg) continue;
+
+      // Overwritten argument.
+      uint32_t workId = workReg->workId();
+      if (!liveIn.bitAt(workId))
+        continue;
+
+      uint32_t group = workReg->group();
+      if (_curAssignment.workToPhysId(group, workId) != RAAssignment::kPhysNone)
+        continue;
+
+      uint32_t allocableRegs = _availableRegs[group] & ~_curAssignment.assigned(group);
+      if (iter == 0) {
+        // First iteration: Try to allocate to home RegId.
+        if (workReg->hasHomeRegId()) {
+          uint32_t physId = workReg->homeRegId();
+          if (Support::bitTest(allocableRegs, physId)) {
+            _curAssignment.assign(group, workId, physId, true);
+            _pass->_argsAssignment.assignReg(i, workReg->info().type(), physId, workReg->typeId());
+            continue;
+          }
+        }
+
+        numIter = 2;
+      }
+      else {
+        // Second iteration: Pick any other register if there is an unassigned one, or assign to stack.
+        if (allocableRegs) {
+          uint32_t physId = Support::ctz(allocableRegs);
+          _curAssignment.assign(group, workId, physId, true);
+          _pass->_argsAssignment.assignReg(i, workReg->info().type(), physId, workReg->typeId());
+        }
+        else {
+          // This register will definitely need a stack slot, so create it now and also assign
+          // `argIndex` to it. We will patch `_argsAssignment` later after RAStackAllocator finishes.
+          RAStackSlot* slot = _pass->getOrCreateStackSlot(workReg);
+          if (ASMJIT_UNLIKELY(!slot))
+            return DebugUtils::errored(kErrorOutOfMemory);
+
+          // This means STACK_ARG may be moved to STACK.
+          workReg->addFlags(RAWorkReg::kFlagStackArgToStack);
+          _pass->_numStackArgsToStackSlots++;
+        }
+      }
+    }
+  }
+
+  return kErrorOk;
+}
+
+Error RALocalAllocator::replaceAssignment(
+  const PhysToWorkMap* physToWorkMap,
+  const WorkToPhysMap* workToPhysMap) noexcept {
+
+  _curAssignment.copyFrom(physToWorkMap, workToPhysMap);
+  return kErrorOk;
+}
+
+Error RALocalAllocator::switchToAssignment(
+  PhysToWorkMap* dstPhysToWorkMap,
+  WorkToPhysMap* dstWorkToPhysMap,
+  const ZoneBitVector& liveIn,
+  bool dstReadOnly,
+  bool tryMode) noexcept {
+
+  RAAssignment dst;
+  RAAssignment& cur = _curAssignment;
+
+  dst.initLayout(_pass->_physRegCount, _pass->workRegs());
+  dst.initMaps(dstPhysToWorkMap, dstWorkToPhysMap);
+
+  if (tryMode)
+    return kErrorOk;
+
+  for (uint32_t group = 0; group < BaseReg::kGroupVirt; group++) {
+    // ------------------------------------------------------------------------
+    // STEP 1:
+    //   - KILL all registers that are not live at `dst`,
+    //   - SPILL all registers that are not assigned at `dst`.
+    // ------------------------------------------------------------------------
+
+    if (!tryMode) {
+      Support::BitWordIterator<uint32_t> it(cur.assigned(group));
+      while (it.hasNext()) {
+        uint32_t physId = it.next();
+        uint32_t workId = cur.physToWorkId(group, physId);
+
+        // Must be true as we iterate over assigned registers.
+        ASMJIT_ASSERT(workId != RAAssignment::kWorkNone);
+
+        // KILL if it's not live on entry.
+        if (!liveIn.bitAt(workId)) {
+          onKillReg(group, workId, physId);
+          continue;
+        }
+
+        // SPILL if it's not assigned on entry.
+        uint32_t altId = dst.workToPhysId(group, workId);
+        if (altId == RAAssignment::kPhysNone) {
+          ASMJIT_PROPAGATE(onSpillReg(group, workId, physId));
+        }
+      }
+    }
+
+    // ------------------------------------------------------------------------
+    // STEP 2:
+    //   - MOVE and SWAP registers from their current assignments into their
+    //     DST assignments.
+    //   - Build `willLoadRegs` mask of registers scheduled for `onLoadReg()`.
+    // ------------------------------------------------------------------------
+
+    // Current run-id (1 means more aggressive decisions).
+    int32_t runId = -1;
+    // Remaining registers scheduled for `onLoadReg()`.
+    uint32_t willLoadRegs = 0;
+    // Remaining registers to be allocated in this loop.
+    uint32_t affectedRegs = dst.assigned(group);
+
+    while (affectedRegs) {
+      if (++runId == 2) {
+        if (!tryMode)
+          return DebugUtils::errored(kErrorInvalidState);
+
+        // Stop in `tryMode` if we haven't done anything in the past two rounds.
+        break;
+      }
+
+      Support::BitWordIterator<uint32_t> it(affectedRegs);
+      while (it.hasNext()) {
+        uint32_t physId = it.next();
+        uint32_t physMask = Support::bitMask(physId);
+
+        uint32_t curWorkId = cur.physToWorkId(group, physId);
+        uint32_t dstWorkId = dst.physToWorkId(group, physId);
+
+        // The register must have assigned `dstWorkId` as we only iterate over assigned regs.
+        ASMJIT_ASSERT(dstWorkId != RAAssignment::kWorkNone);
+
+        if (curWorkId != RAAssignment::kWorkNone) {
+          // Both assigned.
+          if (curWorkId != dstWorkId) {
+            // Wait a bit if this is the first run; we may avoid the move entirely if `curWorkId` moves out on its own.
+            if (runId <= 0)
+              continue;
+
+            uint32_t altPhysId = cur.workToPhysId(group, dstWorkId);
+            if (altPhysId == RAAssignment::kPhysNone)
+              continue;
+
+            // Reset as we will do some changes to the current assignment.
+            runId = -1;
+
+            if (_archTraits.hasSwap(group)) {
+              ASMJIT_PROPAGATE(onSwapReg(group, curWorkId, physId, dstWorkId, altPhysId));
+            }
+            else {
+              // SPILL the reg if it's not dirty in DST, otherwise try to MOVE.
+              if (!cur.isPhysDirty(group, physId)) {
+                ASMJIT_PROPAGATE(onKillReg(group, curWorkId, physId));
+              }
+              else {
+                uint32_t allocableRegs = _pass->_availableRegs[group] & ~cur.assigned(group);
+
+                // If possible don't conflict with assigned regs at DST.
+                if (allocableRegs & ~dst.assigned(group))
+                  allocableRegs &= ~dst.assigned(group);
+
+                if (allocableRegs) {
+                  // MOVE is possible, thus preferred.
+                  uint32_t tmpPhysId = Support::ctz(allocableRegs);
+
+                  ASMJIT_PROPAGATE(onMoveReg(group, curWorkId, tmpPhysId, physId));
+                  _pass->_clobberedRegs[group] |= Support::bitMask(tmpPhysId);
+                }
+                else {
+                  // MOVE is impossible, must SPILL.
+                  ASMJIT_PROPAGATE(onSpillReg(group, curWorkId, physId));
+                }
+              }
+
+              goto Cleared;
+            }
+          }
+        }
+        else {
+Cleared:
+          // DST assigned, CUR unassigned.
+          uint32_t altPhysId = cur.workToPhysId(group, dstWorkId);
+          if (altPhysId == RAAssignment::kPhysNone) {
+            if (liveIn.bitAt(dstWorkId))
+              willLoadRegs |= physMask; // Scheduled for `onLoadReg()`.
+            affectedRegs &= ~physMask;  // Unaffected from now on.
+            continue;
+          }
+          ASMJIT_PROPAGATE(onMoveReg(group, dstWorkId, physId, altPhysId));
+        }
+
+        // Both DST and CUR assigned to the same reg or CUR just moved to DST.
+        if ((dst.dirty(group) & physMask) != (cur.dirty(group) & physMask)) {
+          if ((dst.dirty(group) & physMask) == 0) {
+            // CUR dirty, DST not dirty (the assert is just to visualize the condition).
+            ASMJIT_ASSERT(!dst.isPhysDirty(group, physId) && cur.isPhysDirty(group, physId));
+
+            // If `dstReadOnly` is true it means that the block was already
+            // processed and we cannot change from CLEAN to DIRTY. In that case
+            // the register has to be saved as it cannot enter the block DIRTY.
+            if (dstReadOnly)
+              ASMJIT_PROPAGATE(onSaveReg(group, dstWorkId, physId));
+            else
+              dst.makeDirty(group, dstWorkId, physId);
+          }
+          else {
+            // DST dirty, CUR not dirty (the assert is just to visualize the condition).
+            ASMJIT_ASSERT(dst.isPhysDirty(group, physId) && !cur.isPhysDirty(group, physId));
+
+            cur.makeDirty(group, dstWorkId, physId);
+          }
+        }
+
+        // Must match now...
+        ASMJIT_ASSERT(dst.physToWorkId(group, physId) == cur.physToWorkId(group, physId));
+        ASMJIT_ASSERT(dst.isPhysDirty(group, physId) == cur.isPhysDirty(group, physId));
+
+        runId = -1;
+        affectedRegs &= ~physMask;
+      }
+    }
+
+    // ------------------------------------------------------------------------
+    // STEP 3:
+    //   - Load registers specified by `willLoadRegs`.
+    // ------------------------------------------------------------------------
+
+    {
+      Support::BitWordIterator<uint32_t> it(willLoadRegs);
+      while (it.hasNext()) {
+        uint32_t physId = it.next();
+
+        if (!cur.isPhysAssigned(group, physId)) {
+          uint32_t workId = dst.physToWorkId(group, physId);
+
+          // The algorithm is broken if it tries to load a register that is not in LIVE-IN.
+          ASMJIT_ASSERT(liveIn.bitAt(workId) == true);
+
+          ASMJIT_PROPAGATE(onLoadReg(group, workId, physId));
+          if (dst.isPhysDirty(group, physId))
+            cur.makeDirty(group, workId, physId);
+          ASMJIT_ASSERT(dst.isPhysDirty(group, physId) == cur.isPhysDirty(group, physId));
+        }
+        else {
+          // Not possible otherwise.
+          ASMJIT_ASSERT(tryMode == true);
+        }
+      }
+    }
+  }
+
+  if (!tryMode) {
+    // Here is code that dumps the conflicting part if something fails here:
+    // if (!dst.equals(cur)) {
+    //   uint32_t physTotal = dst._layout.physTotal;
+    //   uint32_t workCount = dst._layout.workCount;
+    //
+    //   for (uint32_t physId = 0; physId < physTotal; physId++) {
+    //     uint32_t dstWorkId = dst._physToWorkMap->workIds[physId];
+    //     uint32_t curWorkId = cur._physToWorkMap->workIds[physId];
+    //     if (dstWorkId != curWorkId)
+    //       fprintf(stderr, "[PhysIdWork] PhysId=%u WorkId[DST(%u) != CUR(%u)]\n", physId, dstWorkId, curWorkId);
+    //   }
+    //
+    //   for (uint32_t workId = 0; workId < workCount; workId++) {
+    //     uint32_t dstPhysId = dst._workToPhysMap->physIds[workId];
+    //     uint32_t curPhysId = cur._workToPhysMap->physIds[workId];
+    //     if (dstPhysId != curPhysId)
+    //       fprintf(stderr, "[WorkToPhys] WorkId=%u PhysId[DST(%u) != CUR(%u)]\n", workId, dstPhysId, curPhysId);
+    //   }
+    // }
+    ASMJIT_ASSERT(dst.equals(cur));
+  }
+
+  return kErrorOk;
+}
+
+Error RALocalAllocator::spillScratchGpRegsBeforeEntry(uint32_t scratchRegs) noexcept {
+  uint32_t group = BaseReg::kGroupGp;
+  Support::BitWordIterator<uint32_t> it(scratchRegs);
+
+  while (it.hasNext()) {
+    uint32_t physId = it.next();
+    if (_curAssignment.isPhysAssigned(group, physId)) {
+      uint32_t workId = _curAssignment.physToWorkId(group, physId);
+      ASMJIT_PROPAGATE(onSpillReg(group, workId, physId));
+    }
+  }
+
+  return kErrorOk;
+}
+
+// ============================================================================
+// [asmjit::RALocalAllocator - Allocation]
+// ============================================================================
+
+Error RALocalAllocator::allocInst(InstNode* node) noexcept {
+  RAInst* raInst = node->passData<RAInst>();
+
+  RATiedReg* outTiedRegs[Globals::kMaxPhysRegs];
+  RATiedReg* dupTiedRegs[Globals::kMaxPhysRegs];
+
+  // The cursor must point to the previous instruction for a possible instruction insertion.
+  _cc->_setCursor(node->prev());
+
+  _node = node;
+  _raInst = raInst;
+  _tiedTotal = raInst->_tiedTotal;
+  _tiedCount = raInst->_tiedCount;
+
+  // Whether we have already replaced a register operand with a memory operand.
+ bool rmAllocated = false; + + for (uint32_t group = 0; group < BaseReg::kGroupVirt; group++) { + uint32_t i, count = this->tiedCount(group); + RATiedReg* tiedRegs = this->tiedRegs(group); + + uint32_t willUse = _raInst->_usedRegs[group]; + uint32_t willOut = _raInst->_clobberedRegs[group]; + uint32_t willFree = 0; + uint32_t usePending = count; + + uint32_t outTiedCount = 0; + uint32_t dupTiedCount = 0; + + // ------------------------------------------------------------------------ + // STEP 1: + // + // Calculate `willUse` and `willFree` masks based on tied registers we have. + // + // We don't do any assignment decisions at this stage as we just need to + // collect some information first. Then, after we populate all masks needed + // we can finally make some decisions in the second loop. The main reason + // for this is that we really need `willFree` to make assignment decisions + // for `willUse`, because if we mark some registers that will be freed, we + // can consider them in decision making afterwards. + // ------------------------------------------------------------------------ + + for (i = 0; i < count; i++) { + RATiedReg* tiedReg = &tiedRegs[i]; + + // Add OUT and KILL to `outPending` for CLOBBERing and/or OUT assignment. + if (tiedReg->isOutOrKill()) + outTiedRegs[outTiedCount++] = tiedReg; + + if (tiedReg->isDuplicate()) + dupTiedRegs[dupTiedCount++] = tiedReg; + + if (!tiedReg->isUse()) { + tiedReg->markUseDone(); + usePending--; + continue; + } + + uint32_t workId = tiedReg->workId(); + uint32_t assignedId = _curAssignment.workToPhysId(group, workId); + + if (tiedReg->hasUseId()) { + // If the register has `useId` it means it can only be allocated in that register. + uint32_t useMask = Support::bitMask(tiedReg->useId()); + + // RAInstBuilder must have collected `usedRegs` on-the-fly. + ASMJIT_ASSERT((willUse & useMask) != 0); + + if (assignedId == tiedReg->useId()) { + // If the register is already allocated in this one, mark it done and continue. + tiedReg->markUseDone(); + if (tiedReg->isWrite()) + _curAssignment.makeDirty(group, workId, assignedId); + usePending--; + willUse |= useMask; + } + else { + willFree |= useMask & _curAssignment.assigned(group); + } + } + else { + // Check if the register must be moved to `allocableRegs`. + uint32_t allocableRegs = tiedReg->allocableRegs(); + if (assignedId != RAAssignment::kPhysNone) { + uint32_t assignedMask = Support::bitMask(assignedId); + if ((allocableRegs & ~willUse) & assignedMask) { + tiedReg->setUseId(assignedId); + tiedReg->markUseDone(); + if (tiedReg->isWrite()) + _curAssignment.makeDirty(group, workId, assignedId); + usePending--; + willUse |= assignedMask; + } + else { + willFree |= assignedMask; + } + } + } + } + + // ------------------------------------------------------------------------ + // STEP 2: + // + // Do some decision making to find the best candidates of registers that + // need to be assigned, moved, and/or spilled. Only USE registers are + // considered here, OUT will be decided later after all CLOBBERed and OUT + // registers are unassigned. + // ------------------------------------------------------------------------ + + if (usePending) { + // TODO: Not sure `liveRegs` should be used, maybe willUse and willFree would be enough and much more clear. + + // All registers that are currently alive without registers that will be freed. 
+ uint32_t liveRegs = _curAssignment.assigned(group) & ~willFree; + + for (i = 0; i < count; i++) { + RATiedReg* tiedReg = &tiedRegs[i]; + if (tiedReg->isUseDone()) continue; + + uint32_t workId = tiedReg->workId(); + uint32_t assignedId = _curAssignment.workToPhysId(group, workId); + + // REG/MEM: Patch register operand to memory operand if not allocated. + if (!rmAllocated && tiedReg->hasUseRM()) { + if (assignedId == RAAssignment::kPhysNone && Support::isPowerOf2(tiedReg->useRewriteMask())) { + RAWorkReg* workReg = workRegById(tiedReg->workId()); + uint32_t opIndex = Support::ctz(tiedReg->useRewriteMask()) / uint32_t(sizeof(Operand) / sizeof(uint32_t)); + uint32_t rmSize = tiedReg->rmSize(); + + if (rmSize <= workReg->virtReg()->virtSize()) { + Operand& op = node->operands()[opIndex]; + op = _pass->workRegAsMem(workReg); + op.as<BaseMem>().setSize(rmSize); + tiedReg->_useRewriteMask = 0; + + tiedReg->markUseDone(); + usePending--; + + rmAllocated = true; + continue; + } + } + } + + if (!tiedReg->hasUseId()) { + uint32_t allocableRegs = tiedReg->allocableRegs() & ~(willFree | willUse); + + // DECIDE where to assign the USE register. + uint32_t useId = decideOnAssignment(group, workId, assignedId, allocableRegs); + uint32_t useMask = Support::bitMask(useId); + + willUse |= useMask; + willFree |= useMask & liveRegs; + tiedReg->setUseId(useId); + + if (assignedId != RAAssignment::kPhysNone) { + uint32_t assignedMask = Support::bitMask(assignedId); + + willFree |= assignedMask; + liveRegs &= ~assignedMask; + + // OPTIMIZATION: Assign the USE register here if it's possible. + if (!(liveRegs & useMask)) { + ASMJIT_PROPAGATE(onMoveReg(group, workId, useId, assignedId)); + tiedReg->markUseDone(); + if (tiedReg->isWrite()) + _curAssignment.makeDirty(group, workId, useId); + usePending--; + } + } + else { + // OPTIMIZATION: Assign the USE register here if it's possible. + if (!(liveRegs & useMask)) { + ASMJIT_PROPAGATE(onLoadReg(group, workId, useId)); + tiedReg->markUseDone(); + if (tiedReg->isWrite()) + _curAssignment.makeDirty(group, workId, useId); + usePending--; + } + } + + liveRegs |= useMask; + } + } + } + + // Initially all used regs will be marked clobbered. + uint32_t clobberedByInst = willUse | willOut; + + // ------------------------------------------------------------------------ + // STEP 3: + // + // Free all registers that we marked as `willFree`. Only registers that are not + // USEd by the instruction are considered as we don't want to free regs we need. + // ------------------------------------------------------------------------ + + if (willFree) { + uint32_t allocableRegs = _availableRegs[group] & ~(_curAssignment.assigned(group) | willFree | willUse | willOut); + Support::BitWordIterator<uint32_t> it(willFree); + + do { + uint32_t assignedId = it.next(); + if (_curAssignment.isPhysAssigned(group, assignedId)) { + uint32_t workId = _curAssignment.physToWorkId(group, assignedId); + + // DECIDE whether to MOVE or SPILL. 
+          if (allocableRegs) {
+            uint32_t reassignedId = decideOnReassignment(group, workId, assignedId, allocableRegs);
+            if (reassignedId != RAAssignment::kPhysNone) {
+              ASMJIT_PROPAGATE(onMoveReg(group, workId, reassignedId, assignedId));
+              allocableRegs ^= Support::bitMask(reassignedId);
+              continue;
+            }
+          }
+
+          ASMJIT_PROPAGATE(onSpillReg(group, workId, assignedId));
+        }
+      } while (it.hasNext());
+    }
+
+    // ------------------------------------------------------------------------
+    // STEP 4:
+    //
+    // ALLOCATE / SHUFFLE all registers that we marked as `willUse` and weren't
+    // allocated yet. This is a bit complicated as the allocation is iterative.
+    // In some cases we have to wait before allocating a particular physical
+    // register as it's still occupied by some other one, which we need to move
+    // before we can use it. In this case we skip it and allocate some other
+    // register instead (making it free for another iteration).
+    //
+    // NOTE: Iterations are mostly important for complicated allocations like
+    // function calls, where there can be up to N registers used at once. Asm
+    // instructions won't run the loop more than once in 99.9% of cases as they
+    // use 2-3 registers on average.
+    // ------------------------------------------------------------------------
+
+    if (usePending) {
+      bool mustSwap = false;
+      do {
+        uint32_t oldPending = usePending;
+
+        for (i = 0; i < count; i++) {
+          RATiedReg* thisTiedReg = &tiedRegs[i];
+          if (thisTiedReg->isUseDone()) continue;
+
+          uint32_t thisWorkId = thisTiedReg->workId();
+          uint32_t thisPhysId = _curAssignment.workToPhysId(group, thisWorkId);
+
+          // This would be a bug, fatal one!
+          uint32_t targetPhysId = thisTiedReg->useId();
+          ASMJIT_ASSERT(targetPhysId != thisPhysId);
+
+          uint32_t targetWorkId = _curAssignment.physToWorkId(group, targetPhysId);
+          if (targetWorkId != RAAssignment::kWorkNone) {
+            RAWorkReg* targetWorkReg = workRegById(targetWorkId);
+
+            // Swapping two registers can solve two allocation tasks by emitting
+            // just a single instruction. However, swap is only available on a few
+            // architectures and it's definitely not available for every register
+            // group. Calling `onSwapReg()` before checking these would be fatal.
+            if (_archTraits.hasSwap(group) && thisPhysId != RAAssignment::kPhysNone) {
+              ASMJIT_PROPAGATE(onSwapReg(group, thisWorkId, thisPhysId, targetWorkId, targetPhysId));
+
+              thisTiedReg->markUseDone();
+              if (thisTiedReg->isWrite())
+                _curAssignment.makeDirty(group, thisWorkId, targetPhysId);
+              usePending--;
+
+              // Double-hit.
+              RATiedReg* targetTiedReg = RALocal_findTiedRegByWorkId(tiedRegs, count, targetWorkReg->workId());
+              if (targetTiedReg && targetTiedReg->useId() == thisPhysId) {
+                targetTiedReg->markUseDone();
+                if (targetTiedReg->isWrite())
+                  _curAssignment.makeDirty(group, targetWorkId, thisPhysId);
+                usePending--;
+              }
+              continue;
+            }
+
+            if (!mustSwap)
+              continue;
+
+            // We only get here if the previous iteration did nothing. This is
+            // essentially a SWAP operation without having a dedicated instruction
+            // for that purpose (vector registers, etc). The simplest way to
+            // handle such a case is to SPILL the target register.
+ ASMJIT_PROPAGATE(onSpillReg(group, targetWorkId, targetPhysId)); + } + + if (thisPhysId != RAAssignment::kPhysNone) { + ASMJIT_PROPAGATE(onMoveReg(group, thisWorkId, targetPhysId, thisPhysId)); + + thisTiedReg->markUseDone(); + if (thisTiedReg->isWrite()) + _curAssignment.makeDirty(group, thisWorkId, targetPhysId); + usePending--; + } + else { + ASMJIT_PROPAGATE(onLoadReg(group, thisWorkId, targetPhysId)); + + thisTiedReg->markUseDone(); + if (thisTiedReg->isWrite()) + _curAssignment.makeDirty(group, thisWorkId, targetPhysId); + usePending--; + } + } + + mustSwap = (oldPending == usePending); + } while (usePending); + } + + // ------------------------------------------------------------------------ + // STEP 5: + // + // KILL registers marked as KILL/OUT. + // ------------------------------------------------------------------------ + + uint32_t outPending = outTiedCount; + if (outTiedCount) { + for (i = 0; i < outTiedCount; i++) { + RATiedReg* tiedReg = outTiedRegs[i]; + + uint32_t workId = tiedReg->workId(); + uint32_t physId = _curAssignment.workToPhysId(group, workId); + + // Must check if it's allocated as KILL can be related to OUT (like KILL + // immediately after OUT, which could mean the register is not assigned). + if (physId != RAAssignment::kPhysNone) { + ASMJIT_PROPAGATE(onKillReg(group, workId, physId)); + willOut &= ~Support::bitMask(physId); + } + + // We still maintain number of pending registers for OUT assignment. + // So, if this is only KILL, not OUT, we can safely decrement it. + outPending -= !tiedReg->isOut(); + } + } + + // ------------------------------------------------------------------------ + // STEP 6: + // + // SPILL registers that will be CLOBBERed. Since OUT and KILL were + // already processed this is used mostly to handle function CALLs. + // ------------------------------------------------------------------------ + + if (willOut) { + Support::BitWordIterator<uint32_t> it(willOut); + do { + uint32_t physId = it.next(); + uint32_t workId = _curAssignment.physToWorkId(group, physId); + + if (workId == RAAssignment::kWorkNone) + continue; + + ASMJIT_PROPAGATE(onSpillReg(group, workId, physId)); + } while (it.hasNext()); + } + + // ------------------------------------------------------------------------ + // STEP 7: + // + // Duplication. + // ------------------------------------------------------------------------ + + for (i = 0; i < dupTiedCount; i++) { + RATiedReg* tiedReg = dupTiedRegs[i]; + uint32_t workId = tiedReg->workId(); + uint32_t srcId = tiedReg->useId(); + + Support::BitWordIterator<uint32_t> it(tiedReg->_allocableRegs); + while (it.hasNext()) { + uint32_t dstId = it.next(); + if (dstId == srcId) + continue; + _pass->onEmitMove(workId, dstId, srcId); + } + } + + // ------------------------------------------------------------------------ + // STEP 8: + // + // Assign OUT registers. + // ------------------------------------------------------------------------ + + if (outPending) { + // Live registers, we need a separate variable (outside of `_curAssignment) + // to hold these because of KILLed registers. If we KILL a register here it + // will go out from `_curAssignment`, but we cannot assign to it in here. + uint32_t liveRegs = _curAssignment.assigned(group); + + // Must avoid as they have been already OUTed (added during the loop). + uint32_t outRegs = 0; + + // Must avoid as they collide with already allocated ones. 
+ uint32_t avoidRegs = willUse & ~clobberedByInst; + + for (i = 0; i < outTiedCount; i++) { + RATiedReg* tiedReg = outTiedRegs[i]; + if (!tiedReg->isOut()) continue; + + uint32_t workId = tiedReg->workId(); + uint32_t assignedId = _curAssignment.workToPhysId(group, workId); + + if (assignedId != RAAssignment::kPhysNone) + ASMJIT_PROPAGATE(onKillReg(group, workId, assignedId)); + + uint32_t physId = tiedReg->outId(); + if (physId == RAAssignment::kPhysNone) { + uint32_t allocableRegs = _availableRegs[group] & ~(outRegs | avoidRegs); + + if (!(allocableRegs & ~liveRegs)) { + // There are no more registers, decide which one to spill. + uint32_t spillWorkId; + physId = decideOnSpillFor(group, workId, allocableRegs & liveRegs, &spillWorkId); + ASMJIT_PROPAGATE(onSpillReg(group, spillWorkId, physId)); + } + else { + physId = decideOnAssignment(group, workId, RAAssignment::kPhysNone, allocableRegs & ~liveRegs); + } + } + + // OUTs are CLOBBERed thus cannot be ASSIGNed right now. + ASMJIT_ASSERT(!_curAssignment.isPhysAssigned(group, physId)); + + if (!tiedReg->isKill()) + ASMJIT_PROPAGATE(onAssignReg(group, workId, physId, true)); + + tiedReg->setOutId(physId); + tiedReg->markOutDone(); + + outRegs |= Support::bitMask(physId); + liveRegs &= ~Support::bitMask(physId); + outPending--; + } + + clobberedByInst |= outRegs; + ASMJIT_ASSERT(outPending == 0); + } + + _clobberedRegs[group] |= clobberedByInst; + } + + return kErrorOk; +} + +Error RALocalAllocator::spillAfterAllocation(InstNode* node) noexcept { + // This is experimental feature that would spill registers that don't have + // home-id and are last in this basic block. This prevents saving these regs + // in other basic blocks and then restoring them (mostly relevant for loops). + RAInst* raInst = node->passData<RAInst>(); + uint32_t count = raInst->tiedCount(); + + for (uint32_t i = 0; i < count; i++) { + RATiedReg* tiedReg = raInst->tiedAt(i); + if (tiedReg->isLast()) { + uint32_t workId = tiedReg->workId(); + RAWorkReg* workReg = workRegById(workId); + if (!workReg->hasHomeRegId()) { + uint32_t group = workReg->group(); + uint32_t assignedId = _curAssignment.workToPhysId(group, workId); + if (assignedId != RAAssignment::kPhysNone) { + _cc->_setCursor(node); + ASMJIT_PROPAGATE(onSpillReg(group, workId, assignedId)); + } + } + } + } + + return kErrorOk; +} + +Error RALocalAllocator::allocBranch(InstNode* node, RABlock* target, RABlock* cont) noexcept { + // TODO: This should be used to make the branch allocation better. + DebugUtils::unused(cont); + + // The cursor must point to the previous instruction for a possible instruction insertion. + _cc->_setCursor(node->prev()); + + // Use TryMode of `switchToAssignment()` if possible. + if (target->hasEntryAssignment()) { + ASMJIT_PROPAGATE(switchToAssignment( + target->entryPhysToWorkMap(), + target->entryWorkToPhysMap(), + target->liveIn(), + target->isAllocated(), + true)); + } + + ASMJIT_PROPAGATE(allocInst(node)); + ASMJIT_PROPAGATE(spillRegsBeforeEntry(target)); + + if (target->hasEntryAssignment()) { + BaseNode* injectionPoint = _pass->extraBlock()->prev(); + BaseNode* prevCursor = _cc->setCursor(injectionPoint); + + _tmpAssignment.copyFrom(_curAssignment); + ASMJIT_PROPAGATE(switchToAssignment( + target->entryPhysToWorkMap(), + target->entryWorkToPhysMap(), + target->liveIn(), + target->isAllocated(), + false)); + + BaseNode* curCursor = _cc->cursor(); + if (curCursor != injectionPoint) { + // Additional instructions emitted to switch from the current state to + // the `target` state. 
This means that we have to move these instructions + // into an independent code block and patch the jump location. + Operand& targetOp = node->op(node->opCount() - 1); + if (ASMJIT_UNLIKELY(!targetOp.isLabel())) + return DebugUtils::errored(kErrorInvalidState); + + Label trampoline = _cc->newLabel(); + Label savedTarget = targetOp.as<Label>(); + + // Patch `target` to point to the `trampoline` we just created. + targetOp = trampoline; + + // Clear a possible SHORT form as we have no clue now if the SHORT form would + // be encodable after patching the target to `trampoline` (X86 specific). + node->clearInstOptions(BaseInst::kOptionShortForm); + + // Finalize the switch assignment sequence. + ASMJIT_PROPAGATE(_pass->onEmitJump(savedTarget)); + _cc->_setCursor(injectionPoint); + _cc->bind(trampoline); + } + + _cc->_setCursor(prevCursor); + _curAssignment.swap(_tmpAssignment); + } + else { + ASMJIT_PROPAGATE(_pass->setBlockEntryAssignment(target, block(), _curAssignment)); + } + + return kErrorOk; +} + +Error RALocalAllocator::allocJumpTable(InstNode* node, const RABlocks& targets, RABlock* cont) noexcept { + if (targets.empty()) + return DebugUtils::errored(kErrorInvalidState); + + // The cursor must point to the previous instruction for a possible instruction insertion. + _cc->_setCursor(node->prev()); + + // All `targets` should have the same sharedAssignmentId, we just read the first. + RABlock* anyTarget = targets[0]; + if (!anyTarget->hasSharedAssignmentId()) + return DebugUtils::errored(kErrorInvalidState); + + RASharedAssignment& sharedAssignment = _pass->_sharedAssignments[anyTarget->sharedAssignmentId()]; + + ASMJIT_PROPAGATE(allocInst(node)); + + if (!sharedAssignment.empty()) { + ASMJIT_PROPAGATE(switchToAssignment( + sharedAssignment.physToWorkMap(), + sharedAssignment.workToPhysMap(), + sharedAssignment.liveIn(), + true, // Read-only. + false // Try-mode. + )); + } + + ASMJIT_PROPAGATE(spillRegsBeforeEntry(anyTarget)); + + if (sharedAssignment.empty()) { + ASMJIT_PROPAGATE(_pass->setBlockEntryAssignment(anyTarget, block(), _curAssignment)); + } + + return kErrorOk; +} + +// ============================================================================ +// [asmjit::RALocalAllocator - Decision Making] +// ============================================================================ + +uint32_t RALocalAllocator::decideOnAssignment(uint32_t group, uint32_t workId, uint32_t physId, uint32_t allocableRegs) const noexcept { + ASMJIT_ASSERT(allocableRegs != 0); + DebugUtils::unused(group, physId); + + RAWorkReg* workReg = workRegById(workId); + + // Prefer home register id, if possible. + if (workReg->hasHomeRegId()) { + uint32_t homeId = workReg->homeRegId(); + if (Support::bitTest(allocableRegs, homeId)) + return homeId; + } + + // Prefer registers used upon block entries. + uint32_t previouslyAssignedRegs = workReg->allocatedMask(); + if (allocableRegs & previouslyAssignedRegs) + allocableRegs &= previouslyAssignedRegs; + + return Support::ctz(allocableRegs); +} + +uint32_t RALocalAllocator::decideOnReassignment(uint32_t group, uint32_t workId, uint32_t physId, uint32_t allocableRegs) const noexcept { + ASMJIT_ASSERT(allocableRegs != 0); + DebugUtils::unused(group, physId); + + RAWorkReg* workReg = workRegById(workId); + + // Prefer allocating back to HomeId, if possible. + if (workReg->hasHomeRegId()) { + if (Support::bitTest(allocableRegs, workReg->homeRegId())) + return workReg->homeRegId(); + } + + // TODO: [Register Allocator] This could be improved. + + // Decided to SPILL. 
+  return RAAssignment::kPhysNone;
+}
+
+uint32_t RALocalAllocator::decideOnSpillFor(uint32_t group, uint32_t workId, uint32_t spillableRegs, uint32_t* spillWorkId) const noexcept {
+  // May be used in the future to decide which register would be best to spill so `workId` can be assigned.
+  DebugUtils::unused(workId);
+  ASMJIT_ASSERT(spillableRegs != 0);
+
+  Support::BitWordIterator<uint32_t> it(spillableRegs);
+  uint32_t bestPhysId = it.next();
+  uint32_t bestWorkId = _curAssignment.physToWorkId(group, bestPhysId);
+
+  // Avoid calculating the cost model if there is only one spillable register.
+  if (it.hasNext()) {
+    uint32_t bestCost = calculateSpillCost(group, bestWorkId, bestPhysId);
+    do {
+      uint32_t localPhysId = it.next();
+      uint32_t localWorkId = _curAssignment.physToWorkId(group, localPhysId);
+      uint32_t localCost = calculateSpillCost(group, localWorkId, localPhysId);
+
+      if (localCost < bestCost) {
+        bestCost = localCost;
+        bestPhysId = localPhysId;
+        bestWorkId = localWorkId;
+      }
+    } while (it.hasNext());
+  }
+
+  *spillWorkId = bestWorkId;
+  return bestPhysId;
+}
+
+ASMJIT_END_NAMESPACE
+
+#endif // !ASMJIT_NO_COMPILER
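
For readers skimming this change: the decision helper at the bottom of the file, decideOnSpillFor(), is essentially a minimum-cost scan over a bitmask of spillable physical registers. Below is a small standalone C++ sketch of that pattern; the names spillCostOf/pickSpillCandidate and the placeholder cost model are hypothetical illustrations, not part of AsmJit's API.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for a spill-cost metric: any "how expensive is it to
// spill whatever currently occupies physId" function works here.
static uint32_t spillCostOf(uint32_t physId) {
  return physId * 7u + 3u; // placeholder cost model, for illustration only
}

// Minimum-cost scan over a bitmask of spillable physical registers, mirroring
// the shape of RALocalAllocator::decideOnSpillFor() above.
static uint32_t pickSpillCandidate(uint32_t spillableRegs) {
  assert(spillableRegs != 0);

  uint32_t bestId = 0;
  uint32_t bestCost = UINT32_MAX;

  while (spillableRegs) {
    uint32_t physId = uint32_t(__builtin_ctz(spillableRegs)); // lowest set bit (GCC/Clang builtin; Support::ctz in AsmJit)
    spillableRegs &= spillableRegs - 1;                       // clear that bit

    uint32_t cost = spillCostOf(physId);
    if (cost < bestCost) {
      bestCost = cost;
      bestId = physId;
    }
  }
  return bestId;
}

int main() {
  // Registers 2, 5 and 9 are spillable; the scan returns the cheapest one.
  uint32_t mask = (1u << 2) | (1u << 5) | (1u << 9);
  std::printf("spill candidate: %u\n", pickSpillCandidate(mask));
  return 0;
}

The real implementation additionally skips the cost computation when only a single register is spillable, which is the common case for ordinary instructions.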