/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "../GraphColor.h"
#include "../PointsToAnalysis.h"
#include "LocalScheduler_G4IR.h"
#include "Passes/AccSubstitution.hpp"

// clang-format off
#include "common/LLVMWarningsPush.hpp"
#include "llvm/Support/Allocator.h"
#include "common/LLVMWarningsPop.hpp"
// clang-format on

#include <algorithm>
#include <fstream>
#include <functional>
#include <iostream>
#include <list>
#include <queue>

using namespace vISA;

// Tuning constants for the pre-RA scheduler.
//
// Blocks whose instruction count is below SMALL_BLOCK_SIZE or above
// LARGE_BLOCK_SIZE are skipped by the per-block scheduling loops.
static constexpr unsigned SMALL_BLOCK_SIZE = 10;
static constexpr unsigned LARGE_BLOCK_SIZE = 20000;
// Upper size bound used by preRA_RegSharing when it only collects the
// register-pressure estimate of a block.
static constexpr unsigned LARGE_BLOCK_SIZE_RPE = 32000;
// Minimal benefit, expressed as a percentage.
static constexpr unsigned PRESSURE_REDUCTION_MIN_BENEFIT = 5;
// Default pressure threshold for register-pressure reduction; it is scaled
// through G4_Kernel::getScaledGRFSize() before being used.
static constexpr unsigned PRESSURE_REDUCTION_THRESHOLD = 110;
// Pressure above which preRA_RegSharing moves the kernel to a larger GRF
// configuration (also scaled through getScaledGRFSize()).
static constexpr unsigned PRESSURE_HIGH_THRESHOLD = 128;
// Same as PRESSURE_REDUCTION_THRESHOLD, but for sliced SIMD32 kernels.
static constexpr unsigned PRESSURE_REDUCTION_THRESHOLD_SIMD32 = 120;

namespace {

// Forward declarations: preEdge stores a preNode pointer and preDDD takes a
// RegisterPressure reference before either type is defined below.
class preNode;
struct RegisterPressure;

// Allocator type accepted by preNode's placement operator new.
using preNodeAlloc = llvm::SpecificBumpPtrAllocator<preNode>;

// A dependence edge of the scheduling DAG. Each edge records the node at its
// far end, the kind of dependence, and (optionally) a latency.
class preEdge {
public:
  // Creates an edge ending at node N with dependence type Ty. The latency
  // starts out as -1, meaning it has not been set.
  preEdge(preNode *N, DepType Ty) : mNode(N), mType(Ty), mLatency(-1) {}

  // Node at the end of this edge.
  preNode *getNode() const { return mNode; }

  // Dependence type this edge was created with.
  DepType getType() const { return mType; }

  // Returns true if this edge is a register or memory RAW/WAR/WAW
  // dependence; every other dependence type yields false.
  bool isDataDep() const {
    switch (mType) {
    case DepType::RAW:
    case DepType::WAR:
    case DepType::WAW:
    case DepType::RAW_MEMORY:
    case DepType::WAR_MEMORY:
    case DepType::WAW_MEMORY:
      return true;
    default:
      break;
    }
    return false;
  }

  void setLatency(int L) { mLatency = L; }

  // Returns the stored latency for data dependences and 0 for any other edge
  // kind. This is a pure query, so it is const and can be called through
  // const references.
  int getLatency() const { return isDataDep() ? mLatency : 0; }

private:
  // Node at the end of this edge.
  preNode *mNode;

  // Type of dependence (RAW, WAW, WAR, etc.).
  DepType mType;

  // data-dependence Latency used in Latency scheduling.
  // only exists (i.e. >=0) on succ-edge during latency scheduling.
  // set in LatencyQueue::calculatePriority
  int mLatency;
};

class preNode {
public:
  // A node for an instruction.
  preNode(G4_INST *Inst, unsigned ID) : Inst(Inst), ID(ID) {
    Barrier = checkBarrier(Inst);
  }

  // A special node without attaching to an instruction.
  preNode() { Barrier = checkBarrier(Inst); }

  ~preNode() = default;

  void *operator new(size_t sz, preNodeAlloc &Allocator) {
    return Allocator.Allocate(sz / sizeof(preNode));
  }

  DepType getBarrier() const { return Barrier; }
  void setBarrier(DepType d) { Barrier = d; }
  static DepType checkBarrier(G4_INST *Inst);
  static bool isBarrier(G4_INST *Inst) {
    switch (checkBarrier(Inst)) {
    case DepType::DEP_LABEL:
    case DepType::CONTROL_FLOW_BARRIER:
    case DepType::INDIRECT_ADDR_BARRIER:
    case DepType::MSG_BARRIER:
    case DepType::OPT_BARRIER:
    case DepType::SEND_BARRIER:
      return true;
    default:
      break;
    }
    return false;
  }

  vISA::G4_INST *getInst() const { return Inst; }
  unsigned getID() const { return ID; }

  typedef std::vector<preEdge>::iterator pred_iterator;
  typedef std::vector<preEdge>::const_iterator const_pred_iterator;
  pred_iterator pred_begin() { return Preds.begin(); }
  pred_iterator pred_end() { return Preds.end(); }
  bool pred_empty() const { return Preds.empty(); }
  unsigned pred_size() const { return (unsigned)Preds.size(); }
  std::vector<preEdge> &preds() { return Preds; }

  typedef std::vector<preEdge>::iterator succ_iterator;
  typedef std::vector<preEdge>::const_iterator const_succ_iterator;
  succ_iterator succ_begin() { return Succs.begin(); }
  succ_iterator succ_end() { return Succs.end(); }
  bool succ_empty() const { return Succs.empty(); }
  unsigned succ_size() const { return (unsigned)Succs.size(); }
  std::vector<preEdge> &succs() { return Succs; }

  void setTupleLead(preNode *Lead) {
    TupleLead = Lead;
    if (this == Lead)
      TupleParts = 1;
    else
      ++Lead->TupleParts;
  }
  preNode *getTupleLead() const { return TupleLead; }
  unsigned getTupleParts() const {
    if (this == TupleLead)
      return TupleParts;
    return TupleLead->TupleParts;
  }
  // Used in latency scheduling
  void setReadyCycle(unsigned cyc) { ReadyCycle = cyc; }
  unsigned getReadyCycle() { return ReadyCycle; }
  // Used in ACC scheduling
  void setACCCandidate() { ACCCandidate = true; }
  bool isACCCandidate() { return ACCCandidate; }

  void print(std::ostream &os) const;
  void dump() const;

private:
  /* The following data shall not be overwritten by a scheduler. */
  std::vector<preEdge> Succs;
  std::vector<preEdge> Preds;

  // The corresponding instruction to this node.
  G4_INST *Inst = nullptr;

  // The unique node ID.
  unsigned ID = 0xFFFFFFFF;

  // Indicates whether this node is a barrier (NONE, SEND, CONTROL)
  DepType Barrier;

  /* The following data may be overwritten by a scheduler. */

  // Tuple node, which should be schedule in pair with this node.
  preNode *TupleLead = nullptr;
  unsigned TupleParts = 0;

  // # of preds not scheduled.
  unsigned NumPredsLeft = 0;

  // # of succs not scheduled.
  unsigned NumSuccsLeft = 0;

  // the earliest cycle for latency scheduling
  unsigned ReadyCycle = 0;

  // True once scheduled.
  bool isScheduled = false;
  bool isClustered = false;
  bool isClusterLead = false;
  bool ACCCandidate = false;

  friend class preDDD;
  friend class BB_Scheduler;
  friend class BB_ACC_Scheduler;
  friend class SethiUllmanACCQueue;
  friend class SethiUllmanQueue;
  friend class LatencyQueue;
};

// Data dependency graph (DDD) of a single basic block. It owns the DAG nodes
// and the bookkeeping needed while the block is scanned to build the graph.
class preDDD {
  G4_Kernel &kernel;

  // The basic block to be scheduled.
  G4_BB *m_BB;

  // If this DDD has been built.
  bool IsDagBuilt = false;

  // All nodes to be built and scheduled.
  preNodeAlloc preNodeAllocator;
  std::vector<preNode *> SNodes;

  // Special node for the schedule region.
  preNode EntryNode;
  preNode ExitNode;

  // New operands to be added into live ones.
  // This auxiliary vector is built while processing one node.
  std::vector<std::pair<preNode *, Gen4_Operand_Number>> NewLiveOps;

  // Cached value of the vISA_ReorderDPSendToDifferentBti option.
  bool BTIIsRestrict;

public:
  // Binds the graph to block BB of the given kernel and caches the
  // vISA_ReorderDPSendToDifferentBti option.
  preDDD(G4_Kernel &kernel, G4_BB *BB)
      : kernel(kernel), m_BB(BB),
        BTIIsRestrict(
            kernel.getOptions()->getOption(vISA_ReorderDPSendToDifferentBti)) {}
  ~preDDD() = default;

  G4_Kernel &getKernel() const { return kernel; }
  G4_BB *getBB() const { return m_BB; }
  Options *getOptions() const { return kernel.getOptions(); }
  preNode *getEntryNode() { return &EntryNode; }
  preNode *getExitNode() { return &ExitNode; }
  std::vector<preNode *> &getNodes() { return SNodes; }

  // Build the data dependency graph.
  void buildGraph();

  // Initialize or clear per node state so that data dependency graph
  // could be used for scheduling.
  void reset(bool ReassignNodeID = false);

  // Dump the DDD into a text file
  // need RegisterPressure to get LiveOut info
  void dumpDagTxt(RegisterPressure &rp);
  // Dump the DDD into a dot file.
  void dumpDagDot();

  void buildGraphForACC();

  // Each instruction creates live nodes for adding dependency edges.
  struct LiveNode {
    LiveNode(preNode *N, Gen4_Operand_Number OpNum) : N(N), OpNum(OpNum) {}

    // The DAG node that this node belongs to.
    preNode *N;

    // This indicates which operand this node is tracking.
    Gen4_Operand_Number OpNum;

    // An operand counts as a write when it is the destination, the
    // conditional modifier or the implicit accumulator destination.
    bool isWrite() const {
      switch (OpNum) {
      case Gen4_Operand_Number::Opnd_dst:
      case Gen4_Operand_Number::Opnd_condMod:
      case Gen4_Operand_Number::Opnd_implAccDst:
        return true;
      default:
        return false;
      }
    }
    // Everything that is not a write is treated as a read.
    bool isRead() const { return !isWrite(); }

    friend void swap(LiveNode &a, LiveNode &b) {
      std::swap(a.N, b.N);
      std::swap(a.OpNum, b.OpNum);
    }
  };

private:
  // Keep live nodes while scanning the block.
  // Each declare is associated with a list of live nodes.
  std::unordered_map<const G4_Declare *, std::vector<LiveNode>> LiveNodes;

  // Use an extra list to track physically assigned nodes, I.e. a0.2 etc.
  std::vector<LiveNode> LivePhysicalNodes;

  // Use an extra list to track send message dependency.
  std::vector<preNode *> LiveSends;

  // The most recent scheduling barrier.
  preNode *prevBarrier = nullptr;

  // The core function for building the DAG.
  // This adds node to the DAG and adds any required edges
  // by checking the dependencies against the live nodes.
  void addNodeToGraph(preNode *N);

  // Create a new edge from pred->succ of type D, unless pred already has an
  // edge to succ; in that case the graph is left untouched.
  void addEdge(preNode *pred, preNode *succ, DepType D) {
    const bool AlreadyLinked =
        std::any_of(pred->succ_begin(), pred->succ_end(),
                    [succ](const preEdge &E) { return E.getNode() == succ; });
    if (AlreadyLinked)
      return;

    pred->Succs.emplace_back(succ, D);
    succ->Preds.emplace_back(pred, D);
  }

  void processBarrier(preNode *curNode, DepType Dep);
  void processSend(preNode *curNode);
  void addSrcOpndDep(preNode *curNode, G4_Declare *dcl,
                     Gen4_Operand_Number OpNum);
  void processReadWrite(preNode *curNode);
  void prune();
};

// Track and recompute register pressure for a block.
//
// Ownership: when constructed with a non-null RPE, this object only borrows
// it (and the liveness it exposes). When constructed with a null RPE, init()
// allocates the points-to analysis, GlobalRA, liveness and RPE objects, and
// the destructor frees them.
struct RegisterPressure {
  PointsToAnalysis *p2a = nullptr;
  GlobalRA *gra = nullptr;
  LivenessAnalysis *liveness = nullptr;
  RPE *rpe = nullptr;
  G4_Kernel &kernel;

  // Uses the given RPE if there is one (and re-runs it); otherwise builds
  // all analyses from scratch via init().
  RegisterPressure(G4_Kernel &kernel, RPE *rpe) : rpe(rpe), kernel(kernel) {
    // Initialize rpe if not available.
    if (rpe == nullptr) {
      init();
    } else {
      liveness = const_cast<LivenessAnalysis *>(rpe->getLiveness());
      rpe->run();
    }
  }

  ~RegisterPressure() {
    // Delete only if owns the following objects; p2a is non-null only when
    // init() created them.
    if (p2a) {
      // Destroy in reverse order of creation: rpe was built from gra and
      // liveness, liveness from gra, and gra from p2a, so no object is
      // destroyed while a later-created one still refers to it.
      delete rpe;
      delete liveness;
      delete gra;
      delete p2a;
    }
  }

  RegisterPressure(const RegisterPressure &other) = delete;
  RegisterPressure &operator=(RegisterPressure &other) = delete;

  // Creates and runs the owned analyses: points-to, GlobalRA, liveness and
  // finally RPE.
  void init() {
    p2a = new PointsToAnalysis(kernel.Declares, kernel.fg.getNumBB());
    p2a->doPointsToAnalysis(kernel.fg);
    gra = new GlobalRA(kernel, kernel.fg.builder->phyregpool, *p2a);
    // To properly track liveness for partially-written local variables.
    gra->markGraphBlockLocalVars();
    liveness = new LivenessAnalysis(*gra, G4_GRF | G4_ADDRESS | G4_INPUT |
                                              G4_FLAG | G4_SCALAR);
    liveness->computeLiveness();
    rpe = new RPE(*gra, liveness);
    rpe->run();
  }

  // Returns true if the variable declared by Dcl is live at the exit of bb.
  bool isLiveOut(G4_BB *bb, G4_Declare *Dcl) const {
    G4_RegVar *V = Dcl->getRegVar();
    return liveness->isLiveAtExit(bb, V->getId());
  }

  // Re-runs the pressure estimation for a single block.
  void recompute(G4_BB *BB) { rpe->runBB(BB); }

  // Return the register pressure in GRF for an instruction.
  unsigned getPressure(G4_INST *Inst) const {
    return rpe->getRegisterPressure(Inst);
  }

  // Return the max pressure in GRFs for this block, ignoring pseudo-kills.
  // If Insts is non-null it receives every instruction whose pressure equals
  // the returned maximum.
  unsigned getPressure(G4_BB *bb, std::vector<G4_INST *> *Insts = nullptr) {
    unsigned Max = 0;
    for (auto Inst : *bb) {
      if (Inst->isPseudoKill())
        continue;
      unsigned Pressure = rpe->getRegisterPressure(Inst);
      if (Pressure > Max) {
        // New maximum: restart the list of max-pressure instructions.
        Max = Pressure;
        if (Insts) {
          Insts->clear();
          Insts->push_back(Inst);
        }
      } else if (Pressure == Max && Insts) {
        Insts->push_back(Inst);
      }
    }

    return Max;
  }

  // Prints every instruction of bb to std::cerr prefixed with its pressure
  // ("[---]" for pseudo-kills), followed by the block maximum and the
  // instructions that reach it. prefix is printed before the summary line.
  void dump(G4_BB *bb, const char *prefix = "") {
    unsigned Max = 0;
    std::vector<G4_INST *> Insts;
    for (auto Inst : *bb) {
      if (Inst->isPseudoKill()) {
        std::cerr << "[---] ";
        Inst->dump();
        continue;
      }
      unsigned Pressure = rpe->getRegisterPressure(Inst);
      if (Pressure > Max) {
        Max = Pressure;
        Insts.clear();
        Insts.push_back(Inst);
      } else if (Pressure == Max) {
        Insts.push_back(Inst);
      }
      std::cerr << "[" << Pressure << "] ";
      Inst->dump();
    }
    std::cerr << prefix << "the max pressure is " << Max << "\n";
    std::cerr << "Max pressure instructions are: \n";
    for (auto Inst : Insts) {
      Inst->dump();
    }
    std::cerr << "\n\n";
  }
};

// Scheduler options decoded from a bit mask. Each MASK_* bit of the mask
// passed to the constructor maps onto one single-bit flag below; bits that
// have no MASK_* constant are ignored.
struct SchedConfig {
  enum {
    MASK_DUMP = 1U << 0,
    MASK_LATENCY = 1U << 1,
    MASK_SETHI_ULLMAN = 1U << 2,
    MASK_CLUSTTERING = 1U << 3,
    MASK_SKIP_HOLD = 1U << 4,
    MASK_NOT_ITERATE = 1U << 5,
  };

  unsigned Dump : 1;           // MASK_DUMP
  unsigned UseLatency : 1;     // MASK_LATENCY
  unsigned UseSethiUllman : 1; // MASK_SETHI_ULLMAN
  unsigned DoClustering : 1;   // MASK_CLUSTTERING
  unsigned SkipHoldList : 1; // default 0 i.e. use hold list in latency-hiding
  unsigned DoNotIterate : 1; // default 0 i.e. iterative latency-scheduling

  // Sets each flag to 1 when its mask bit is present in Config, 0 otherwise.
  explicit SchedConfig(unsigned Config)
      : Dump((Config & MASK_DUMP) ? 1U : 0U),
        UseLatency((Config & MASK_LATENCY) ? 1U : 0U),
        UseSethiUllman((Config & MASK_SETHI_ULLMAN) ? 1U : 0U),
        DoClustering((Config & MASK_CLUSTTERING) ? 1U : 0U),
        SkipHoldList((Config & MASK_SKIP_HOLD) ? 1U : 0U),
        DoNotIterate((Config & MASK_NOT_ITERATE) ? 1U : 0U) {}
};

// Executes statement X only when the `Dump` flag of a variable named
// `config` is set. The macro expects such a variable to be in scope at the
// point of use. The do/while(0) wrapper makes the expansion behave as a
// single statement.
#define SCHED_DUMP(X)                                                          \
  do {                                                                         \
    if (config.Dump) {                                                         \
      X;                                                                       \
    }                                                                          \
  } while (0)

// Scheduler on a single block. It operates on a prebuilt dependency graph
// and a shared register-pressure tracker, both of which are borrowed by
// reference.
class BB_Scheduler {
  // The kernel this block belongs to.
  G4_Kernel &kernel;

  // The data dependency graph for this block.
  preDDD &ddd;

  // Register pressure estimation and tracking.
  RegisterPressure &rp;

  // The most recent schedule result.
  std::vector<G4_INST *> schedule;
  // Zero-initialized so the member never holds an indeterminate value.
  unsigned CycleEstimation = 0;
  // save the original list before any scheduling
  INST_LIST OrigInstList;

  // Options to customize scheduler.
  SchedConfig config;

  const LatencyTable &LT;

public:
  BB_Scheduler(G4_Kernel &kernel, preDDD &ddd, RegisterPressure &rp,
               SchedConfig config, const LatencyTable &LT)
      : kernel(kernel), ddd(ddd), rp(rp), config(config), LT(LT) {}
  // The containers release their storage on destruction; no explicit
  // clearing is required.
  ~BB_Scheduler() = default;

  G4_Kernel &getKernel() const { return kernel; }
  G4_BB *getBB() const { return ddd.getBB(); }

  // MaxPressure is the BB pressure before and after scheduling
  bool scheduleBlockForPressure(unsigned &MaxPressure, unsigned Threshold);

  // MaxPressure is the BB reg-pressure before and after scheduling
  // ReassignID of PreNodes when this is not 1st-round scheduling
  // KernelRP is the measure max reg-pressure of this kernel before scheduling
  bool scheduleBlockForLatency(unsigned &MaxPressure, bool ReassignID,
                               unsigned KernelRP);

private:
  void SethiUllmanScheduling();
  void LatencyScheduling(unsigned GroupingThreshold);
  bool verifyScheduling();

  // Relocate pseudo-kills right before its successors.
  void relocatePseudoKills();
  // Commit this scheduling if it reduces register pressure.
  bool commitIfBeneficial(unsigned &MaxRPE, bool IsTopDown = false,
                          unsigned NumGrfs = 128);

  // Moves every instruction of the block into OrigInstList, leaving the
  // block's own instruction list empty.
  void saveOriginalList() {
    INST_LIST &CurInsts = getBB()->getInstList();
    OrigInstList.clear();
    OrigInstList.splice(OrigInstList.begin(), CurInsts, CurInsts.begin(),
                        CurInsts.end());
    vASSERT(CurInsts.empty());
  }

  // Replaces the block's current instruction list with the saved one and
  // recomputes the register pressure of the block.
  void restoreOriginalList() {
    INST_LIST &CurInsts = getBB()->getInstList();
    vASSERT(CurInsts.size() == OrigInstList.size());
    CurInsts.clear();
    CurInsts.splice(CurInsts.begin(), OrigInstList, OrigInstList.begin(),
                    OrigInstList.end());
    rp.recompute(getBB());
  }
};

} // namespace

// Returns true when the kernel is SIMD32 while the native execution size
// reported by the builder is smaller than SIMD16. Scheduling treats such
// kernels specially because of simd32-slicing.
static bool isSlicedSIMD32(G4_Kernel &kernel) {
  if (kernel.getSimdSize() != g4::SIMD32)
    return false;
  return kernel.fg.builder->getNativeExecSize() < g4::SIMD16;
}

// Returns the register-pressure threshold used for pressure reduction.
// A non-zero vISA_preRA_MinRegThreshold option takes precedence; otherwise a
// built-in default is scaled through G4_Kernel::getScaledGRFSize().
static unsigned getRPReductionThreshold(G4_Kernel &kernel) {
  const unsigned UserThreshold =
      kernel.getOptions()->getuInt32Option(vISA_preRA_MinRegThreshold);
  if (UserThreshold != 0)
    return UserThreshold;

  // For SIMD32 prior to PVC, use a higher threshold for rp reduction,
  // as it may not be beneficial.
  if (isSlicedSIMD32(kernel))
    return kernel.getScaledGRFSize(PRESSURE_REDUCTION_THRESHOLD_SIMD32);
  return kernel.getScaledGRFSize(PRESSURE_REDUCTION_THRESHOLD);
}

// Returns the register-pressure threshold for latency hiding. The base
// value is the vISA_preRA_ScheduleRPThreshold option (104 when the option is
// zero); it is then scaled by (max(NumGrfs, 128) - 32) / 96 in unsigned
// integer arithmetic.
static unsigned getLatencyHidingThreshold(G4_Kernel &kernel, unsigned NumGrfs) {
  const unsigned UserThreshold =
      kernel.getOptions()->getuInt32Option(vISA_preRA_ScheduleRPThreshold);
  const unsigned BaseThreshold = UserThreshold != 0 ? UserThreshold : 104u;
  const unsigned EffectiveGrfs = std::max(NumGrfs, 128u);
  return BaseThreshold * (EffectiveGrfs - 32u) / 96u;
}

// Binds the scheduler to kernel k and an optional register-pressure
// estimator. The options pointer is read from the constructor argument `k`
// rather than from the `kernel` member, so the initialization does not
// depend on the order in which the members are declared in the class.
preRA_Scheduler::preRA_Scheduler(G4_Kernel &k, RPE *rpe)
    : kernel(k), rpe(rpe), m_options(k.getOptions()) {}

// Nothing to release here: the constructor only stores its arguments and
// the options pointer derived from them.
preRA_Scheduler::~preRA_Scheduler() = default;

bool preRA_Scheduler::run() {
  if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D) {
    // Do not run pre-RA scheduler for CM unless user forces it.
    if (!m_options->getOption(vISA_preRA_ScheduleForce))
      return false;
  }

  unsigned Threshold = getRPReductionThreshold(kernel);
  unsigned SchedCtrl = m_options->getuInt32Option(vISA_preRA_ScheduleCtrl);

  auto LT = LatencyTable::createLatencyTable(*kernel.fg.builder);
  SchedConfig config(SchedCtrl);
  RegisterPressure rp(kernel, rpe);
  // skip extreme test cases that scheduling does not good
  // if (kernel.fg.getNumBB() >= 10000 && rp.rpe->getMaxRP() >= 800)
  //   return false;

  bool Changed = false;
  for (auto bb : kernel.fg) {
    if (bb->size() < SMALL_BLOCK_SIZE || bb->size() > LARGE_BLOCK_SIZE) {
      SCHED_DUMP(std::cerr << "Skip block with instructions " << bb->size()
                           << "\n");
      continue;
    }

    if (kernel.getOptions()->getuInt32Option(vISA_ScheduleStartBBID) &&
        (bb->getId() <
         kernel.getOptions()->getuInt32Option(vISA_ScheduleStartBBID))) {
      SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
      continue;
    }

    if (kernel.getOptions()->getuInt32Option(vISA_ScheduleEndBBID) &&
        (bb->getId() >
         kernel.getOptions()->getuInt32Option(vISA_ScheduleEndBBID))) {
      SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
      continue;
    }

    unsigned MaxPressure = rp.getPressure(bb);
    if (MaxPressure <= Threshold && !config.UseLatency) {
      SCHED_DUMP(std::cerr << "Skip block with rp " << MaxPressure << "\n");
      continue;
    }

    SCHED_DUMP(rp.dump(bb, "Before scheduling, "));
    preDDD ddd(kernel, bb);
    BB_Scheduler S(kernel, ddd, rp, config, *LT);

    Changed |= S.scheduleBlockForPressure(MaxPressure, Threshold);
    Changed |= S.scheduleBlockForLatency(MaxPressure, Changed, 0);
  }
  if (kernel.getOptions()->getOption(vISA_PreSchedGRFPressure)) {
    rp.rpe->run();
    kernel.fg.builder->getJitInfo()->stats.maxGRFPressure = rp.rpe->getMaxRP();
  }
  return Changed;
}

preRA_RegSharing::preRA_RegSharing(G4_Kernel &k, RPE *rpe)
    : kernel(k), rpe(rpe) {}

preRA_RegSharing::~preRA_RegSharing() {}

bool preRA_RegSharing::run() {
  // General algorithm:
  //  1. Schedule for pressure
  //      - If RP is low (e.g. < 64, based on platform), set maximum number of
  //      threads
  //  2. Estimate number of threads [4 .. 12] based on initial RP
  //  3. Schedule for latency (obtain ILP, stalls, throughput)
  //  4. Compute cost of schedule
  //  5. Based on schedule cost:
  //      - Return ok (keep best schedule)
  //      - Goto 3

  if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D) {
    // Do not run pre-RA scheduler for CM unless user forces it.
    if (!kernel.getOptions()->getOption(vISA_preRA_ScheduleForce))
      return false;
  }

  bool changed = false;

  unsigned SchedCtrl =
      kernel.getOptions()->getuInt32Option(vISA_preRA_ScheduleCtrl);
  SchedConfig config(SchedCtrl);

  RegisterPressure rp(kernel, rpe);

  std::unordered_map<G4_BB *, unsigned int> rpBB;
  unsigned KernelPressure = 0;

  // Obtain register pressure estimate of every BB
  for (auto bb : kernel.fg) {
    if (bb->size() < SMALL_BLOCK_SIZE || bb->size() > LARGE_BLOCK_SIZE_RPE) {
      SCHED_DUMP(std::cerr << "Skip block with instructions " << bb->size()
                           << "\n");
      continue;
    }

    unsigned pressure = rp.getPressure(bb);
    rpBB[bb] = pressure;

    if (pressure > KernelPressure) {
      KernelPressure = pressure;
    }
  }

  // Obs: Heuristic considering PVC with 2 GRF modes as of 03/2020
  // If maximum register pressure is higher than default GRF mode,
  // assign the smallest number of threads to this kernel.
  if (!kernel.getOptions()->getuInt32Option(vISA_HWThreadNumberPerEU) &&
      (KernelPressure >
       kernel.getScaledGRFSize(PRESSURE_HIGH_THRESHOLD) -
           kernel.getOptions()->getuInt32Option(vISA_ReservedGRFNum))) {
    // Update number of threads, GRF, Acc and SWSB
    kernel.updateKernelToLargerGRF();
  }

  // skip extreme test cases that scheduling does not good
  // if (kernel.fg.getNumBB() >= 10000 && KernelPressure >= 800)
  //   return false;

  unsigned Threshold = getRPReductionThreshold(kernel);
  auto LT = LatencyTable::createLatencyTable(*kernel.fg.builder);

  for (auto bb : kernel.fg) {
    if (bb->size() < SMALL_BLOCK_SIZE || bb->size() > LARGE_BLOCK_SIZE) {
      SCHED_DUMP(std::cerr << "Skip block with instructions " << bb->size()
                           << "\n");
      continue;
    }

    if (kernel.getOptions()->getuInt32Option(vISA_ScheduleStartBBID) &&
        (bb->getId() <
         kernel.getOptions()->getuInt32Option(vISA_ScheduleStartBBID))) {
      SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
      continue;
    }

    if (kernel.getOptions()->getuInt32Option(vISA_ScheduleEndBBID) &&
        (bb->getId() >
         kernel.getOptions()->getuInt32Option(vISA_ScheduleEndBBID))) {
      SCHED_DUMP(std::cerr << "Skip BB" << bb->getId() << "\n");
      continue;
    }

    unsigned MaxPressure = rpBB.find(bb) == rpBB.end() ? 0 : rpBB[bb];
    if (MaxPressure <= Threshold && !config.UseLatency) {
      SCHED_DUMP(std::cerr << "Skip block with rp " << MaxPressure << "\n");
      continue;
    }

    SCHED_DUMP(rp.dump(bb, "Before scheduling, "));
    preDDD ddd(kernel, bb);
    BB_Scheduler S(kernel, ddd, rp, config, *LT);

    changed |= S.scheduleBlockForPressure(MaxPressure, Threshold);
      changed |= S.scheduleBlockForLatency(MaxPressure, changed, 0);
  }
  if (kernel.getOptions()->getOption(vISA_PreSchedGRFPressure)) {
    rp.rpe->run();
    kernel.fg.builder->getJitInfo()->stats.maxGRFPressure = rp.rpe->getMaxRP();
  }
  return changed;
}

bool BB_Scheduler::verifyScheduling() {
  std::set<G4_INST *> Insts;
  for (auto Inst : *(getBB()))
    Insts.insert(Inst);

  for (auto Inst : schedule) {
    if (Insts.count(Inst) != 1) {
      Inst->dump();
      return false;
    }
  }

  return true;
}

namespace {

// Common base of the scheduling queues. It holds the dependency graph, the
// register-pressure tracker, the scheduler options, the list of ready nodes
// and the tuple that is currently being paired.
class QueueBase {
public:
  // Tuple lead currently being completed, or nullptr if there is none.
  preNode *getCurrTupleLead() const { return TheCurrTupleLead; }

  // Starts tracking the tuple N belongs to: remembers its lead and the
  // number of parts.
  void setCurrTupleLead(preNode *N) {
    vASSERT(N->getInst()->getExecSize() == g4::SIMD8 ||
            N->getInst()->getExecSize() == g4::SIMD16);
    TheCurrTupleLead = N->getTupleLead();
    TheCurrTupleParts = N->getTupleParts();
  }

  // Accounts for one more part (N) of the current tuple; the tuple is
  // dropped once its part count reaches zero.
  void updateCurrTupleLead(preNode *N) {
    vASSERT(TheCurrTupleLead != nullptr);
    vASSERT(N->getTupleLead() == TheCurrTupleLead);
    --TheCurrTupleParts;
    if (TheCurrTupleParts != 0)
      return;
    TheCurrTupleLead = nullptr;
  }

  // Adds a node to the queue.
  virtual void push(preNode *N) = 0;
  // Picks the next node to schedule.
  virtual preNode *pickNode() = 0;

protected:
  QueueBase(preDDD &ddd, RegisterPressure &rp, SchedConfig config)
      : ddd(ddd), rp(rp), config(config) {}

  virtual ~QueueBase() = default;

  // The data-dependency graph.
  preDDD &ddd;

  // Register pressure related data.
  RegisterPressure &rp;

  // Options to customize scheduler.
  SchedConfig config;

  // Ready nodes.
  std::vector<preNode *> Q;

  // The current (send) tuple lead.
  preNode *TheCurrTupleLead = nullptr;
  unsigned TheCurrTupleParts = 0;
};

// Ready queue that orders nodes by Sethi-Ullman numbers in order to reduce
// register pressure, with optional clustering of nodes.
class SethiUllmanQueue : public QueueBase {
  // Sethi-Ullman numbers.
  // max-reg-pressure for the sub-exp-tree starting from a node
  std::vector<int> MaxRegs;
  // Destination size recorded per node, indexed by node ID like MaxRegs.
  std::vector<int> DstSizes;

  // The clustering nodes.
  std::vector<preNode *> Clusterings;
  // Predecessors already examined as clustering candidates.
  std::set<preNode *> Visited;

  // Scheduling in clustering mode.
  bool IsInClusteringMode = false;

public:
  // Builds the queue and computes the Sethi-Ullman numbers right away.
  SethiUllmanQueue(preDDD &ddd, RegisterPressure &rp, SchedConfig config)
      : QueueBase(ddd, rp, config) {
    init();
  }

  // Add a new ready node. Clustered nodes other than the cluster lead are
  // not queued again; they are expected to be in Clusterings already.
  void push(preNode *N) override {
    const bool IsClusterMember = N->isClustered && !N->isClusterLead;
    if (!IsClusterMember) {
      Q.push_back(N);
      return;
    }
    vASSERT(std::find(Clusterings.begin(), Clusterings.end(), N) !=
            Clusterings.end());
  }

  // Schedule the top node.
  preNode *pickNode() override { return select(); }

  // True when neither ready nodes nor clustering nodes remain.
  bool empty() const { return Q.empty() && Clusterings.empty(); }

private:
  // Initialize Sethi-Ullman numbers.
  void init();

  // Select next ready node to schedule.
  preNode *select();

  preNode *scheduleClusteringNode();

  // Compare two ready nodes and decide which one should be scheduled first.
  // Return true if N2 has a higher priority than N1, false otherwise.
  bool compare(preNode *N1, preNode *N2);

  // Compute the Sethi-Ullman number for a node.
  void calculateSethiUllmanNumber(preNode *N);
};

} // namespace

// This implements the idea in the paper by Appel & Supowit:
//
// Generalizations of the Sethi-Ullman algorithm for register allocation
//
// Fills MaxRegs[ID] and DstSizes[ID] for node N, recursing into its RAW/WAW
// predecessors first. A node whose entries are not both zero is treated as
// already computed and left untouched.
void SethiUllmanQueue::calculateSethiUllmanNumber(preNode *N) {
  // Size charged for holding the result of a node: 0 when there is no
  // instruction, no destination, no top declare, or when the destination is
  // live-out of the block; otherwise the declare's byte size, adjusted for
  // declares smaller than twice their sub-register alignment.
  auto estimateDstBytes = [this](preNode *Node) -> int {
    G4_INST *Inst = Node->getInst();
    if (Inst == nullptr)
      return 0;
    G4_DstRegRegion *Dst = Inst->getDst();
    if (Dst == nullptr || Dst->getTopDcl() == nullptr)
      return 0;
    // If a variable lives out, then there is no extra cost to hold the
    // result.
    if (rp.isLiveOut(ddd.getBB(), Dst->getTopDcl()))
      return 0;

    auto RootDcl = Dst->getTopDcl();
    auto Bytes = RootDcl->getByteSize();
    auto AlignBytes = static_cast<uint32_t>(RootDcl->getSubRegAlign()) * 2;
    if (Bytes < AlignBytes)
      Bytes = std::min(Bytes * 2, AlignBytes);
    return Bytes;
  };

  const unsigned NodeID = N->getID();
  vASSERT(NodeID < MaxRegs.size());
  vASSERT(NodeID < DstSizes.size());

  // Already computed.
  if (MaxRegs[NodeID] != 0 || DstSizes[NodeID] != 0)
    return;

  // record the destination register requirement
  DstSizes[NodeID] = estimateDstBytes(N);

  // Collect the defining predecessors together with (MaxReg - DstSize).
  std::vector<std::pair<preNode *, int>> Operands;
  for (preEdge &Edge : N->preds()) {
    const auto Kind = Edge.getType();
    if (Kind != DepType::RAW && Kind != DepType::WAW)
      continue;

    // Skip pseudo-kills as they are lifetime markers.
    preNode *Def = Edge.getNode();
    G4_INST *DefInst = Def->getInst();
    if (DefInst && DefInst->isPseudoKill())
      continue;

    // Recurse on the predecessors.
    calculateSethiUllmanNumber(Def);
    const unsigned DefID = Def->getID();
    Operands.emplace_back(Def, MaxRegs[DefID] - DstSizes[DefID]);
  }

  // Visit the operands in ascending order of (MaxReg - DstSize) and fold
  // their requirements into the number for this node.
  std::sort(Operands.begin(), Operands.end(),
            [](const std::pair<preNode *, int> &lhs,
               const std::pair<preNode *, int> &rhs) {
              return lhs.second < rhs.second;
            });

  int Required = 0;
  for (const auto &Operand : Operands) {
    const unsigned DefID = Operand.first->getID();
    Required = std::max(MaxRegs[DefID], Required + DstSizes[DefID]);
  }
  MaxRegs[NodeID] = Required;
}

void SethiUllmanQueue::init() {
  auto &Nodes = ddd.getNodes();
  unsigned N = (unsigned)Nodes.size();
  MaxRegs.resize(N, 0);
  DstSizes.resize(N, 0);
  for (auto I = Nodes.rbegin(); I != Nodes.rend(); ++I) {
    calculateSethiUllmanNumber((*I));
  }

#if 0
    std::cerr << "\n\n";
    for (auto I = Nodes.rbegin(); I != Nodes.rend(); ++I) {
        std::cerr << "MaxRegs[" << MaxRegs[(*I)->getID()] << "] ";
        (*I)->getInst()->dump();
    }
    std::cerr << "\n\n";
#endif
}

// Decides which of two distinct ready nodes is scheduled first.
// Returns true if N2 has a higher priority than N1, false otherwise.
bool SethiUllmanQueue::compare(preNode *N1, preNode *N2) {
  // TODO. Introduce heuristics before comparing SU numbers.
  const unsigned ID1 = N1->getID();
  const unsigned ID2 = N2->getID();
  vASSERT(ID1 < MaxRegs.size());
  vASSERT(ID2 < MaxRegs.size());
  vASSERT(ID1 != ID2);

  // Pseudo kill always has higher priority.
  if (N1->getInst()->isPseudoKill())
    return false;

  // Prefer to unlock a pending clustering node.
  if (IsInClusteringMode) {
    vASSERT(!Clusterings.empty());
    preNode *Pending = Clusterings.back();
    // Only kick in when top clustering node is not ready. Whichever of
    // N1/N2 appears first among the pending node's successors wins.
    if (Pending->NumSuccsLeft > 0) {
      for (preEdge &SuccEdge : Pending->succs()) {
        preNode *Succ = SuccEdge.getNode();
        if (Succ == N1)
          return false;
        if (Succ == N2)
          return true;
      }
    }
  }

  // This is a bottom-up scheduling. Smaller SU number means higher priority.
  const auto SU1 = MaxRegs[ID1] - DstSizes[ID1];
  const auto SU2 = MaxRegs[ID2] - DstSizes[ID2];
  if (SU1 != SU2)
    return SU1 > SU2;

  // Otherwise, break tie with their IDs. Smaller ID means higher priority.
  return ID1 > ID2;
}

// Drives clustering. The function has two phases:
//  1. While in clustering mode with pending clustering nodes, it returns the
//     next pending node if that node is ready (NumSuccsLeft == 0), or nullptr
//     if it is not ready or all pending nodes turned out to be scheduled.
//  2. Otherwise, when clustering is enabled and no cluster is pending, it
//     tries to form a new cluster from the successors of a data-dependence
//     predecessor of some ready node. In this phase it always returns
//     nullptr; the cluster lead is handed out through the regular queue.
// Side effects: may modify Q, Clusterings, Visited, IsInClusteringMode and
// the isClustered / isClusterLead flags of nodes.
preNode *SethiUllmanQueue::scheduleClusteringNode() {
  // Clustering does not work well for SIMD32 before PVC
  // because all instructions are sliced into SIMD16
  if (isSlicedSIMD32(ddd.getKernel()))
    return nullptr;

  // Schedule clustering nodes first.
  if (IsInClusteringMode && !Clusterings.empty()) {
    // Pop off already scheduled node, if any.
    while (!Clusterings.empty() && Clusterings.back()->isScheduled)
      Clusterings.pop_back();

    // All are scheduled, ending clustering mode.
    if (Clusterings.empty()) {
      IsInClusteringMode = false;
      return nullptr;
    }

    // The next clustering node is not ready yet.
    preNode *Top = Clusterings.back();
    if (Top->NumSuccsLeft > 0)
      return nullptr;

    // The next clustering node is ready and not scheduled yet.
    Clusterings.pop_back();
    if (Clusterings.empty())
      IsInClusteringMode = false;
    return Top;
  }

  // The width limit of clustering.
  const unsigned CLUSTER_SIZE_MIN = 3;
  const unsigned CLUSTER_SIZE_MAX = 8;

  // Match clustering nodes: fills Clusterings with successors of predNode
  // and returns true if they form an acceptable cluster; on failure
  // Clusterings is left empty. The Node parameter is not used in the body.
  auto collectClustering = [&](preNode *Node, preNode *predNode) {
    // First attempt: every unscheduled, non-send data-dependence successor,
    // whether ready or not.
    for (auto &E : predNode->succs()) {
      preNode *N = E.getNode();
      // Match nodes may not be ready.
      if (!E.isDataDep() || N->isScheduled)
        continue;
      // Do not cluster sends, which may confuse send pairing.
      if (N->getInst() == nullptr || N->getInst()->isSend())
        continue;
      Clusterings.push_back(N);
    }

    // Check if the first matching is successful: the candidates must be
    // exactly the successors predNode still waits for, and their count must
    // be within the cluster size limits.
    if (unsigned(Clusterings.size()) == predNode->NumSuccsLeft &&
        unsigned(Clusterings.size()) >= CLUSTER_SIZE_MIN &&
        unsigned(Clusterings.size()) <= CLUSTER_SIZE_MAX)
      return true;

    // Check if the second matching is successful. It is only attempted when
    // the ready queue holds at least CLUSTER_SIZE_MIN nodes.
    if (unsigned(Q.size()) >= CLUSTER_SIZE_MIN) {
      Clusterings.clear();
      for (auto &E : predNode->succs()) {
        preNode *N = E.getNode();
        // Only match ready nodes.
        if (!E.isDataDep() || N->isScheduled || N->NumSuccsLeft)
          continue;
        // Do not cluster sends, which may confuse send pairing.
        if (N->getInst() == nullptr || N->getInst()->isSend())
          continue;
        Clusterings.push_back(N);
      }
      if (unsigned(Clusterings.size()) == predNode->NumSuccsLeft &&
          unsigned(Clusterings.size()) >= CLUSTER_SIZE_MIN &&
          unsigned(Clusterings.size()) <= CLUSTER_SIZE_MAX) {
        return true;
      }
    }

    Clusterings.clear();
    return false;
  };

  if (config.DoClustering && Clusterings.empty()) {
    for (auto Node : Q) {
      for (auto I = Node->pred_begin(), E = Node->pred_end(); I != E; ++I) {
        if (I->isDataDep()) {
          preNode *predN = I->getNode();
          // Each predecessor is tried at most once; Visited remembers the
          // ones already examined.
          if (!Visited.insert(predN).second)
            continue;
          if (collectClustering(Node, predN))
            break;
        }
      }

      if (!Clusterings.empty()) {
        for (auto N : Clusterings)
          N->isClustered = true;

        // Clustered nodes leave the regular ready queue.
        Q.erase(std::remove_if(Q.begin(), Q.end(),
                               [](preNode *N) { return N->isClustered; }),
                Q.end());

        // Sort by descending ID so that back() is the node with the
        // smallest ID.
        std::sort(
            Clusterings.begin(), Clusterings.end(),
            [](preNode *A, preNode *B) { return A->getID() > B->getID(); });

        // We put the leading node back to the regular queue to
        // participate SU number comparison.
        preNode *Top = Clusterings.back();
        Top->isClusterLead = true;
        if (Top->NumSuccsLeft == 0) {
          Clusterings.pop_back();
          Q.push_back(Top);
          return nullptr;
        }
        // The lead is not ready yet; it stays in Clusterings.
        break;
      }
    }
  }

  return nullptr;
}

// Picks the next node to schedule. A ready clustering node takes precedence;
// otherwise the best node of the ready queue according to compare() is
// removed from the queue and returned.
preNode *SethiUllmanQueue::select() {
  if (preNode *Clustered = scheduleClusteringNode())
    return Clustered;

  vASSERT(!Q.empty());
  auto Best = Q.end();
  for (auto It = Q.begin(), End = Q.end(); It != End; ++It) {
    preNode *Cand = *It;
    // If there's a node to be paired, skip send not in pair.
    const bool IsSend = Cand->getInst() && Cand->getInst()->isSend();
    if (IsSend && TheCurrTupleLead && Cand->getTupleLead() != TheCurrTupleLead)
      continue;
    if (Best == Q.end() || compare(*Best, Cand))
      Best = It;
  }

  // In rare cases, there is a cycle due to send pairing.
  // Stop this heuristics if it happens.
  if (Best == Q.end()) {
    TheCurrTupleLead = nullptr;
    TheCurrTupleParts = 0;
    for (auto It = Q.begin(), End = Q.end(); It != End; ++It) {
      if (Best == Q.end() || compare(*Best, *It))
        Best = It;
    }
  }

  vASSERT(Best != Q.end());
  preNode *Top = *Best;
  // Remove the winner by overwriting its slot with the last element and
  // dropping the tail.
  *Best = Q.back();
  Q.pop_back();

  // This selected node is clustered. From now on, schedule all
  // other clustered nodes.
  if (Top->isClustered)
    IsInClusteringMode = true;

  return Top;
}

// Try to reduce the register pressure of this block with Sethi-Ullman
// scheduling. Nothing is attempted unless the heuristic is enabled and
// MaxPressure has reached Threshold. If the first schedule is not committed
// and clustering is off (and the kernel is not a sliced SIMD32 one), a second
// attempt is made with clustering temporarily enabled.
//
// MaxPressure is passed on to commitIfBeneficial(), which updates it when a
// schedule is committed. Returns true if a new schedule was committed.
bool BB_Scheduler::scheduleBlockForPressure(unsigned &MaxPressure,
                                            unsigned Threshold) {
  if (!config.UseSethiUllman || MaxPressure < Threshold)
    return false;

  ddd.buildGraph();
  if (kernel.getOptions()->getOption(vISA_DumpDagTxt))
    ddd.dumpDagTxt(rp);

  SethiUllmanScheduling();
  bool Committed = commitIfBeneficial(MaxPressure);

  if (!Committed && !config.DoClustering &&
      !isSlicedSIMD32(ddd.getKernel())) {
    // Retry with clustering; the configuration is restored right after the
    // schedule is produced.
    ddd.reset(false);
    const auto SavedClustering = config.DoClustering;
    config.DoClustering = 1;
    SethiUllmanScheduling();
    config.DoClustering = SavedClustering;
    Committed = commitIfBeneficial(MaxPressure);
  }

  if (Committed) {
    SCHED_DUMP(rp.dump(ddd.getBB(), "After scheduling for presssure, "));
  }
  return Committed;
}

void BB_Scheduler::SethiUllmanScheduling() {
  schedule.clear();
  SethiUllmanQueue Q(ddd, rp, config);
  Q.push(ddd.getExitNode());

  while (!Q.empty()) {
    preNode *N = Q.pickNode();
    vASSERT(!N->isScheduled && N->NumSuccsLeft == 0);
    if (N->getInst() != nullptr) {
      // std::cerr << "emit: "; N->getInst()->dump();
      if (N->getInst()->isSend() && N->getTupleLead()) {
        // If it's the pair of the current node, reset the node to be
        // paired. If it's send with pair, ensure its pair is scheduled
        // before other sends by setting the current node to be paired.
        if (!Q.getCurrTupleLead())
          Q.setCurrTupleLead(N);
        if (Q.getCurrTupleLead())
          Q.updateCurrTupleLead(N);
      }
      schedule.push_back(N->getInst());
      N->isScheduled = true;
    }

    for (auto I = N->pred_begin(), E = N->pred_end(); I != E; ++I) {
      preNode *Node = I->getNode();
      vASSERT(!Node->isScheduled && Node->NumSuccsLeft);
      --Node->NumSuccsLeft;
      if (Node->NumSuccsLeft == 0)
        Q.push(Node);
    }
  }

  vASSERT(verifyScheduling());
}

namespace {

// Queue for scheduling to hide latency.
class LatencyQueue : public QueueBase {
  // Assign a priority to each node.
  std::vector<unsigned> Priorities;

  // Map each instruction to a group ID. Instructions in the same
  // group will be scheduled for latency.
  std::map<G4_INST *, unsigned> GroupInfo;

  // Instruction latency information.
  const LatencyTable &LT;

  // TODO: Try to apply priority queue to SethiUllmanQueue as well.

  // nodes with all predecessors scheduled and ready-cycle <= current-cycle for
  // topdown scheduling
  std::priority_queue<preNode *, std::vector<preNode *>,
                      std::function<bool(preNode *, preNode *)>>
      ReadyList;
  // nodes with all predecessors scheduled and ready-cycle > current-cycle for
  // topdown scheduling
  std::priority_queue<preNode *, std::vector<preNode *>,
                      std::function<bool(preNode *, preNode *)>>
      HoldList;
  // The register-pressure limit we use to decide sub-blocking
  unsigned GroupingPressureLimit;

  // The list is to use to track active scheduled nodes that write flag.
  std::list<preNode *> FlagWrites;
  // Numbers of flag registers from getNumFlagRegisters()
  const unsigned FlagRegNum;

public:
  LatencyQueue(preDDD &ddd, RegisterPressure &rp, SchedConfig config,
               const LatencyTable &LT, unsigned GroupingThreshold)
      : QueueBase(ddd, rp, config), LT(LT),
        ReadyList(
            [this](preNode *a, preNode *b) { return compareReady(a, b); }),
        HoldList([this](preNode *a, preNode *b) { return compareHold(a, b); }),
        GroupingPressureLimit(GroupingThreshold),
        FlagRegNum(ddd.getKernel().fg.builder->getNumFlagRegisters()) {
    init();
  }

  // Add a new node to queue.
  void push(preNode *N) override {
    // Always add pseudo_kill to ready-list directly so that group info
    // advancing won't be affected when moving pseudo_kill from host-list to
    // ready-list.
    if (config.SkipHoldList)
      ReadyList.push(N);
    else if (N->getInst() && N->getInst()->isPseudoKill())
      ReadyList.push(N);
    else
      HoldList.push(N);
  }

  // Pick a node based on heuristics and the heuristics should be ordered based
  // on their priority.
  preNode *pickNode() override {
    vASSERT(!ReadyList.empty());
    preNode *N = nullptr;

    if (!N)
      N = selectCandidateToAvoidFlagSpill();

    if (!N)
      N = select();

    vASSERT(N);
    return N;
  }

  bool empty() const { return ReadyList.empty(); }

  // move instruction from HoldList to ReadyList,
  // also update current-cycle and current-group
  void advance(unsigned &CurCycle, unsigned &CurGroup) {
    if (config.SkipHoldList) {
      vASSERT(HoldList.empty());
      return;
    }
    GroupInfo[nullptr] = CurGroup;
    // move inst out of hold-list based on current group and cycle
    while (!HoldList.empty()) {
      preNode *N = HoldList.top();
      if (GroupInfo[N->getInst()] <= CurGroup &&
          N->getReadyCycle() <= CurCycle) {
        HoldList.pop();
        ReadyList.push(N);
      } else
        break;
    }
    // ready-list is still empty, then we need to move forward to
    // the next group or the next cycle so that some instructions
    // can come out of the hold-list.
    if (ReadyList.empty() && !HoldList.empty()) {
      preNode *N = HoldList.top();
      CurCycle = std::max(CurCycle, N->getReadyCycle());
      CurGroup = std::max(CurGroup, GroupInfo[N->getInst()]);
      do {
        preNode *N = HoldList.top();
        if (GroupInfo[N->getInst()] <= CurGroup &&
            N->getReadyCycle() <= CurCycle) {
          HoldList.pop();
          ReadyList.push(N);
        } else
          break;
      } while (!HoldList.empty());
    }
  }

  void updateFlagUsage(preNode* N) {
    vASSERT(N && N->isScheduled);
    // Only update flag usage when the scheduled node uses flag through
    // condition modifier and predicate.
    G4_INST *NInst = N->getInst();
    if (!NInst || !NInst->usesFlag())
      return;

    auto allFlagUsesScheduled = [](preNode *FW) -> bool {
      return std::all_of(FW->Succs.begin(), FW->Succs.end(), [FW](preEdge &E) {
        preNode *Succ = E.getNode();
        G4_INST *SuccInst = Succ->getInst();
        return !SuccInst ||
            !SuccInst->getPredicate() ||
            (SuccInst->getPredicate()->getBase() !=
                FW->getInst()->getCondMod()->getBase()) ||
            Succ->isScheduled;
      });
    };

    // Erase a flag write if
    // 1. the new node that has the same cond mod, or
    // 2. all of its successors that use the flag are scheduled
    bool writeFlag = NInst->getCondMod() && NInst->opcode() != G4_sel;
    for (auto it = FlagWrites.begin(), ie = FlagWrites.end(); it != ie; ) {
      preNode *FW = *it;
      if (writeFlag && NInst->getCondMod()->getBase() ==
                           FW->getInst()->getCondMod()->getBase()) {
        it = FlagWrites.erase(it);
      } else if (allFlagUsesScheduled(FW)) {
        it = FlagWrites.erase(it);
      } else
        ++it;
    }
    // Append the new flag write to the end.
    if (writeFlag)
      FlagWrites.push_back(N);
  }

private:
  void init();
  unsigned calculatePriority(preNode *N);

  // Select the top node.
  preNode *select() {
    preNode *N = ReadyList.top();
    ReadyList.pop();
    SCHED_DUMP({
      std::cerr << "Picking a node using the default heuristic: ";
      N->dump();
    });
    return N;
  }

  // Select a candidate that won't increase flag pressure and could avoid flag
  // spills potentially. Note that currently this heuristic only considers
  // flag uses in condition modifier and predicate, and does not consider flag
  // in src or dst.
  preNode *selectCandidateToAvoidFlagSpill() {
    // Only try this heuristic when current flag pressure is high.
    if (FlagWrites.size() < FlagRegNum)
      return nullptr;

    auto useAnyActiveFlag = [&](preNode *N) -> bool {
      return N->getInst() && N->getInst()->getPredicate() &&
          std::any_of(FlagWrites.begin(), FlagWrites.end(), [N](preNode *FW) {
              return FW->getInst()->getCondMod()->getBase() ==
                  N->getInst()->getPredicate()->getBase();});
    };

    std::vector<preNode *> Noninterest;
    preNode *N = nullptr;
    while (!ReadyList.empty()) {
      preNode *X = ReadyList.top();
      ReadyList.pop();
      // 1. Pick a node that does not use flag at all.
      if (!X->getInst() || !X->getInst()->usesFlag()) {
        N = X;
        break;
      }
      // 2. Pick a node that uses any active flag.
      if (useAnyActiveFlag(X)) {
        N = X;
        break;
      }
      Noninterest.push_back(X);
    }

    // Add noninterest nodes back to the ready list.
    for (preNode *Node : Noninterest)
      ReadyList.push(Node);

    SCHED_DUMP({
      if (!N) {
        std::cerr << "Unable to pick a node not to increase flag pressure.\n";
      } else {
        std::cerr << "Picking a node to avoid flag spills ";
        if (!N->getInst() || !N->getInst()->usesFlag())
          std::cerr << "(NO_FLAG_USES) : ";
        else
          std::cerr << "(USE_ACTIVE_FLAG) : ";
        N->dump();
      }
    });

    return N;
  }

  // Compare two ready nodes and decide which one should be scheduled first.
  // Return true if N2 has a higher priority than N1, false otherwise.
  bool compareReady(preNode *N1, preNode *N2);

  bool compareHold(preNode *N1, preNode *N2);

  bool comparePseudoKill(preNode *N1, preNode *N2) const;
};

} // namespace

//
bool BB_Scheduler::scheduleBlockForLatency(unsigned &MaxPressure,
                                           bool ReassignID, unsigned KernelRP) {
  auto tryLatencyHiding = [=](unsigned nr) {
    if (!config.UseLatency)
      return false;

    // KernelRP == 0 means we are scheduling for the fixed number of GRF
    if (KernelRP == 0 && MaxPressure >= getLatencyHidingThreshold(kernel, nr))
      return false;

    // simple ROI check.
    unsigned NumOfHighLatencyInsts = 0;
    for (auto Inst : *(ddd.getBB())) {
      if (Inst->isSend()) {
        G4_SendDesc *MsgDesc = Inst->getMsgDesc();
        if (MsgDesc->isRead() || MsgDesc->isSampler() || MsgDesc->isAtomic())
          NumOfHighLatencyInsts++;
      }
    }

    return NumOfHighLatencyInsts >= 2;
  };

  unsigned NumGrfs =
      kernel.getNumRegTotal() +
      kernel.getOptions()->getuInt32Option(vISA_preRA_ScheduleExtraGRF);

  if (!tryLatencyHiding(NumGrfs))
    return false;

  // UpperBoundGRF == NumGrfs means we only schedule under single NumGRF
  // setting for this block instead of trying to find the best schedule
  // among multiple NumGRF setting.
  unsigned UpperBoundGRF = NumGrfs;
  unsigned SavedEstimation = 0;
  std::vector<G4_INST *> SavedSchedule;

  // multiple settings are applied only to some blocks to save time
  if (KernelRP > 0 && MaxPressure > 40 && MaxPressure * 2 > KernelRP)
    UpperBoundGRF = std::max(256U, UpperBoundGRF);

  for (; NumGrfs <= UpperBoundGRF; NumGrfs += 32) {
    // try grouping-threshold decremently until we find a schedule likely won't
    // spill
    unsigned Thresholds[] = {144, 128, 112, 104, 96};
    unsigned Iterations = 5;
    float Ratio = (std::max(NumGrfs, 128u) - 48u) / 80.0f;
    // limit the iterative approach to certain platforms for now
    if (config.DoNotIterate) {
      Thresholds[0] = getLatencyHidingThreshold(kernel, NumGrfs);
      Iterations = 1;
      Ratio = 1.0f; // already adjusted inside getLatencyHidingThreshold
    }
    for (unsigned i = 0; i < Iterations; ++i) {
      auto GroupingThreshold = Thresholds[i];
      ddd.reset(ReassignID);
      ReassignID = false; // only reassign inst-local-id at most once
      LatencyScheduling(unsigned(GroupingThreshold * Ratio));
      if (commitIfBeneficial(MaxPressure, /*IsTopDown*/ true, NumGrfs)) {
        if (NumGrfs >= UpperBoundGRF && SavedEstimation == 0) {
          SCHED_DUMP(rp.dump(ddd.getBB(), "After scheduling for latency, "));
          return true;
        }
        // if this schedule does not provide expected gain from
        // previous schedule, stop searching
        if (SavedEstimation > 0 && SavedSchedule.size() == schedule.size() &&
            CycleEstimation * 4 > SavedEstimation * 3) {
          // commit the previous schedule as the best
          INST_LIST &CurInsts = getBB()->getInstList();
          CurInsts.clear();
          for (auto Inst : SavedSchedule)
            CurInsts.push_back(Inst);
          rp.recompute(getBB());
          SCHED_DUMP(rp.dump(ddd.getBB(), "After scheduling for latency, "));
          return true;
        }
        if (NumGrfs >= UpperBoundGRF) {
          // best schedule is found with the max GRF setting
          SCHED_DUMP(rp.dump(ddd.getBB(), "After scheduling for latency, "));
          return true;
        }
        // save the current schedule as the potential choice
        // but do not commit it
        SavedSchedule.swap(schedule);
        SavedEstimation = CycleEstimation;
        schedule.clear();
        // restore the original instruction order
        restoreOriginalList();
        // try the next GRF level
        break;
      }
    }
  }
  if (SavedEstimation > 0 && SavedSchedule.size() > 0) {
    // commit the previous schedule as the best
    INST_LIST &CurInsts = getBB()->getInstList();
    vASSERT(SavedSchedule.size() == CurInsts.size());
    CurInsts.clear();
    for (auto Inst : SavedSchedule)
      CurInsts.push_back(Inst);
    rp.recompute(getBB());
    SCHED_DUMP(rp.dump(ddd.getBB(), "After scheduling for latency, "));
    return true;
  }
  return false;
}

// Scheduling block to hide latency (top down).
void BB_Scheduler::LatencyScheduling(unsigned GroupingThreshold) {
  schedule.clear();
  LatencyQueue Q(ddd, rp, config, LT, GroupingThreshold);
  Q.push(ddd.getEntryNode());

  unsigned CurrentCycle = 0;
  unsigned CurrentGroup = 0;
  Q.advance(CurrentCycle, CurrentGroup);
  while (!Q.empty()) {
    preNode *N = Q.pickNode();
    vASSERT(N->NumPredsLeft == 0);
    // Set NextCycle as max of CurrentCycle and N->getReadyCycle() to include
    // the stall.
    unsigned NextCycle = std::max(CurrentCycle, N->getReadyCycle());
    if (N->getInst()) {
      schedule.push_back(N->getInst());
      NextCycle += LT.getOccupancy(N->getInst());
    }
    N->isScheduled = true;
    // Update flag usage after scheduling a node.
    Q.updateFlagUsage(N);

    for (auto I = N->succ_begin(), E = N->succ_end(); I != E; ++I) {
      preNode *Node = I->getNode();
      vASSERT(!Node->isScheduled && Node->NumPredsLeft);
      int L = (*I).getLatency();
      vASSERT(L >= 0);
      if (Node->getReadyCycle() < CurrentCycle + (unsigned)L)
        Node->setReadyCycle(CurrentCycle + (unsigned)L);
      --Node->NumPredsLeft;
      if (Node->NumPredsLeft == 0) {
        Q.push(Node);
      }
    }
    CurrentCycle = NextCycle;
    Q.advance(CurrentCycle, CurrentGroup);
  }
  CycleEstimation = CurrentCycle;
  relocatePseudoKills();
  vASSERT(verifyScheduling());
}

// Merge consecutive high/low pressure segments of RPtrace as long as the
// estimated peak of the merged segment stays within Threshold, and record in
// Segments the positions (local minima) where a group ends.
//
// RPtrace   pressure per position.
// Max, Min  positions of the local maxima / minima of RPtrace, ascending.
// Segments  output; positions are appended, existing content is kept.
//
// The trace either starts with a local minimum:
/*
      C        /\
              /  \
      A \    /    \
      D  \  /      \
      B   \/
*/
// or with a local maximum:
/*
      D             /\
                   /  \
      B     /\    /    \
           /  \  /      \
      C   /    \/
         /
      A /
*/
static void mergeSegments(const std::vector<unsigned> &RPtrace,
                          const std::vector<unsigned> &Max,
                          const std::vector<unsigned> &Min,
                          std::vector<unsigned> &Segments, unsigned Threshold) {
  const unsigned Count =
      std::min<unsigned>((unsigned)Max.size(), (unsigned)Min.size());
  vASSERT(Count >= 2);

  // When the trace starts with a minimum, the first "high" is the initial
  // pressure, and the maximum that follows Min[i - 1] is Max[i - 1];
  // otherwise the first high is Max[0] and that maximum is Max[i].
  const bool StartsWithMin = Min[0] < Max[0];
  const unsigned MaxShift = StartsWithMin ? 1u : 0u;

  unsigned Hi = StartsWithMin ? RPtrace[0] : RPtrace[Max[0]];
  unsigned Lo = RPtrace[Min[0]];

  for (unsigned i = 1; i < Count; ++i) {
    const unsigned NextHi = RPtrace[Max[i - MaxShift]];
    const unsigned NextLo = RPtrace[Min[i]];
    const unsigned MergedHi = NextHi - Lo + Hi;

    if (MergedHi <= Threshold) {
      // Merge two segments.
      Hi = MergedHi;
    } else {
      // Do not merge two segments, and end the last group.
      Segments.push_back(Min[i - 1]);
      Hi = NextHi;
    }
    Lo = NextLo;
  }

  // If the trace ends after a local minimum, the last high is the end
  // pressure; otherwise it is the last local maximum.
  const unsigned LastHi =
      (Min.back() > Max.back()) ? RPtrace.back() : RPtrace[Max.back()];
  if ((LastHi - Lo + Hi) > Threshold)
    Segments.push_back(Min.back());
}

// Set up the per-instruction group IDs and the per-node priorities used by
// the ordering functions of this queue.
void LatencyQueue::init() {
  G4_BB *BB = ddd.getBB();

  // Scan the block forward and collect its register pressure trace; the
  // grouping below must not cause excessive pressure increase.
  std::vector<unsigned> RPtrace;
  RPtrace.reserve(BB->size());
  for (auto Inst : *BB) {
    unsigned Pressure;
    // Ignore pseudo-kills as they often introduce inaccurate rp bumps:
    // they repeat the previous entry (0 at the very start).
    if (Inst->isPseudoKill())
      Pressure = RPtrace.empty() ? 0 : RPtrace.back();
    else
      Pressure = rp.getPressure(Inst);
    RPtrace.push_back(Pressure);
  }

  // Collect all local maxima and minima among the interior positions. A
  // plateau counts as an extremum at the position right before the trace
  // turns; IsIncreasing remembers the direction taken before the plateau.
  std::vector<unsigned> Max, Min;
  bool IsIncreasing = true;
  for (unsigned i = 1; i + 1 < RPtrace.size(); ++i) {
    const unsigned Prev = RPtrace[i - 1];
    const unsigned Cur = RPtrace[i];
    const unsigned Next = RPtrace[i + 1];

    if (Cur > Prev)
      IsIncreasing = true;
    else if (Cur < Prev)
      IsIncreasing = false;

    const bool Flat = (Cur == Prev);
    if (Cur > Next && (Cur > Prev || (IsIncreasing && Flat)))
      Max.push_back(i);
    else if (Cur < Next && (Cur < Prev || (!IsIncreasing && Flat)))
      Min.push_back(i);
  }

  if (Max.size() <= 1 || Min.size() <= 1) {
    // Simple case, there is only a single group.
    for (auto Inst : *BB)
      GroupInfo[Inst] = 0;
  } else {
    // Multiple high/low pressure segments. We merge consecutive segments
    // without breaking rpe limit. E.g. given [A, B, C, D, E] with A < B >
    // C and C < D > E, B, D are local maximum and C is a local minimum.
    // We merge [A, B, C] with [C, D, E] when B + D - C <= THRESHOLD,
    // resulting a larger segment [A, B + D - C, E]. Approximately,
    // B + D - C bounds the maximal pressure in this merged segment [A, E].
    // Otherwise, do not merge [A, C] with [C, E], C ends the current group,
    // and starts a new group.
    //
    std::vector<unsigned> Segments;
    mergeSegments(RPtrace, Max, Min, Segments, GroupingPressureLimit);

    // Iterate segments and assign a group id to each instruction. The
    // instruction at a segment end still belongs to the group it closes.
    unsigned Pos = 0;
    unsigned Group = 0;
    for (auto Inst : *BB) {
      GroupInfo[Inst] = Group;
      if (Group < Segments.size() && Pos >= Segments[Group])
        ++Group;
      ++Pos;
    }
  }

  // Compute the priority of every node of the graph.
  auto &Nodes = ddd.getNodes();
  const unsigned NumNodes = (unsigned)Nodes.size();
  Priorities.resize(NumNodes, 0);
  for (unsigned i = 0; i < NumNodes; ++i)
    Priorities[i] = calculatePriority(Nodes[i]);

#if 0
    std::cerr << "\n\n";
    for (auto I = Nodes.rbegin(); I != Nodes.rend(); ++I) {
        std::cerr << "(GroupId, Priority, RPE) = ("
                  << std::setw(2)<< GroupInfo[(*I)->getInst()]
                  << ", " << std::setw(4) << Priorities[(*I)->getID()]
                  << ", " << std::setw(3)<< rp.getPressure((*I)->getInst())
                  << ") ";
        (*I)->getInst()->dump();
    }
    std::cerr << "\n\n";
#endif
}

// Compute the priority of N: the largest (successor priority + edge latency)
// over all successor edges, and at least 1. Nodes without an instruction have
// priority 0. As a side effect, the latency of every successor edge of N is
// set. A non-zero value already stored in Priorities for N is returned as is.
unsigned LatencyQueue::calculatePriority(preNode *N) {
  G4_INST *Inst = N->getInst();
  if (!Inst)
    return 0;

  vASSERT(N->getID() < Priorities.size());
  const unsigned Cached = Priorities[N->getID()];
  if (Cached > 0)
    return Cached;

  // Check if an edge leads from an instruction that writes a direct address
  // operand to a send (i.e. the edge is setting a0 operand for a send).
  auto feedsSendAddress = [](preNode *Def, preEdge &E) {
    G4_INST *DefInst = Def->getInst();
    if (!DefInst || !DefInst->getDst() || !DefInst->getDst()->isDirectAddress())
      return false;
    preNode *User = E.getNode();
    return User->getInst() && User->getInst()->isSend();
  };

  // Pseudo-kills never put latency on their edges.
  const bool MayHaveLatency = !Inst->isPseudoKill();

  unsigned Priority = 0;
  for (auto EI = N->succ_begin(), EE = N->succ_end(); EI != EE; ++EI) {
    auto &Edge = *EI;
    // Recurse on the successors.
    const unsigned SuccPriority = calculatePriority(Edge.getNode());

    unsigned Latency = 0;
    if (MayHaveLatency && Edge.isDataDep()) {
      // Only RAW, RAW_MEMORY and WAW edges carry the instruction latency.
      // A RAW edge that sets the address for a send keeps latency 0, which
      // moves address initializations close to sends.
      const DepType Ty = Edge.getType();
      const bool CarriesLatency = (Ty == RAW && !feedsSendAddress(N, Edge)) ||
                                  Ty == RAW_MEMORY || Ty == WAW;
      if (CarriesLatency)
        Latency = LT.getLatency(Inst);
    }

    Edge.setLatency(Latency);
    Priority = std::max(Priority, SuccPriority + Latency);
  }

  return std::max(1U, Priority);
}

// Compare two ready nodes and decide which one should be scheduled first.
// Return true if N2 has a higher priority than N1, false otherwise.
//
// Criteria, in decreasing importance: pseudo_kill handling, smaller group ID,
// send without return value, larger priority, send, larger node ID.
bool LatencyQueue::compareReady(preNode *N1, preNode *N2) {
  vASSERT(N1->getID() != N2->getID());
  vASSERT(N1->getInst() && N2->getInst());

  G4_INST *Inst1 = N1->getInst();
  G4_INST *Inst2 = N2->getInst();
  if (Inst1->isPseudoKill() || Inst2->isPseudoKill())
    return comparePseudoKill(N1, N2);

  // Group ID has higher priority, smaller ID means higher priority.
  const unsigned GID1 = GroupInfo[Inst1];
  const unsigned GID2 = GroupInfo[Inst2];
  if (GID1 != GID2)
    return GID1 > GID2;

  // Favor sends without return such as stores or urb-writes
  // because they likely release source registers.
  auto isSendNoReturn = [](G4_INST *Inst) {
    return Inst->isSend() &&
           (Inst->getDst() == nullptr || Inst->getDst()->isNullReg());
  };
  const bool NoReturn1 = isSendNoReturn(Inst1);
  const bool NoReturn2 = isSendNoReturn(Inst2);
  if (NoReturn1 != NoReturn2)
    return NoReturn2;

  // Within the same group, compare their priority.
  const unsigned P1 = Priorities[N1->getID()];
  const unsigned P2 = Priorities[N2->getID()];
  if (P1 != P2)
    return P2 > P1;

  // Favor sends.
  const bool IsSend1 = Inst1->isSend();
  const bool IsSend2 = Inst2->isSend();
  if (IsSend1 != IsSend2)
    return IsSend2;

  // Otherwise, break tie on ID.
  // Larger ID means higher priority.
  return N2->getID() > N1->getID();
}

// Ordering function of the hold-list. Return true if N2 has a higher priority
// than N1. Nodes are ordered by group ID first (smaller is better), then by
// ready cycle (smaller is better), then by node ID (larger is better).
bool LatencyQueue::compareHold(preNode *N1, preNode *N2) {
  vASSERT(N1->getID() != N2->getID());
  vASSERT(N1->getInst() && N2->getInst());

  // Group ID has higher priority, smaller ID means higher priority.
  const unsigned GID1 = GroupInfo[N1->getInst()];
  const unsigned GID2 = GroupInfo[N2->getInst()];
  if (GID1 != GID2)
    return GID1 > GID2;

  // Compare ready cycle, smaller ready cycle means higher priority.
  const unsigned Cycle1 = N1->getReadyCycle();
  const unsigned Cycle2 = N2->getReadyCycle();
  if (Cycle1 != Cycle2)
    return Cycle1 > Cycle2;

  // Otherwise, break tie on ID.
  // Larger ID means higher priority.
  return N2->getID() > N1->getID();
}

// Ordering used when at least one of the two nodes is a pseudo_kill.
// Return true if N2 has a higher priority than N1.
bool LatencyQueue::comparePseudoKill(preNode *N1, preNode *N2) const {
  vASSERT(N1->getID() != N2->getID());
  vASSERT(N1->getInst() && N2->getInst());

  const bool IsKill1 = N1->getInst()->isPseudoKill();
  const bool IsKill2 = N2->getInst()->isPseudoKill();
  vASSERT(IsKill1 || IsKill2);

  // Break tie when both are pseudo_kill and larger ID means higher priority.
  if (IsKill1 && IsKill2)
    return N2->getID() > N1->getID();

  // Make pseudo_kill higher priority: N2 wins exactly when it is the
  // pseudo_kill of the two.
  return IsKill2;
}

// Return the node at the end of the edge whose instruction has the smallest
// local ID. Two edges are only ordered when both end nodes carry an
// instruction; on ties the earlier edge in Elts is kept.
static preNode *minElt(const std::vector<preEdge> &Elts) {
  vASSERT(!Elts.empty());

  auto precedes = [](const preEdge &Lhs, const preEdge &Rhs) {
    G4_INST *LhsInst = Lhs.getNode()->getInst();
    G4_INST *RhsInst = Rhs.getNode()->getInst();
    return LhsInst && RhsInst && LhsInst->getLocalId() < RhsInst->getLocalId();
  };

  // Linear scan that replaces the current best only by a strictly
  // preceding edge; a single-element input yields its only element.
  auto Best = Elts.begin();
  for (auto It = Elts.begin() + 1, End = Elts.end(); It != End; ++It) {
    if (precedes(*It, *Best))
      Best = It;
  }
  return Best->getNode();
}

// Given an instruction stream [p1 p2 A1 A2 A4 A3]
// with dependency p1 -> {A1, A2}, p2-> {A3, A4},
// for p in {p1, p2}, we compute the earliest location to insert
// the pseudo kill p, p1<-A1, p2<-A4, and shuffle the stream to
// [p1 A1 A2 p2 A4 A3].
void BB_Scheduler::relocatePseudoKills() {
  // Reset local id after scheduling and build the location map.
  // Multiple pseudo-kills may be placed before a single instruction.
  std::unordered_map<G4_INST *, std::vector<G4_INST *>> LocMap;
  std::vector<G4_INST *> KillsWithoutUse;
  int i = 0;
  for (auto Inst : schedule) {
    Inst->setLocalId(i++);
  }

  G4_INST *LastBarrier = nullptr;
  for (auto N : ddd.getNodes()) {
    G4_INST *Inst = N->getInst();
    // All dangling pseudo-kills shall be placed before a barrier.
    if (preNode::isBarrier(Inst)) {
      if (LastBarrier && !KillsWithoutUse.empty()) {
        LocMap[LastBarrier].swap(KillsWithoutUse);
        vASSERT(KillsWithoutUse.empty());
      }
      LastBarrier = Inst;
    }

    if (Inst && Inst->isPseudoKill()) {
      preNode *Pos = minElt(N->Succs);
      while (Pos->getInst() && Pos->getInst()->isPseudoKill())
        Pos = minElt(Pos->Succs);

      if (Pos->getInst() == nullptr)
        KillsWithoutUse.push_back(Inst);
      else
        LocMap[Pos->getInst()].push_back(Inst);
    }
  }

  // Do nothing if there is no pseudo-kill.
  if (LocMap.empty() && KillsWithoutUse.empty())
    return;

  // Do relocation.
  std::vector<G4_INST *> relocated;
  relocated.reserve(schedule.size());
  for (auto Inst : schedule) {
    // pseudo-kills will be relocated.
    if (Inst->isPseudoKill())
      continue;
    auto I = LocMap.find(Inst);
    if (I != LocMap.end())
      relocated.insert(relocated.end(), I->second.begin(), I->second.end());
    relocated.push_back(Inst);
  }

  // Put remaining dangling pseudo-kills at the end of the block.
  if (!KillsWithoutUse.empty())
    relocated.insert(relocated.end(), KillsWithoutUse.begin(),
                     KillsWithoutUse.end());

  std::swap(schedule, relocated);
}

// Commit this scheduling if it is better.
//
// The new order in `schedule` is installed into the block (as is when
// IsTopDown, reversed otherwise), the register pressure is recomputed, and
// the result is either kept or the original order is restored.
//
// MaxRPE     in: current max pressure of the block; out: the new max
//            pressure when the schedule is committed.
// IsTopDown  whether `schedule` lists the instructions in program order.
// NumGrfs    GRF count used to derive the latency-hiding pressure threshold.
// Returns true if the schedule was committed.
bool BB_Scheduler::commitIfBeneficial(unsigned &MaxRPE, bool IsTopDown,
                                      unsigned NumGrfs) {
  INST_LIST &CurInsts = getBB()->getInstList();
  if (schedule.size() != CurInsts.size()) {
    SCHED_DUMP(std::cerr << "schedule reverted due to mischeduling.\n\n");
    return false;
  }

  // Nothing to commit when the schedule reproduces the current order.
  const bool Unchanged =
      IsTopDown
          ? std::equal(CurInsts.begin(), CurInsts.end(), schedule.begin())
          : std::equal(CurInsts.begin(), CurInsts.end(), schedule.rbegin());
  if (Unchanged) {
    SCHED_DUMP(std::cerr << "schedule not committed due to no change.\n\n");
    return false;
  }

  saveOriginalList();

  // Install the new order so that it can be evaluated.
  for (auto Inst : schedule) {
    if (IsTopDown)
      CurInsts.push_back(Inst);
    else
      CurInsts.push_front(Inst);
  }

  rp.recompute(getBB());
  const unsigned NewRPE = rp.getPressure(getBB());
  const unsigned LatencyPressureThreshold =
      getLatencyHidingThreshold(kernel, NumGrfs);

  if (config.UseLatency && IsTopDown) {
    // For hiding latency: accept as long as the pressure stays within the
    // latency-hiding threshold.
    if (NewRPE <= LatencyPressureThreshold) {
      SCHED_DUMP(std::cerr << "schedule committed for latency.\n\n");
      MaxRPE = NewRPE;
      return true;
    }
    SCHED_DUMP(std::cerr << "the pressure is increased to " << NewRPE
                         << "\n");
  } else if (NewRPE < MaxRPE) {
    // For reducing rpe: the reduction must reach the minimum benefit.
    const bool Worthwhile =
        (MaxRPE - NewRPE) * 100 >= PRESSURE_REDUCTION_MIN_BENEFIT * MaxRPE;
    if (!Worthwhile) {
      SCHED_DUMP(std::cerr << "the reduced pressure is " << MaxRPE - NewRPE
                           << "\n");
    } else {
      // It turns out that simd32 kernels may be scheduled like slicing, which
      // in general hurts latency hiding. If not insist to compile for
      // simd32, make rp reduction conservative.
      bool AbortOnSpill = kernel.getOptions()->getOption(vISA_AbortOnSpill);
      const bool Conservative = isSlicedSIMD32(kernel) && AbortOnSpill;
      if (!Conservative || NewRPE < LatencyPressureThreshold) {
        SCHED_DUMP(std::cerr
                   << "schedule committed with reduced pressure.\n\n");
        MaxRPE = NewRPE;
        return true;
      }
    }
  }

  SCHED_DUMP(rp.dump(getBB(), "schedule reverted, "));
  restoreOriginalList();
  return false;
}

// Implementation of preNode.
// Classify Inst as a scheduling barrier. A null instruction, a fence send,
// madm and dpas yield OPT_BARRIER; a label yields DEP_LABEL; an instruction
// with an indirect dst or src0..src2 operand yields INDIRECT_ADDR_BARRIER.
// Anything else is decided by CheckBarrier().
DepType preNode::checkBarrier(G4_INST *Inst) {
  if (Inst == nullptr)
    return DepType::OPT_BARRIER;

  if (Inst->isLabel())
    return DepType::DEP_LABEL;

  // Check if there is an indirect operand in this instruction.
  auto usesIndirectAddressing = [Inst]() {
    G4_DstRegRegion *Dst = Inst->getDst();
    if (Dst && Dst->isIndirect())
      return true;
    for (auto OpNum : {Opnd_src0, Opnd_src1, Opnd_src2}) {
      G4_Operand *Src = Inst->getOperand(OpNum);
      if (Src && Src->isSrcRegRegion() && Src->asSrcRegRegion()->isIndirect())
        return true;
    }
    return false;
  };
  if (usesIndirectAddressing())
    return DepType::INDIRECT_ADDR_BARRIER;

  const bool IsFence = Inst->isSend() && Inst->asSendInst()->isFence();
  if (IsFence || Inst->opcode() == G4_madm || Inst->isDpas())
    return DepType::OPT_BARRIER;

  return CheckBarrier(Inst);
}

void preNode::print(std::ostream &os) const {
  os << "ID: " << this->ID << "";
  if (Inst)
    Inst->emit(os);

  os << "Preds: ";
  for (auto &E : this->Preds)
    os << E.getNode()->ID << ",";
  os << "";

  os << "Succs: ";
  for (auto &E : this->Succs)
    os << E.getNode()->ID << ",";
  os << "\n";
}

// Debug helper: print this node to the standard error stream.
void preNode::dump() const {
  std::ostream &Err = std::cerr;
  print(Err);
}

// Implementation of preDDD.

// Build the data dependency bottom up with two simple
// special nodes.
void preDDD::buildGraph() {
  vASSERT(!IsDagBuilt);

  // Starts with the exit node.
  addNodeToGraph(&ExitNode);

  unsigned NumOfInsts = (unsigned)m_BB->size();
  SNodes.reserve(NumOfInsts);

  auto I = m_BB->rbegin(), E = m_BB->rend();
  for (unsigned i = 0; I != E; ++I) {
    preNode *N = new (preNodeAllocator) preNode(*I, i++);
    SNodes.push_back(N);
    addNodeToGraph(N);
  }

  // Ends with the entry node.
  addNodeToGraph(&EntryNode);

  // prune the graph.
  prune();

  // Set DAG is complete.
  IsDagBuilt = true;

  // Initialize perNode data.
  reset();
}

// Add node N to the dependence graph. Barrier nodes go through
// processBarrier(); all other nodes must carry an instruction and go through
// processSend() and processReadWrite(). The operands collected in NewLiveOps
// by those calls are then recorded as live, sends are appended to LiveSends,
// and a node that ended up with no successor edge is tied to the previous
// barrier.
void preDDD::addNodeToGraph(preNode *N) {
  NewLiveOps.clear();

  const DepType Barrier = N->getBarrier();
  if (Barrier == DepType::NODEP) {
    vISA_ASSERT(N->Inst, "not an instruction");
    processSend(N);
    processReadWrite(N);
  } else {
    processBarrier(N, Barrier);
  }

  // Adding live node should happen in the end, as illustrated below:
  // add X X 1
  // add Y X 2
  for (auto &LiveOp : NewLiveOps) {
    preNode *LiveN = std::get<0>(LiveOp);
    Gen4_Operand_Number OpNum = std::get<1>(LiveOp);
    G4_INST *LiveInst = LiveN->getInst();
    G4_Operand *Opnd = LiveInst->getOperand(OpNum);
    vASSERT(Opnd != nullptr);

    G4_Declare *Dcl = Opnd->getTopDcl();
    // Sources of the pseudo address-mov intrinsic are address expressions;
    // their declaration is taken from the referenced register variable.
    const bool IsAddrMovSrc = LiveInst->isPseudoAddrMovIntrinsic() &&
                              OpNum != Gen4_Operand_Number::Opnd_dst;
    if (IsAddrMovSrc)
      Dcl = Opnd->asAddrExp()->getRegVar()->getDeclare();

    vASSERT(Dcl || Opnd->isPhysicallyAllocatedRegVar());
    if (Dcl)
      LiveNodes[Dcl].emplace_back(LiveN, OpNum);
    if (Opnd->isPhysicallyAllocatedRegVar())
      LivePhysicalNodes.emplace_back(LiveN, OpNum);
  }

  // Update live nodes on sends.
  G4_INST *Inst = N->getInst();
  if (Inst != nullptr && Inst->isSend()) {
    vASSERT(!Inst->getMsgDesc()->isScratch());
    LiveSends.push_back(N);
  }

  // No explicit dependency found, so it depends on previous barrier.
  const bool NeedsBarrierEdge = N->succ_empty() && N != prevBarrier;
  if (NeedsBarrierEdge) {
    vISA_ASSERT(prevBarrier, "out of sync");
    addEdge(N, prevBarrier, prevBarrier->getBarrier());
  }
}

// Handle a barrier node. Every live node, live physical node and live send
// that has no predecessor yet gets an edge from curNode with type Dep, and all
// three live lists are emptied. curNode then becomes the previous barrier,
// and its own operands (when it has an instruction) are queued in NewLiveOps.
void preDDD::processBarrier(preNode *curNode, DepType Dep) {
  // Add an edge curNode -> Live only when Live has no predecessor so far.
  auto linkIfNoPred = [this, curNode, Dep](preNode *Live) {
    if (Live->pred_empty())
      addEdge(curNode, Live, Dep);
  };

  // A barrier kills all live nodes, so add dependency edge to all live
  // nodes and clear.
  for (auto &Entry : LiveNodes) {
    for (LiveNode &Live : Entry.second)
      linkIfNoPred(Live.N);
    Entry.second.clear();
  }

  for (auto &Live : LivePhysicalNodes)
    linkIfNoPred(Live.N);
  LivePhysicalNodes.clear();

  for (auto Send : LiveSends)
    linkIfNoPred(Send);
  LiveSends.clear();

  // Add an edge when there is no edge on previous barrier.
  if (prevBarrier != nullptr)
    linkIfNoPred(prevBarrier);
  prevBarrier = curNode;

  G4_INST *Inst = curNode->getInst();
  if (Inst == nullptr)
    return;

  // Queue every operand of the barrier instruction that names a register.
  for (auto OpNum :
       {Gen4_Operand_Number::Opnd_dst, Gen4_Operand_Number::Opnd_src0,
        Gen4_Operand_Number::Opnd_src1, Gen4_Operand_Number::Opnd_src2,
        Gen4_Operand_Number::Opnd_src3, Gen4_Operand_Number::Opnd_pred,
        Gen4_Operand_Number::Opnd_condMod, Gen4_Operand_Number::Opnd_implAccSrc,
        Gen4_Operand_Number::Opnd_implAccDst}) {
    G4_Operand *Opnd = Inst->getOperand(OpNum);
    const bool IsTracked =
        Opnd != nullptr && Opnd->getBase() != nullptr && !Opnd->isNullReg();
    if (IsTracked)
      NewLiveOps.emplace_back(curNode, OpNum);
  }
}

// Conditionally remove the element at Iter from Elts without preserving order.
// - When pred is false nothing is removed and the iterator after Iter is
//   returned.
// - When pred is true the element is overwritten by the tail (unless it is the
//   tail itself) and the tail is popped. The returned iterator designates the
//   next element to visit: Iter itself after a swap, or end() when the tail
//   was removed.
template <typename T, typename AllocTy>
static typename std::vector<T, AllocTy>::iterator
kill_if(bool pred, std::vector<T, AllocTy> &Elts,
        typename std::vector<T, AllocTy>::iterator Iter) {
  if (pred) {
    vASSERT(Iter != Elts.end());
    vASSERT(!Elts.empty());

    const bool IsTail = std::next(Iter) == Elts.end();
    // Move the tail into the vacated slot; Iter stays valid because only the
    // last element is destroyed by pop_back().
    if (!IsTail)
      std::swap(*Iter, Elts.back());
    Elts.pop_back();
    return IsTail ? Elts.end() : Iter;
  }

  return std::next(Iter);
}

// Classify the dependence between Opnd and live node LN purely from who reads
// and who writes. Opnd counts as a write when it is a destination region or a
// conditional modifier.
//   live reads,  Opnd reads  -> NODEP
//   live reads,  Opnd writes -> RAW
//   live writes, Opnd reads  -> WAR
//   live writes, Opnd writes -> WAW
static DepType getDep(G4_Operand *Opnd, const preDDD::LiveNode &LN) {
  const bool LiveWrites = LN.isWrite();
  const bool OpndWrites = Opnd->isDstRegRegion() || Opnd->isCondMod();
  if (LiveWrites)
    return OpndWrites ? DepType::WAW : DepType::WAR;
  return OpndWrites ? DepType::RAW : DepType::NODEP;
}

// Compare Opnd against the operand held by live node LN and return the
// dependence type together with the operand relation.
// - When the operands are not disjoint, the incoming Dep is returned unchanged
//   along with the computed relation.
// - When they are disjoint but both select the same (defined) acc register,
//   a WAW/WAR/RAW dependence is reported with Rel_interfere.
// - Otherwise NODEP is returned with the disjoint relation.
static std::pair<DepType, G4_CmpRelation>
getDepAndRel(G4_Operand *Opnd, const preDDD::LiveNode &LN, DepType Dep) {
  G4_INST *LiveInst = LN.N->getInst();
  G4_Operand *Other = LiveInst->getOperand(LN.OpNum);
  vASSERT(Other != nullptr);
  if (Other == nullptr)
    return std::make_pair(Dep, G4_CmpRelation::Rel_undef);

  // Dispatch on the static kind of Opnd so that the matching compareOperand
  // is called.
  const IR_Builder &builder = LiveInst->getBuilder();
  G4_CmpRelation Rel = G4_CmpRelation::Rel_undef;
  if (Opnd->isDstRegRegion())
    Rel = Opnd->asDstRegRegion()->compareOperand(Other, builder);
  else if (Opnd->isCondMod())
    Rel = Opnd->asCondMod()->compareOperand(Other, builder);
  else if (Opnd->isSrcRegRegion())
    Rel = Opnd->asSrcRegRegion()->compareOperand(Other, builder);
  else if (Opnd->isPredicate())
    Rel = Opnd->asPredicate()->compareOperand(Other, builder);
  else
    Rel = Opnd->compareOperand(Other, builder);

  if (Rel != G4_CmpRelation::Rel_disjoint)
    return std::make_pair(Dep, Rel);

  // Disjoint registers may still conflict through a shared acc register.
  // NOACC is folded into ACC_UNDEFINED before comparing.
  auto normalizedAcc = [](G4_Operand *Op) {
    G4_AccRegSel Sel = Op->getAccRegSel();
    return Sel == G4_AccRegSel::NOACC ? G4_AccRegSel::ACC_UNDEFINED : Sel;
  };
  const G4_AccRegSel AccOpnd = normalizedAcc(Opnd);
  const G4_AccRegSel AccOther = normalizedAcc(Other);

  const bool SharesAcc =
      AccOther == AccOpnd && AccOther != G4_AccRegSel::ACC_UNDEFINED;
  if (SharesAcc) {
    // While comparing V3:Acc2 to V4:Acc2, we cannot kill this live
    // node, as there is no overlap on V3 and V4. So only returns
    // Rel_interfere relation, not Rel_eq.
    const bool LiveWrites = LN.isWrite();
    const bool OpndIsDst = Opnd->isDstRegRegion();
    if (LiveWrites)
      return std::make_pair(OpndIsDst ? DepType::WAW : DepType::WAR,
                            Rel_interfere);
    if (OpndIsDst)
      return std::make_pair(DepType::RAW, Rel_interfere);
  }

  // No dependency.
  return std::make_pair(DepType::NODEP, Rel);
}

// Add dependence edges for the source operand OpNum of curNode against the
// live writes of declaration Dcl (when Dcl is non-null) and, if the operand is
// a physically allocated register variable, against the live physical nodes.
// The operand is then queued in NewLiveOps so that it becomes live itself.
void preDDD::addSrcOpndDep(preNode *curNode, G4_Declare *Dcl,
                           Gen4_Operand_Number OpNum) {
  G4_Operand *SrcOpnd = curNode->getInst()->getOperand(OpNum);

  // Link curNode to every write in LiveList that the operand depends on.
  // Live reads are skipped.
  auto linkToLiveWrites = [this, curNode, SrcOpnd](auto &LiveList) {
    for (auto &Live : LiveList) {
      if (Live.isRead())
        continue;
      const DepType Base = getDep(SrcOpnd, Live);
      if (Base == DepType::NODEP)
        continue;
      const DepType Final = getDepAndRel(SrcOpnd, Live, Base).first;
      if (Final != DepType::NODEP)
        addEdge(curNode, Live.N, Final);
    }
  };

  // Live nodes associated to the same declaration.
  if (Dcl)
    linkToLiveWrites(LiveNodes[Dcl]);

  // If this is a physically allocated regvar, then check dependency on the
  // physically allocated live nodes. This should be a cold path.
  if (SrcOpnd->isPhysicallyAllocatedRegVar())
    linkToLiveWrites(LivePhysicalNodes);

  NewLiveOps.emplace_back(curNode, OpNum);
}

// Add the dependence edges introduced by a node that is neither a label nor a
// barrier.
//
// Written operands (dst, condMod, implicit acc dst) are processed first: each
// is checked against the live nodes of its declaration and, for physically
// allocated register variables, against the live physical nodes. A live node
// fully covered by the write (relation Rel_eq or Rel_gt) is killed. Source
// operands are then handled by addSrcOpndDep(). Every processed operand is
// queued in NewLiveOps.
void preDDD::processReadWrite(preNode *curNode) {
  G4_INST *Inst = curNode->getInst();

  // Add edges from curNode to the entries of LiveList that Opnd depends on,
  // removing the entries that Opnd kills. The edge carries the dependence
  // type reported by getDepAndRel(), which may differ from getDep()'s result
  // when the dependence comes from a shared acc register; this matches what
  // addSrcOpndDep() does.
  auto addDepsAndKill = [this, curNode](G4_Operand *Opnd, auto &LiveList) {
    for (auto Iter = LiveList.begin(); Iter != LiveList.end(); /*empty*/) {
      auto &Live = *Iter;
      const DepType Dep = getDep(Opnd, Live);
      if (Dep == DepType::NODEP) {
        ++Iter;
        continue;
      }
      const auto DepRel = getDepAndRel(Opnd, Live, Dep);
      if (DepRel.first == DepType::NODEP) {
        ++Iter;
        continue;
      }
      addEdge(curNode, Live.N, DepRel.first);
      // Check if this kills current live node. If yes, remove it.
      const bool Kills = DepRel.second == G4_CmpRelation::Rel_eq ||
                         DepRel.second == G4_CmpRelation::Rel_gt;
      Iter = kill_if(Kills, LiveList, Iter);
    }
  };

  for (auto OpNum :
       {Gen4_Operand_Number::Opnd_dst, Gen4_Operand_Number::Opnd_condMod,
        Gen4_Operand_Number::Opnd_implAccDst}) {
    G4_Operand *opnd = Inst->getOperand(OpNum);
    if (opnd == nullptr || opnd->getBase() == nullptr || opnd->isNullReg())
      continue;
    vASSERT(opnd->getTopDcl() || opnd->isPhysicallyAllocatedRegVar());

    // Live nodes associated to the same declaration.
    if (G4_Declare *Dcl = opnd->getTopDcl())
      addDepsAndKill(opnd, LiveNodes[Dcl]);

    // If this is a physically allocated regvar, then check dependency on the
    // physically allocated live nodes. This should be a cold path.
    if (opnd->isPhysicallyAllocatedRegVar())
      addDepsAndKill(opnd, LivePhysicalNodes);

    NewLiveOps.emplace_back(curNode, OpNum);
  }

  if (Inst->isPseudoAddrMovIntrinsic()) {
    // Sources of the pseudo address-mov intrinsic are address expressions;
    // the declaration comes from the referenced register variable.
    for (auto OpNum :
         {Gen4_Operand_Number::Opnd_src0, Gen4_Operand_Number::Opnd_src1,
          Gen4_Operand_Number::Opnd_src2, Gen4_Operand_Number::Opnd_src3,
          Gen4_Operand_Number::Opnd_src4, Gen4_Operand_Number::Opnd_src5,
          Gen4_Operand_Number::Opnd_src6, Gen4_Operand_Number::Opnd_src7}) {
      G4_Operand *opnd = Inst->getOperand(OpNum);
      if (opnd == nullptr || opnd->isNullReg())
        continue;
      G4_Declare *Dcl = opnd->asAddrExp()->getRegVar()->getDeclare();
      addSrcOpndDep(curNode, Dcl, OpNum);
    }
  } else {
    for (auto OpNum :
         {Gen4_Operand_Number::Opnd_src0, Gen4_Operand_Number::Opnd_src1,
          Gen4_Operand_Number::Opnd_src2, Gen4_Operand_Number::Opnd_src3,
          Gen4_Operand_Number::Opnd_pred,
          Gen4_Operand_Number::Opnd_implAccSrc}) {
      G4_Operand *opnd = Inst->getOperand(OpNum);
      if (opnd == nullptr || opnd->getBase() == nullptr || opnd->isNullReg())
        continue;
      G4_Declare *Dcl = opnd->getTopDcl();
      vASSERT(Dcl || opnd->isPhysicallyAllocatedRegVar());
      addSrcOpndDep(curNode, Dcl, OpNum);
    }
  }
}

// For a send instruction, add memory-dependence edges from curNode to the live
// sends it depends on (as reported by getDepSend). A live send is removed from
// LiveSends when the dependence is WAW_MEMORY or RAW_MEMORY. Nodes that are
// not sends are ignored.
void preDDD::processSend(preNode *curNode) {
  G4_INST *Inst = curNode->getInst();
  if (!Inst->isSend())
    return;

  vISA_ASSERT(!Inst->getMsgDesc()->isScratch(), "not expected");

  auto Iter = LiveSends.begin();
  while (Iter != LiveSends.end()) {
    preNode *LiveSend = *Iter;
    const DepType Dep = getDepSend(Inst, LiveSend->getInst(), BTIIsRestrict);
    if (Dep == DepType::NODEP) {
      ++Iter;
      continue;
    }

    addEdge(curNode, LiveSend, Dep);
    // Check if this kills current live send. If yes, remove it.
    const bool Kills =
        Dep == DepType::WAW_MEMORY || Dep == DepType::RAW_MEMORY;
    Iter = kill_if(Kills, LiveSends, Iter);
  }
}

void preDDD::prune() {
  auto removeEdge = [=](preNode *pred, preNode *succ) {
    auto Iter = std::find_if(pred->succ_begin(), pred->succ_end(),
                             [=](preEdge &E) { return E.getNode() == succ; });
    if (Iter == pred->succ_end())
      return;
    kill_if(true, pred->Succs, Iter);
    Iter = std::find_if(succ->pred_begin(), succ->pred_end(),
                        [=](preEdge &E) { return E.getNode() == pred; });
    vASSERT(Iter != succ->pred_end());
    kill_if(true, succ->Preds, Iter);
  };

  // Currently only prune up to two levels.
  for (auto N : SNodes) {
    std::set<preNode *> Seen;
    for (auto &E1 : N->Succs)
      for (auto &E2 : E1.getNode()->Succs)
        Seen.insert(E2.getNode());

    for (auto T : Seen)
      removeEdge(N, T);
  }
}

// Reset states that a scheduler may overwrite.
void preDDD::reset(bool ReassignNodeID) {
  if (!IsDagBuilt)
    buildGraph();

  // When instructcions are reordered, the node IDs may become
  // inconsistent. This is to ensure the following internal consistency:
  //
  // I0     N2
  // I1 ==> N1
  // I2     N0
  //
  // SNodes = { N0, N1, N2}
  //
  // as IDs may be used in node comparison.
  //
  if (ReassignNodeID) {
    m_BB->resetLocalIds();
    auto Cmp = [](const preNode *LHS, const preNode *RHS) {
      return LHS->Inst->getLocalId() > RHS->Inst->getLocalId();
    };
    std::sort(SNodes.begin(), SNodes.end(), Cmp);
    unsigned Id = 0;
    for (auto N : SNodes) {
      N->ID = Id++;
    }
  }

  auto isHalfN = [](G4_INST *Inst, unsigned N) -> bool {
    return Inst->isSend() && Inst->getExecSize() == g4::SIMD16 &&
           Inst->getMaskOffset() == N * 16;
  };

  auto isQuadN = [](G4_INST *Inst, unsigned N) -> bool {
    return Inst->isSend() && Inst->getExecSize() == g4::SIMD8 &&
           Inst->getMaskOffset() == N * 8;
  };

  auto isTupleLead = [&isHalfN, &isQuadN](G4_INST *Inst) -> bool {
    return isHalfN(Inst, 1) || isQuadN(Inst, 3);
  };

  auto isTuplePart = [&isHalfN, &isQuadN](G4_INST *Inst) -> bool {
    return isHalfN(Inst, 0) || isHalfN(Inst, 1) || isQuadN(Inst, 0) ||
           isQuadN(Inst, 1) || isQuadN(Inst, 2) || isQuadN(Inst, 3);
  };

  // Mark SIMD32 send tuples.
  preNode *Lead = nullptr;
  for (auto N : SNodes) {
    G4_INST *Inst = N->getInst();
    if (!Inst)
      continue;
    if (isTupleLead(Inst)) {
      // In case, send tuples are interleaved, bail out.
      if (Lead)
        break;
      Lead = N;
      N->setTupleLead(Lead);
      continue;
    }
    if (Lead && isTuplePart(Inst)) {
      N->setTupleLead(Lead);
      if (Inst->getMaskOffset() == 0)
        Lead = nullptr;
      continue;
    }
    // This send is neither a lead nor a part, which means this block is
    // already sliced. Bail out.
    if (Lead && Inst->isSend())
      break;
  }

  for (auto N : SNodes) {
    N->NumPredsLeft = unsigned(N->Preds.size());
    N->NumSuccsLeft = unsigned(N->Succs.size());
    N->isScheduled = false;
    N->setReadyCycle(0);
    N->isClustered = false;
    N->isClusterLead = false;
  }

  EntryNode.NumPredsLeft = 0;
  EntryNode.NumSuccsLeft = unsigned(EntryNode.Succs.size());
  EntryNode.isScheduled = false;
  EntryNode.setReadyCycle(0);
  EntryNode.isClustered = false;
  EntryNode.isClusterLead = false;

  ExitNode.NumPredsLeft = unsigned(ExitNode.Preds.size());
  ExitNode.NumSuccsLeft = 0;
  ExitNode.isScheduled = false;
  ExitNode.setReadyCycle(0);
  ExitNode.isClustered = false;
  ExitNode.isClusterLead = false;
}

// Dump the DAG of this block as text into the file
// "<asm-file-name>.bb<block-id>.preDDD.txt". Three sections are written, each
// with one line per node in SNodes:
//   NodeInfo, <id>, <dst-size>, <instruction text>
//   PredInfo, <id>[, <pred-id>]...
//   UseInfo, <id>[, <id of a pred reached through a RAW or WAW edge>]...
void preDDD::dumpDagTxt(RegisterPressure &rp) {
  const char *asmFileName = "nullasm";
  getOptions()->getOption(VISA_AsmFileName, asmFileName);
  const std::string fileName = std::string(asmFileName) + ".bb" +
                               std::to_string(getBB()->getId()) +
                               ".preDDD.txt";
  std::fstream ofile(fileName, std::ios::out);

  std::vector<unsigned> LiveOutNodeIDs;
  std::set<vISA::G4_Declare *> LiveOutSet;

  // 1) dump node-id, dst-size, instruction, ...
  // nodes are ordered bottom-up from block exit
  for (auto N : SNodes) {
    ofile << "NodeInfo, " << N->ID << ", ";
    G4_INST *Inst = N->getInst();
    if (!Inst) {
      ofile << "0, "
            << "null\n";
      continue;
    }

    // dst-size: byte size of the destination's top declaration, adjusted
    // against twice its sub-register alignment value; 0 for pseudo-kills and
    // for instructions without such a declaration.
    unsigned dclSize = 0;
    G4_DstRegRegion *Dst = Inst->getDst();
    if (!Inst->isPseudoKill() && Dst && Dst->getTopDcl()) {
      auto rootDcl = Dst->getTopDcl();
      dclSize = rootDcl->getByteSize();
      auto alignBytes = static_cast<uint32_t>(rootDcl->getSubRegAlign()) * 2;
      if (dclSize < alignBytes)
        dclSize = std::min(dclSize * 2, alignBytes);
      // first time seeing a live-out variable, record the node-id
      if (rp.isLiveOut(m_BB, rootDcl) && LiveOutSet.insert(rootDcl).second)
        LiveOutNodeIDs.push_back(N->ID);
    }
    ofile << dclSize << ", ";

    // inst text
    Inst->emit(ofile);
    ofile << "\n";
  }

  // 2) dump node-id then predecessor-ids
  for (auto N : SNodes) {
    ofile << "PredInfo, " << N->ID;
    for (auto &E : N->Preds)
      ofile << ", " << E.getNode()->ID;
    ofile << "\n";
  }

  // 3) dump node-id then use-ids
  for (auto N : SNodes) {
    ofile << "UseInfo, " << N->ID;
    // NOTE(review): the graph builders in this file never put ExitNode into
    // SNodes, so this branch looks unreachable -- confirm.
    if (N == &ExitNode) {
      for (auto OutID : LiveOutNodeIDs)
        ofile << ", " << OutID;
      ofile << "\n";
      continue;
    }

    for (auto &E : N->Preds) {
      auto DefInst = E.getNode()->getInst();
      if (!DefInst || DefInst->isPseudoKill())
        continue;
      const DepType depType = E.getType();
      if (depType == RAW || depType == WAW)
        ofile << ", " << E.getNode()->ID;
    }
    ofile << "\n";
  }

  ofile.close();
}

void preDDD::dumpDagDot() {
  const char *asmFileName = "nullasm";
  getOptions()->getOption(VISA_AsmFileName, asmFileName);
  std::string fileName(asmFileName);
  fileName.append(".bb")
      .append(std::to_string(getBB()->getId()))
      .append(".preDDD.dot");

  std::fstream ofile(fileName, std::ios::out);
  ofile << "digraph DAG {"
        << "\n";

  for (auto N : SNodes) {
    // Node
    ofile << N->ID << "[label=\"";
    N->getInst()->emit(ofile);
    ofile << ", I" << N->getInst()->getLocalId() << "\"]\n";
    // Edge
    for (auto &E : N->Succs) {
      DepType depType = E.getType();
      const char *depColor, *depStr;
      std::tie(depColor, depStr) = (depType == RAW || depType == RAW_MEMORY)
                                       ? std::make_tuple("black", "RAW")
                                   : (depType == WAR || depType == WAR_MEMORY)
                                       ? std::make_tuple("red", "WAR")
                                   : (depType == WAW || depType == WAW_MEMORY)
                                       ? std::make_tuple("orange", "WAW")
                                       : std::make_tuple("grey", "other");

      // Example: 30->34[label="RAW",color="{red|black|yellow}"];
      ofile << N->ID << "->" << E.getNode()->ID << "[label=\"" << depStr << "\""
            << ",color=\"" << depColor << "\""
            << "];\n";
    }
  }

  ofile << "}\n";
  ofile.close();
}

namespace {
// Ready queue used by the ACC scheduler. Sethi-Ullman numbers are computed for
// all nodes of the graph on construction; pop() hands out the ready node that
// compare() ranks highest.
class SethiUllmanACCQueue {
  // The dependence graph the queue works on.
  preDDD &ddd;

  // Sethi-Ullman numbers, indexed by node ID.
  std::vector<unsigned> Numbers;

  // Nodes that are currently ready, in no particular order.
  std::vector<preNode *> Q;

public:
  SethiUllmanACCQueue(preDDD &Graph, G4_Kernel *K) : ddd(Graph) {
    init(K);
  }

  // Add a new ready node.
  void push(preNode *Node) {
    Q.push_back(Node);
  }

  // Remove and return the node to schedule next.
  preNode *pop() {
    return select();
  }

  bool empty() const {
    return Q.empty();
  }

  // Sethi-Ullman number stored at index Idx (no bounds check).
  unsigned getNumber(unsigned Idx) {
    return Numbers[Idx];
  }

private:
  // Compute the Sethi-Ullman number for a node.
  unsigned calculateSethiUllmanNumberForACC(preNode *N, G4_Kernel *kernel);

  // Initialize Sethi-Ullman numbers.
  void init(G4_Kernel *kernel);

  // Select next ready node to schedule.
  preNode *select();

  // Compare two ready nodes and decide which one should be scheduled first.
  // Return true if N2 has a higher priority than N1, false otherwise.
  bool compare(preNode *N1, preNode *N2);
};

// Schedules the instructions of a single block to increase the ACC
// substitution ratio.
class BB_ACC_Scheduler {
  // The kernel this block belongs to.
  G4_Kernel &kernel;

  // The data dependency graph for this block.
  preDDD &ddd;

  // The schedule result, in the order the nodes were picked.
  std::vector<G4_INST *> schedule;

public:
  BB_ACC_Scheduler(G4_Kernel &K, preDDD &Graph) : kernel(K), ddd(Graph) {}

  G4_Kernel &getKernel() const {
    return kernel;
  }

  G4_BB *getBB() const {
    return ddd.getBB();
  }

  // Run Sethi-Ullman scheduling.
  void scheduleBlockForACC() {
    SethiUllmanACCScheduling();
  }

  // Commit the scheduling result.
  void commit();

private:
  void SethiUllmanACCScheduling();
  bool verifyScheduling();
};

} // namespace

//
// Basic generalized SU scheduling algorithm
//
void BB_ACC_Scheduler::SethiUllmanACCScheduling() {
  schedule.clear();
  SethiUllmanACCQueue Q(ddd, &kernel);
  Q.push(ddd.getExitNode());

  while (!Q.empty()) {
    preNode *N = Q.pop();
    vASSERT(!N->isScheduled && N->NumSuccsLeft == 0);
    if (N->getInst()) {
      VISA_DEBUG_VERBOSE({
        std::cerr << "SU[" << Q.getNumber(N->getID()) << "]:";
        N->dump();
      });
      schedule.push_back(N->getInst());
      N->isScheduled = true;
    }

    for (auto I = N->pred_begin(), E = N->pred_end(); I != E; ++I) {
      preNode *Node = I->getNode();
      vASSERT(!Node->isScheduled && Node->NumSuccsLeft);
      --Node->NumSuccsLeft;
      if (Node->NumSuccsLeft == 0)
        Q.push(Node);
    }
  }

  vASSERT(verifyScheduling());
}

void BB_ACC_Scheduler::commit() {
  INST_LIST &CurInsts = getBB()->getInstList();
  CurInsts.clear();

  // move the scheduled instruction to the instruction list.
  for (auto Inst : schedule) {
    CurInsts.push_front(Inst);
  }

  return;
}

bool BB_ACC_Scheduler::verifyScheduling() {
  std::set<G4_INST *> Insts;
  for (auto Inst : *(getBB()))
    Insts.insert(Inst);

  if (Insts.size() != schedule.size()) {
    return false;
  }

  for (auto Inst : schedule) {
    if (Insts.count(Inst) != 1) {
      Inst->dump();
      return false;
    }
  }

  return true;
}

// Bind the scheduler to kernel k and cache the kernel's options.
preRA_ACC_Scheduler::preRA_ACC_Scheduler(G4_Kernel &k)
    : kernel(k), m_options(kernel.getOptions()) {
}

// Nothing to release explicitly.
preRA_ACC_Scheduler::~preRA_ACC_Scheduler() = default;

// Run ACC scheduling followed by ACC substitution on every block of the kernel
// whose size lies within [SMALL_BLOCK_SIZE, LARGE_BLOCK_SIZE]. Always returns
// true.
bool preRA_ACC_Scheduler::run() {
  AccSubPass accSub(*kernel.fg.builder, kernel);

  for (auto bb : kernel.fg) {
    // Skip small and large blocks.
    const auto NumInsts = bb->size();
    const bool InRange =
        NumInsts >= SMALL_BLOCK_SIZE && NumInsts <= LARGE_BLOCK_SIZE;
    if (!InRange)
      continue;

    preDDD ddd(kernel, bb);
    // NOTE(review): config is not used below; kept because the effects of
    // constructing it are not visible here.
    SchedConfig config(0);
    BB_ACC_Scheduler S(kernel, ddd);

    // Build the graph, reorder the block, then substitute ACC registers.
    ddd.buildGraphForACC();
    S.scheduleBlockForACC();
    S.commit();
    accSub.doAccSub(bb);
  }

  return true;
}

// Base degrees used when computing Sethi-Ullman numbers for ACC scheduling: a
// node that is itself an ACC candidate, a node with at least one ACC-candidate
// RAW predecessor, and any other node. A smaller number means the node is
// scheduled earlier in the bottom-up walk. Typed constants are used instead of
// macros, matching the tuning constants at the top of this file.
static const unsigned ACC_DEF_NODE_DEGREE = 1;
static const unsigned ACC_USE_NODE_DEGREE = 5;
static const unsigned NONE_ACC_NODE_DEGREE = 20;

//
// Generalizations of the Sethi-Ullman algorithm for register allocation
//
// Returns the Sethi-Ullman number of N. A non-zero entry already stored in
// Numbers for N's ID is returned as is. Otherwise the number is the node's own
// degree (ACC_DEF_NODE_DEGREE for an ACC candidate, ACC_USE_NODE_DEGREE for a
// node with at least one ACC-candidate RAW predecessor, NONE_ACC_NODE_DEGREE
// for the rest) plus the smallest number among its predecessors that carry an
// instruction. The result is not written to Numbers here; init() does that.
//
unsigned
SethiUllmanACCQueue::calculateSethiUllmanNumberForACC(preNode *N,
                                                      G4_Kernel *kernel) {
  vASSERT(N->getID() < Numbers.size());
  unsigned CurNum = Numbers[N->getID()];
  if (CurNum != 0)
    return CurNum;

  unsigned accPredNum = 0;
  // Only the minimum over the predecessors is needed, so it is tracked while
  // walking the edges instead of collecting and sorting all values.
  bool HasPred = false;
  unsigned MinPredNum = 0;
  for (auto I = N->pred_begin(), E = N->pred_end(); I != E; ++I) {
    preNode *predNode = I->getNode();

    // Skip predecessors that carry no instruction.
    if (!predNode->getInst())
      continue;

    if (predNode->isACCCandidate() && I->getType() == DepType::RAW)
      ++accPredNum;

    // Recurse on the predecessors.
    const unsigned Num = calculateSethiUllmanNumberForACC(predNode, kernel);
    if (!HasPred || Num < MinPredNum) {
      MinPredNum = Num;
      HasPred = true;
    }
  }

  // If current node is not ACC candidate, but its predecessor is, reduce the
  // degree to be scheduled first.
  CurNum = N->isACCCandidate() ? ACC_DEF_NODE_DEGREE
           : accPredNum        ? ACC_USE_NODE_DEGREE
                               : NONE_ACC_NODE_DEGREE;

  // Add the minimal degree of the pred nodes.
  if (HasPred)
    CurNum += MinPredNum;

  return CurNum;
}

// Compute the Sethi-Ullman number of every node of the graph, walking the node
// list from its last entry down to its first, and store each result at the
// node's index in Numbers.
void SethiUllmanACCQueue::init(G4_Kernel *kernel) {
  auto &Nodes = ddd.getNodes();
  const unsigned Count = (unsigned)Nodes.size();
  Numbers.resize(Count, 0);

  for (unsigned Idx = Count; Idx-- > 0;)
    Numbers[Idx] = calculateSethiUllmanNumberForACC(Nodes[Idx], kernel);

  // Verbose dump: one line per node, last node first, with its number and
  // whether it is an ACC candidate.
  VISA_DEBUG_VERBOSE({
    std::cerr << "\n\n";
    for (auto RI = Nodes.rbegin(); RI != Nodes.rend(); ++RI) {
      preNode *Node = *RI;
      std::cerr << "SU[" << Numbers[Node->getID()] << "] "
                << (Node->isACCCandidate() ? "ACC " : "GRF ");
      Node->dump();
    }
    std::cerr << "\n\n";
  });
}

// Decide which of two distinct ready nodes should be scheduled first.
// Returns true when N2 has a higher priority than N1, false otherwise:
// - N1 always wins when it is a pseudo kill;
// - otherwise the node with the smaller Sethi-Ullman number wins;
// - on a tie the node with the smaller ID wins.
bool SethiUllmanACCQueue::compare(preNode *N1, preNode *N2) {
  // TODO. Introduce heuristics before comparing SU numbers.
  const auto Id1 = N1->getID();
  const auto Id2 = N2->getID();
  vASSERT(Id1 < Numbers.size());
  vASSERT(Id2 < Numbers.size());
  vASSERT(Id1 != Id2);

  // Pseudo kill always has higher priority.
  if (N1->getInst()->isPseudoKill())
    return false;

  // This is a bottom-up scheduling. Smaller SU number means higher priority.
  const unsigned SU1 = Numbers[Id1];
  const unsigned SU2 = Numbers[Id2];
  if (SU1 != SU2)
    return SU1 > SU2;

  // Otherwise, break tie with their IDs. Smaller ID means higher priority.
  return Id1 > Id2;
}

// Remove and return the highest-priority node of the ready queue. The queue
// must not be empty. The first entry is the initial pick; each later entry
// replaces it when compare() ranks that entry higher. The chosen slot is
// filled with the last entry, so the queue order is not preserved.
preNode *SethiUllmanACCQueue::select() {
  vASSERT(!Q.empty());

  size_t BestIdx = 0;
  for (size_t Idx = 1; Idx < Q.size(); ++Idx) {
    if (compare(Q[BestIdx], Q[Idx]))
      BestIdx = Idx;
  }
  vASSERT(BestIdx < Q.size());

  preNode *Top = Q[BestIdx];
  std::swap(Q[BestIdx], Q.back());
  Q.pop_back();
  return Top;
}

// Build the data dependency bottom up with two simple
// special nodes.
void preDDD::buildGraphForACC() {
  vASSERT(!IsDagBuilt);

  // Starts with the exit node.
  addNodeToGraph(&ExitNode);

  unsigned NumOfInsts = (unsigned)m_BB->size();
  SNodes.reserve(NumOfInsts);

  auto I = m_BB->rbegin(), E = m_BB->rend();
  for (unsigned i = 0; I != E; ++I) {
    preNode *N = new (preNodeAllocator) preNode(*I, i++);
    SNodes.push_back(N);
    if ((*I)->isSend()) {
      N->setBarrier(DepType::SEND_BARRIER);
    }
    addNodeToGraph(N);
    if ((*I)->canInstBeAcc(&kernel.fg.globalOpndHT)) {
      N->setACCCandidate();
    }
  }

  // Ends with the entry node.
  addNodeToGraph(&EntryNode);

  // prune the graph.
  prune();

  // Set DAG is complete.
  IsDagBuilt = true;

  // Initialize perNode data.
  reset();
}
