root/lj_opt_fold.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. LJFOLD
  2. LJFOLD
  3. LJFOLD
  4. LJFOLD
  5. LJFOLD
  6. LJFOLD
  7. kfold_intop
  8. LJFOLD
  9. LJFOLD
  10. LJFOLD
  11. LJFOLD
  12. LJFOLD
  13. LJFOLD
  14. kfold_int64arith
  15. LJFOLD
  16. LJFOLD
  17. LJFOLD
  18. LJFOLD
  19. LJFOLD
  20. LJFOLD
  21. LJFOLD
  22. LJFOLD
  23. LJFOLD
  24. LJFOLD
  25. LJFOLD
  26. LJFOLD
  27. LJFOLD
  28. LJFOLD
  29. LJFOLD
  30. LJFOLD
  31. LJFOLD
  32. LJFOLD
  33. LJFOLD
  34. LJFOLD
  35. LJFOLD
  36. LJFOLD
  37. LJFOLD
  38. LJFOLD
  39. LJFOLD
  40. LJFOLD
  41. LJFOLD
  42. LJFOLD
  43. LJFOLD
  44. LJFOLD
  45. LJFOLD
  46. LJFOLD
  47. LJFOLD
  48. LJFOLD
  49. LJFOLD
  50. LJFOLD
  51. LJFOLD
  52. LJFOLD
  53. LJFOLD
  54. LJFOLD
  55. LJFOLD
  56. LJFOLD
  57. LJFOLD
  58. LJFOLD
  59. LJFOLD
  60. LJFOLD
  61. LJFOLD
  62. LJFOLD
  63. LJFOLD
  64. LJFOLD
  65. LJFOLD
  66. LJFOLD
  67. LJFOLD
  68. LJFOLD
  69. LJFOLD
  70. LJFOLD
  71. LJFOLD
  72. LJFOLD
  73. LJFOLD
  74. LJFOLD
  75. LJFOLD
  76. LJFOLD
  77. LJFOLD
  78. LJFOLD
  79. LJFOLD
  80. LJFOLD
  81. LJFOLD
  82. LJFOLD
  83. simplify_intmul_k
  84. LJFOLD
  85. LJFOLD
  86. LJFOLD
  87. LJFOLD
  88. LJFOLD
  89. LJFOLD
  90. LJFOLD
  91. LJFOLD
  92. LJFOLD
  93. LJFOLD
  94. LJFOLD
  95. LJFOLD
  96. LJFOLD
  97. LJFOLD
  98. LJFOLD
  99. LJFOLD
  100. LJFOLD
  101. LJFOLD
  102. LJFOLD
  103. LJFOLD
  104. LJFOLD
  105. LJFOLD
  106. LJFOLD
  107. LJFOLD
  108. LJFOLD
  109. LJFOLD
  110. LJFOLD
  111. LJFOLD
  112. LJFOLD
  113. LJFOLD
  114. LJFOLD
  115. LJFOLD
  116. LJFOLD
  117. LJFOLD
  118. LJFOLD
  119. LJFOLD
  120. LJFOLD
  121. kfold_xload
  122. LJFOLD
  123. LJFOLD
  124. LJFOLD
  125. LJFOLD
  126. LJFOLD
  127. LJFOLD
  128. LJFOLD
  129. LJFOLD
  130. LJFOLD
  131. LJFOLD
  132. LJFOLD
  133. LJFOLD
  134. LJFOLD
  135. LJFOLD
  136. LJFOLD
  137. LJFOLD
  138. LJFOLD
  139. LJFOLD
  140. LJFOLD
  141. LJFOLD
  142. LJFOLD
  143. LJFOLD
  144. LJFOLD
  145. lj_opt_cse
  146. lj_opt_cselim

   1 /*
   2 ** FOLD: Constant Folding, Algebraic Simplifications and Reassociation.
   3 ** ABCelim: Array Bounds Check Elimination.
   4 ** CSE: Common-Subexpression Elimination.
   5 ** Copyright (C) 2005-2017 Mike Pall. See Copyright Notice in luajit.h
   6 */
   7 
   8 #define lj_opt_fold_c
   9 #define LUA_CORE
  10 
  11 #include <math.h>
  12 
  13 #include "lj_obj.h"
  14 
  15 #if LJ_HASJIT
  16 
  17 #include "lj_buf.h"
  18 #include "lj_str.h"
  19 #include "lj_tab.h"
  20 #include "lj_ir.h"
  21 #include "lj_jit.h"
  22 #include "lj_ircall.h"
  23 #include "lj_iropt.h"
  24 #include "lj_trace.h"
  25 #if LJ_HASFFI
  26 #include "lj_ctype.h"
  27 #include "lj_carith.h"
  28 #endif
  29 #include "lj_vm.h"
  30 #include "lj_strscan.h"
  31 #include "lj_strfmt.h"
  32 
  33 /* Here's a short description how the FOLD engine processes instructions:
  34 **
  35 ** The FOLD engine receives a single instruction stored in fins (J->fold.ins).
  36 ** The instruction and its operands are used to select matching fold rules.
  37 ** These are applied iteratively until a fixed point is reached.
  38 **
  39 ** The 8 bit opcode of the instruction itself plus the opcodes of the
  40 ** two instructions referenced by its operands form a 24 bit key
  41 ** 'ins left right' (unused operands -> 0, literals -> lowest 8 bits).
  42 **
  43 ** This key is used for partial matching against the fold rules. The
  44 ** left/right operand fields of the key are successively masked with
  45 ** the 'any' wildcard, from most specific to least specific:
  46 **
  47 **   ins left right
  48 **   ins any  right
  49 **   ins left any
  50 **   ins any  any
  51 **
  52 ** The masked key is used to lookup a matching fold rule in a semi-perfect
  53 ** hash table. If a matching rule is found, the related fold function is run.
  54 ** Multiple rules can share the same fold function. A fold rule may return
  55 ** one of several special values:
  56 **
  57 ** - NEXTFOLD means no folding was applied, because an additional test
  58 **   inside the fold function failed. Matching continues against less
  59 **   specific fold rules. Finally the instruction is passed on to CSE.
  60 **
  61 ** - RETRYFOLD means the instruction was modified in-place. Folding is
  62 **   retried as if this instruction had just been received.
  63 **
  64 ** All other return values are terminal actions -- no further folding is
  65 ** applied:
  66 **
  67 ** - INTFOLD(i) returns a reference to the integer constant i.
  68 **
  69 ** - LEFTFOLD and RIGHTFOLD return the left/right operand reference
  70 **   without emitting an instruction.
  71 **
  72 ** - CSEFOLD and EMITFOLD pass the instruction directly to CSE or emit
  73 **   it without passing through any further optimizations.
  74 **
  75 ** - FAILFOLD, DROPFOLD and CONDFOLD only apply to instructions which have
  76 **   no result (e.g. guarded assertions): FAILFOLD means the guard would
  77 **   always fail, i.e. the current trace is pointless. DROPFOLD means
  78 **   the guard is always true and has been eliminated. CONDFOLD is a
  79 **   shortcut for FAILFOLD + cond (i.e. drop if true, otherwise fail).
  80 **
  81 ** - Any other return value is interpreted as an IRRef or TRef. This
  82 **   can be a reference to an existing or a newly created instruction.
  83 **   Only the least-significant 16 bits (IRRef1) are used to form a TRef
  84 **   which is finally returned to the caller.
  85 **
  86 ** The FOLD engine receives instructions both from the trace recorder and
  87 ** substituted instructions from LOOP unrolling. This means all types
  88 ** of instructions may end up here, even though the recorder bypasses
  89 ** FOLD in some cases. Thus all loads, stores and allocations must have
  90 ** an any/any rule to avoid being passed on to CSE.
  91 **
  92 ** Carefully read the following requirements before adding or modifying
  93 ** any fold rules:
  94 **
  95 ** Requirement #1: All fold rules must preserve their destination type.
  96 **
  97 ** Consistently use INTFOLD() (KINT result) or lj_ir_knum() (KNUM result).
  98 ** Never use lj_ir_knumint() which can have either a KINT or KNUM result.
  99 **
 100 ** Requirement #2: Fold rules should not create *new* instructions which
 101 ** reference operands *across* PHIs.
 102 **
 103 ** E.g. a RETRYFOLD with 'fins->op1 = fleft->op1' is invalid if the
 104 ** left operand is a PHI. Then fleft->op1 would point across the PHI
 105 ** frontier to an invariant instruction. Adding a PHI for this instruction
 106 ** would be counterproductive. The solution is to add a barrier which
 107 ** prevents folding across PHIs, i.e. 'PHIBARRIER(fleft)' in this case.
 108 ** The only exception is for recurrences with high latencies like
 109 ** repeated int->num->int conversions.
 110 **
 111 ** One could relax this condition a bit if the referenced instruction is
 112 ** a PHI, too. But this often leads to worse code due to excessive
 113 ** register shuffling.
 114 **
 115 ** Note: returning *existing* instructions (e.g. LEFTFOLD) is ok, though.
  116 ** Even returning fleft->op1 would be ok, because a new PHI will be added,
 117 ** if needed. But again, this leads to excessive register shuffling and
 118 ** should be avoided.
 119 **
 120 ** Requirement #3: The set of all fold rules must be monotonic to guarantee
 121 ** termination.
 122 **
 123 ** The goal is optimization, so one primarily wants to add strength-reducing
 124 ** rules. This means eliminating an instruction or replacing an instruction
 125 ** with one or more simpler instructions. Don't add fold rules which point
 126 ** into the other direction.
 127 **
 128 ** Some rules (like commutativity) do not directly reduce the strength of
 129 ** an instruction, but enable other fold rules (e.g. by moving constants
 130 ** to the right operand). These rules must be made unidirectional to avoid
 131 ** cycles.
 132 **
 133 ** Rule of thumb: the trace recorder expands the IR and FOLD shrinks it.
 134 */
 135 
/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)         (&J->cur.ir[(ref)])  /* NB: invalidated if the IR array is reallocated. */
#define fins            (&J->fold.ins)       /* Instruction currently being folded. */
#define fleft           (J->fold.left)       /* Instruction referenced by fins->op1. */
#define fright          (J->fold.right)      /* Instruction referenced by fins->op2. */
#define knumleft        (ir_knum(fleft)->n)  /* Left operand as a number constant. */
#define knumright       (ir_knum(fright)->n) /* Right operand as a number constant. */

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)        (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Fold function type. Fastcall on x86 significantly reduces their size. */
typedef IRRef (LJ_FASTCALL *FoldFunc)(jit_State *J);

/* Macros for the fold specs, so buildvm can recognize them. */
#define LJFOLD(x)
#define LJFOLDX(x)
#define LJFOLDF(name)   static TRef LJ_FASTCALL fold_##name(jit_State *J)
/* Note: They must be at the start of a line or buildvm ignores them! */

/* Barrier to prevent using operands across PHIs (see Requirement #2 above). */
#define PHIBARRIER(ir)  if (irt_isphi((ir)->t)) return NEXTFOLD

/* Barrier to prevent folding across a GC step.
** GC steps can only happen at the head of a trace and at LOOP.
** And the GC is only driven forward if there's at least one allocation.
*/
#define gcstep_barrier(J, ref) \
  ((ref) < J->chain[IR_LOOP] && \
   (J->chain[IR_SNEW] || J->chain[IR_XSNEW] || \
    J->chain[IR_TNEW] || J->chain[IR_TDUP] || \
    J->chain[IR_CNEW] || J->chain[IR_CNEWI] || \
    J->chain[IR_BUFSTR] || J->chain[IR_TOSTR] || J->chain[IR_CALLA]))
 169 
 170 /* -- Constant folding for FP numbers ------------------------------------- */
 171 
LJFOLD(ADD KNUM KNUM)
LJFOLD(SUB KNUM KNUM)
LJFOLD(MUL KNUM KNUM)
LJFOLD(DIV KNUM KNUM)
LJFOLD(ATAN2 KNUM KNUM)
LJFOLD(LDEXP KNUM KNUM)
LJFOLD(MIN KNUM KNUM)
LJFOLD(MAX KNUM KNUM)
LJFOLDF(kfold_numarith)
{
  /* Fold binary arithmetic on two FP number constants. The opcode is
  ** passed to the VM helper as an index relative to IR_ADD.
  */
  lua_Number a = knumleft;
  lua_Number b = knumright;
  lua_Number y = lj_vm_foldarith(a, b, fins->o - IR_ADD);
  return lj_ir_knum(J, y);  /* Requirement #1: result stays a KNUM. */
}
 187 
LJFOLD(NEG KNUM FLOAD)
LJFOLD(ABS KNUM FLOAD)
LJFOLDF(kfold_numabsneg)
{
  /* Fold unary NEG/ABS of a number constant. Only the left operand is
  ** used; it is passed twice since lj_vm_foldarith takes two operands.
  ** NOTE(review): the right operand is matched as FLOAD -- presumably
  ** the sign-bit constant load used by the backend; confirm with recorder.
  */
  lua_Number a = knumleft;
  lua_Number y = lj_vm_foldarith(a, a, fins->o - IR_ADD);
  return lj_ir_knum(J, y);
}
 196 
LJFOLD(LDEXP KNUM KINT)
LJFOLDF(kfold_ldexp)
{
#if LJ_TARGET_X86ORX64
  /* On x86/x64 leave LDEXP with an int exponent to later stages. */
  UNUSED(J);
  return NEXTFOLD;
#else
  /* Elsewhere fold it directly with the C library ldexp(). */
  return lj_ir_knum(J, ldexp(knumleft, fright->i));
#endif
}
 207 
LJFOLD(FPMATH KNUM any)
LJFOLDF(kfold_fpmath)
{
  /* Fold an FP math function on a number constant. op2 is the literal
  ** selecting the function (floor/ceil/sqrt/...), handled by the VM helper.
  */
  lua_Number a = knumleft;
  lua_Number y = lj_vm_foldfpm(a, fins->op2);
  return lj_ir_knum(J, y);
}
 215 
LJFOLD(POW KNUM KINT)
LJFOLDF(kfold_numpow)
{
  /* Fold x^k for constant number base and constant integer exponent. */
  lua_Number a = knumleft;
  lua_Number b = (lua_Number)fright->i;  /* Widen exponent to a number. */
  lua_Number y = lj_vm_foldarith(a, b, IR_POW - IR_ADD);
  return lj_ir_knum(J, y);
}
 224 
/* Must not use kfold_kref for numbers (could be NaN). */
LJFOLD(EQ KNUM KNUM)
LJFOLD(NE KNUM KNUM)
LJFOLD(LT KNUM KNUM)
LJFOLD(GE KNUM KNUM)
LJFOLD(LE KNUM KNUM)
LJFOLD(GT KNUM KNUM)
LJFOLD(ULT KNUM KNUM)
LJFOLD(UGE KNUM KNUM)
LJFOLD(ULE KNUM KNUM)
LJFOLD(UGT KNUM KNUM)
LJFOLDF(kfold_numcomp)
{
  /* Fold a comparison of two number constants into DROPFOLD/FAILFOLD.
  ** lj_ir_numcmp implements the FP (NaN-aware) comparison semantics.
  */
  return CONDFOLD(lj_ir_numcmp(knumleft, knumright, (IROp)fins->o));
}
 240 
 241 /* -- Constant folding for 32 bit integers -------------------------------- */
 242 
 243 static int32_t kfold_intop(int32_t k1, int32_t k2, IROp op)
 244 {
 245   switch (op) {
 246   case IR_ADD: k1 += k2; break;
 247   case IR_SUB: k1 -= k2; break;
 248   case IR_MUL: k1 *= k2; break;
 249   case IR_MOD: k1 = lj_vm_modi(k1, k2); break;
 250   case IR_NEG: k1 = -k1; break;
 251   case IR_BAND: k1 &= k2; break;
 252   case IR_BOR: k1 |= k2; break;
 253   case IR_BXOR: k1 ^= k2; break;
 254   case IR_BSHL: k1 <<= (k2 & 31); break;
 255   case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 31)); break;
 256   case IR_BSAR: k1 >>= (k2 & 31); break;
 257   case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 31)); break;
 258   case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 31)); break;
 259   case IR_MIN: k1 = k1 < k2 ? k1 : k2; break;
 260   case IR_MAX: k1 = k1 > k2 ? k1 : k2; break;
 261   default: lua_assert(0); break;
 262   }
 263   return k1;
 264 }
 265 
LJFOLD(ADD KINT KINT)
LJFOLD(SUB KINT KINT)
LJFOLD(MUL KINT KINT)
LJFOLD(MOD KINT KINT)
LJFOLD(NEG KINT KINT)
LJFOLD(BAND KINT KINT)
LJFOLD(BOR KINT KINT)
LJFOLD(BXOR KINT KINT)
LJFOLD(BSHL KINT KINT)
LJFOLD(BSHR KINT KINT)
LJFOLD(BSAR KINT KINT)
LJFOLD(BROL KINT KINT)
LJFOLD(BROR KINT KINT)
LJFOLD(MIN KINT KINT)
LJFOLD(MAX KINT KINT)
LJFOLDF(kfold_intarith)
{
  /* Fold any 32 bit integer op on two KINT constants via kfold_intop. */
  return INTFOLD(kfold_intop(fleft->i, fright->i, (IROp)fins->o));
}
 285 
LJFOLD(ADDOV KINT KINT)
LJFOLD(SUBOV KINT KINT)
LJFOLD(MULOV KINT KINT)
LJFOLDF(kfold_intovarith)
{
  /* Fold checked (overflow-guarded) integer arithmetic on constants.
  ** Compute in FP, then verify the result round-trips to int32 exactly;
  ** if not, the overflow guard would always trigger -> FAILFOLD.
  */
  lua_Number n = lj_vm_foldarith((lua_Number)fleft->i, (lua_Number)fright->i,
                                 fins->o - IR_ADDOV);
  int32_t k = lj_num2int(n);
  if (n != (lua_Number)k)
    return FAILFOLD;
  return INTFOLD(k);
}
 298 
LJFOLD(BNOT KINT)
LJFOLDF(kfold_bnot)
{
  /* Fold bitwise NOT of a 32 bit integer constant. */
  return INTFOLD(~fleft->i);
}
 304 
LJFOLD(BSWAP KINT)
LJFOLDF(kfold_bswap)
{
  /* Fold byte-swap of a 32 bit integer constant. */
  return INTFOLD((int32_t)lj_bswap((uint32_t)fleft->i));
}
 310 
LJFOLD(LT KINT KINT)
LJFOLD(GE KINT KINT)
LJFOLD(LE KINT KINT)
LJFOLD(GT KINT KINT)
LJFOLD(ULT KINT KINT)
LJFOLD(UGE KINT KINT)
LJFOLD(ULE KINT KINT)
LJFOLD(UGT KINT KINT)
LJFOLD(ABC KINT KINT)
LJFOLDF(kfold_intcomp)
{
  /* Fold signed/unsigned comparisons of two integer constants.
  ** EQ/NE are handled by kfold_kref (constants are interned, so
  ** identical refs <==> identical values). ABC (array bounds check)
  ** shares the unsigned-greater-than semantics with UGT.
  */
  int32_t a = fleft->i, b = fright->i;
  switch ((IROp)fins->o) {
  case IR_LT: return CONDFOLD(a < b);
  case IR_GE: return CONDFOLD(a >= b);
  case IR_LE: return CONDFOLD(a <= b);
  case IR_GT: return CONDFOLD(a > b);
  case IR_ULT: return CONDFOLD((uint32_t)a < (uint32_t)b);
  case IR_UGE: return CONDFOLD((uint32_t)a >= (uint32_t)b);
  case IR_ULE: return CONDFOLD((uint32_t)a <= (uint32_t)b);
  case IR_ABC:  /* fallthrough */
  case IR_UGT: return CONDFOLD((uint32_t)a > (uint32_t)b);
  default: lua_assert(0); return FAILFOLD;
  }
}
 336 
LJFOLD(UGE any KINT)
LJFOLDF(kfold_intcomp0)
{
  /* x (unsigned) >= 0 is always true -> drop the guard. */
  if (fright->i == 0)
    return DROPFOLD;
  return NEXTFOLD;
}
 344 
 345 /* -- Constant folding for 64 bit integers -------------------------------- */
 346 
 347 static uint64_t kfold_int64arith(uint64_t k1, uint64_t k2, IROp op)
 348 {
 349   switch (op) {
 350 #if LJ_HASFFI
 351   case IR_ADD: k1 += k2; break;
 352   case IR_SUB: k1 -= k2; break;
 353   case IR_MUL: k1 *= k2; break;
 354   case IR_BAND: k1 &= k2; break;
 355   case IR_BOR: k1 |= k2; break;
 356   case IR_BXOR: k1 ^= k2; break;
 357   case IR_BSHL: k1 <<= (k2 & 63); break;
 358   case IR_BSHR: k1 = (int32_t)((uint32_t)k1 >> (k2 & 63)); break;
 359   case IR_BSAR: k1 >>= (k2 & 63); break;
 360   case IR_BROL: k1 = (int32_t)lj_rol((uint32_t)k1, (k2 & 63)); break;
 361   case IR_BROR: k1 = (int32_t)lj_ror((uint32_t)k1, (k2 & 63)); break;
 362 #endif
 363   default: UNUSED(k2); lua_assert(0); break;
 364   }
 365   return k1;
 366 }
 367 
LJFOLD(ADD KINT64 KINT64)
LJFOLD(SUB KINT64 KINT64)
LJFOLD(MUL KINT64 KINT64)
LJFOLD(BAND KINT64 KINT64)
LJFOLD(BOR KINT64 KINT64)
LJFOLD(BXOR KINT64 KINT64)
LJFOLDF(kfold_int64arith)
{
  /* Fold non-trapping 64 bit arithmetic/bitwise ops on two KINT64
  ** constants via the helper above.
  */
  return INT64FOLD(kfold_int64arith(ir_k64(fleft)->u64,
                                    ir_k64(fright)->u64, (IROp)fins->o));
}
 379 
LJFOLD(DIV KINT64 KINT64)
LJFOLD(MOD KINT64 KINT64)
LJFOLD(POW KINT64 KINT64)
LJFOLDF(kfold_int64arith2)
{
#if LJ_HASFFI
  /* Fold 64 bit div/mod/pow, dispatching on the result's signedness.
  ** The lj_carith_* helpers implement the FFI semantics (incl. div/mod
  ** by zero handling), so no extra guards are needed here.
  */
  uint64_t k1 = ir_k64(fleft)->u64, k2 = ir_k64(fright)->u64;
  if (irt_isi64(fins->t)) {  /* Signed i64 variant. */
    k1 = fins->o == IR_DIV ? lj_carith_divi64((int64_t)k1, (int64_t)k2) :
         fins->o == IR_MOD ? lj_carith_modi64((int64_t)k1, (int64_t)k2) :
                             lj_carith_powi64((int64_t)k1, (int64_t)k2);
  } else {  /* Unsigned u64 variant. */
    k1 = fins->o == IR_DIV ? lj_carith_divu64(k1, k2) :
         fins->o == IR_MOD ? lj_carith_modu64(k1, k2) :
                             lj_carith_powu64(k1, k2);
  }
  return INT64FOLD(k1);
#else
  /* KINT64 arithmetic cannot occur without FFI support. */
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}
 401 
LJFOLD(BSHL KINT64 KINT)
LJFOLD(BSHR KINT64 KINT)
LJFOLD(BSAR KINT64 KINT)
LJFOLD(BROL KINT64 KINT)
LJFOLD(BROR KINT64 KINT)
LJFOLDF(kfold_int64shift)
{
#if LJ_HASFFI
  /* Fold a 64 bit shift/rotate of a KINT64 by a KINT count.
  ** The count is masked to 0..63; the op index relative to IR_BSHL
  ** selects the variant inside lj_carith_shift64.
  */
  uint64_t k = ir_k64(fleft)->u64;
  int32_t sh = (fright->i & 63);
  return INT64FOLD(lj_carith_shift64(k, sh, fins->o - IR_BSHL));
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}
 417 
LJFOLD(BNOT KINT64)
LJFOLDF(kfold_bnot64)
{
#if LJ_HASFFI
  /* Fold bitwise NOT of a 64 bit integer constant. */
  return INT64FOLD(~ir_k64(fleft)->u64);
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}
 427 
LJFOLD(BSWAP KINT64)
LJFOLDF(kfold_bswap64)
{
#if LJ_HASFFI
  /* Fold byte-swap of a 64 bit integer constant. */
  return INT64FOLD(lj_bswap64(ir_k64(fleft)->u64));
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}
 437 
LJFOLD(LT KINT64 KINT64)
LJFOLD(GE KINT64 KINT64)
LJFOLD(LE KINT64 KINT64)
LJFOLD(GT KINT64 KINT64)
LJFOLD(ULT KINT64 KINT64)
LJFOLD(UGE KINT64 KINT64)
LJFOLD(ULE KINT64 KINT64)
LJFOLD(UGT KINT64 KINT64)
LJFOLDF(kfold_int64comp)
{
#if LJ_HASFFI
  /* Fold signed/unsigned comparisons of two 64 bit constants.
  ** EQ/NE are handled by kfold_kref via ref identity.
  */
  uint64_t a = ir_k64(fleft)->u64, b = ir_k64(fright)->u64;
  switch ((IROp)fins->o) {
  case IR_LT: return CONDFOLD((int64_t)a < (int64_t)b);
  case IR_GE: return CONDFOLD((int64_t)a >= (int64_t)b);
  case IR_LE: return CONDFOLD((int64_t)a <= (int64_t)b);
  case IR_GT: return CONDFOLD((int64_t)a > (int64_t)b);
  case IR_ULT: return CONDFOLD(a < b);
  case IR_UGE: return CONDFOLD(a >= b);
  case IR_ULE: return CONDFOLD(a <= b);
  case IR_UGT: return CONDFOLD(a > b);
  default: lua_assert(0); return FAILFOLD;
  }
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}
 465 
LJFOLD(UGE any KINT64)
LJFOLDF(kfold_int64comp0)
{
#if LJ_HASFFI
  /* x (unsigned 64 bit) >= 0 is always true -> drop the guard. */
  if (ir_k64(fright)->u64 == 0)
    return DROPFOLD;
  return NEXTFOLD;
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}
 477 
 478 /* -- Constant folding for strings ---------------------------------------- */
 479 
LJFOLD(SNEW KKPTR KINT)
LJFOLDF(kfold_snew_kptr)
{
  /* Fold string creation from a constant pointer + constant length
  ** into an interned string constant.
  */
  GCstr *s = lj_str_new(J->L, (const char *)ir_kptr(fleft), (size_t)fright->i);
  return lj_ir_kstr(J, s);
}
 486 
LJFOLD(SNEW any KINT)
LJFOLDF(kfold_snew_empty)
{
  /* A zero-length SNEW always yields the interned empty string,
  ** regardless of the (unread) data pointer.
  */
  if (fright->i == 0)
    return lj_ir_kstr(J, &J2G(J)->strempty);
  return NEXTFOLD;
}
 494 
LJFOLD(STRREF KGC KINT)
LJFOLDF(kfold_strref)
{
  /* Fold a reference into a constant string at a constant offset into
  ** a constant pointer to the string's character data.
  */
  GCstr *str = ir_kstr(fleft);
  lua_assert((MSize)fright->i <= str->len);
  return lj_ir_kkptr(J, (char *)strdata(str) + fright->i);
}
 502 
LJFOLD(STRREF SNEW any)
LJFOLDF(kfold_strref_snew)
{
  /* Simplify STRREF into a freshly created (SNEW) string. */
  PHIBARRIER(fleft);  /* Don't reach through a PHI (Requirement #2). */
  if (irref_isk(fins->op2) && fright->i == 0) {
    return fleft->op1;  /* strref(snew(ptr, len), 0) ==> ptr */
  } else {
    /* Reassociate: strref(snew(strref(str, a), len), b) ==> strref(str, a+b) */
    IRIns *ir = IR(fleft->op1);
    if (ir->o == IR_STRREF) {
      IRRef1 str = ir->op1;  /* IRIns * is not valid across emitir. */
      PHIBARRIER(ir);
      fins->op2 = emitir(IRTI(IR_ADD), ir->op2, fins->op2); /* Clobbers fins! */
      fins->op1 = str;
      fins->ot = IRT(IR_STRREF, IRT_PGC);
      return RETRYFOLD;  /* Re-run fold on the rewritten instruction. */
    }
  }
  return NEXTFOLD;
}
 523 
LJFOLD(CALLN CARG IRCALL_lj_str_cmp)
LJFOLDF(kfold_strcmp)
{
  /* Fold a string comparison call when both CARG operands are string
  ** constants, by comparing them at fold time.
  */
  if (irref_isk(fleft->op1) && irref_isk(fleft->op2)) {
    GCstr *a = ir_kstr(IR(fleft->op1));
    GCstr *b = ir_kstr(IR(fleft->op2));
    return INTFOLD(lj_str_cmp(a, b));
  }
  return NEXTFOLD;
}
 534 
 535 /* -- Constant folding and forwarding for buffers ------------------------- */
 536 
 537 /*
 538 ** Buffer ops perform stores, but their effect is limited to the buffer
 539 ** itself. Also, buffer ops are chained: a use of an op implies a use of
 540 ** all other ops up the chain. Conversely, if an op is unused, all ops
  541 ** up the chain can go unused. This largely eliminates the need to treat
 542 ** them as stores.
 543 **
 544 ** Alas, treating them as normal (IRM_N) ops doesn't work, because they
 545 ** cannot be CSEd in isolation. CSE for IRM_N is implicitly done in LOOP
 546 ** or if FOLD is disabled.
 547 **
 548 ** The compromise is to declare them as loads, emit them like stores and
 549 ** CSE whole chains manually when the BUFSTR is to be emitted. Any chain
 550 ** fragments left over from CSE are eliminated by DCE.
 551 */
 552 
 553 /* BUFHDR is emitted like a store, see below. */
 554 
LJFOLD(BUFPUT BUFHDR BUFSTR)
LJFOLDF(bufput_append)
{
  /* Turn a put of a just-built string into an append to the original
  ** buffer chain, avoiding the intermediate BUFSTR materialization.
  ** New buffer, no other buffer op inbetween and same buffer?
  */
  if ((J->flags & JIT_F_OPT_FWD) &&
      !(fleft->op2 & IRBUFHDR_APPEND) &&
      fleft->prev == fright->op2 &&
      fleft->op1 == IR(fright->op2)->op1) {
    IRRef ref = fins->op1;
    IR(ref)->op2 = (fleft->op2 | IRBUFHDR_APPEND);  /* Modify BUFHDR. */
    IR(ref)->op1 = fright->op1;
    return ref;
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}
 570 
LJFOLD(BUFPUT any any)
LJFOLDF(bufput_kgc)
{
  /* Simplify puts of constant strings: drop empty-string puts and
  ** merge two consecutive constant puts into a single one.
  */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fright->o == IR_KGC) {
    GCstr *s2 = ir_kstr(fright);
    if (s2->len == 0) {  /* Empty string? */
      return LEFTFOLD;
    } else {
      if (fleft->o == IR_BUFPUT && irref_isk(fleft->op2) &&
          !irt_isphi(fleft->t)) {  /* Join two constant string puts in a row. */
        GCstr *s1 = ir_kstr(IR(fleft->op2));
        IRRef kref = lj_ir_kstr(J, lj_buf_cat2str(J->L, s1, s2));
        /* lj_ir_kstr() may realloc the IR and invalidates any IRIns *. */
        IR(fins->op1)->op2 = kref;  /* Modify previous BUFPUT. */
        return fins->op1;
      }
    }
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}
 591 
LJFOLD(BUFSTR any any)
LJFOLDF(bufstr_kfold_cse)
{
  /* Fold/CSE the materialization of a buffer into a string.
  ** See the comment block above: buffer ops are emitted like stores,
  ** so whole chains must be CSEd manually here.
  */
  lua_assert(fleft->o == IR_BUFHDR || fleft->o == IR_BUFPUT ||
             fleft->o == IR_CALLL);
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
    if (fleft->o == IR_BUFHDR) {  /* No put operations? */
      if (!(fleft->op2 & IRBUFHDR_APPEND))  /* Empty buffer? */
        return lj_ir_kstr(J, &J2G(J)->strempty);
      fins->op1 = fleft->op1;
      fins->op2 = fleft->prev;  /* Relies on checks in bufput_append. */
      return CSEFOLD;
    } else if (fleft->o == IR_BUFPUT) {
      IRIns *irb = IR(fleft->op1);
      if (irb->o == IR_BUFHDR && !(irb->op2 & IRBUFHDR_APPEND))
        return fleft->op2;  /* Shortcut for a single put operation. */
    }
  }
  /* Try to CSE the whole chain: walk prior BUFSTRs and compare their
  ** op chains link by link until a fresh BUFHDR is reached.
  */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
    IRRef ref = J->chain[IR_BUFSTR];
    while (ref) {
      IRIns *irs = IR(ref), *ira = fleft, *irb = IR(irs->op1);
      while (ira->o == irb->o && ira->op2 == irb->op2) {
        lua_assert(ira->o == IR_BUFHDR || ira->o == IR_BUFPUT ||
                   ira->o == IR_CALLL || ira->o == IR_CARG);
        if (ira->o == IR_BUFHDR && !(ira->op2 & IRBUFHDR_APPEND))
          return ref;  /* CSE succeeded. */
        if (ira->o == IR_CALLL && ira->op2 == IRCALL_lj_buf_puttab)
          break;  /* puttab has side effects beyond op2; stop matching. */
        ira = IR(ira->op1);
        irb = IR(irb->op1);
      }
      ref = irs->prev;  /* Try the next older BUFSTR. */
    }
  }
  return EMITFOLD;  /* No CSE possible. */
}
 630 
LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_reverse)
LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_upper)
LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_lower)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putquoted)
LJFOLDF(bufput_kfold_op)
{
  /* Fold a string-transforming buffer call on a constant argument by
  ** running the call at fold time into a temp buffer and rewriting the
  ** instruction into a plain BUFPUT of the resulting constant string.
  */
  if (irref_isk(fleft->op2)) {
    const CCallInfo *ci = &lj_ir_callinfo[fins->op2];
    SBuf *sb = lj_buf_tmp_(J->L);
    sb = ((SBuf * (LJ_FASTCALL *)(SBuf *, GCstr *))ci->func)(sb,
                                                       ir_kstr(IR(fleft->op2)));
    fins->o = IR_BUFPUT;
    fins->op1 = fleft->op1;
    fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
    return RETRYFOLD;
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}
 649 
LJFOLD(CALLL CARG IRCALL_lj_buf_putstr_rep)
LJFOLDF(bufput_kfold_rep)
{
  /* Fold string.rep with constant string and constant count into a
  ** plain BUFPUT of the precomputed repetition.
  */
  if (irref_isk(fleft->op2)) {  /* Constant repeat count. */
    IRIns *irc = IR(fleft->op1);
    if (irref_isk(irc->op2)) {  /* Constant string argument. */
      SBuf *sb = lj_buf_tmp_(J->L);
      sb = lj_buf_putstr_rep(sb, ir_kstr(IR(irc->op2)), IR(fleft->op2)->i);
      fins->o = IR_BUFPUT;
      fins->op1 = irc->op1;
      fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
      return RETRYFOLD;
    }
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}
 666 
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfxint)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum_int)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum_uint)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfnum)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfstr)
LJFOLD(CALLL CARG IRCALL_lj_strfmt_putfchar)
LJFOLDF(bufput_kfold_fmt)
{
  /* Fold a string.format fragment with constant format and constant
  ** argument by formatting it at fold time into a temp buffer and
  ** rewriting the call into a plain BUFPUT of the constant result.
  */
  IRIns *irc = IR(fleft->op1);
  lua_assert(irref_isk(irc->op2));  /* SFormat must be const. */
  if (irref_isk(fleft->op2)) {  /* Constant argument, too? */
    SFormat sf = (SFormat)IR(irc->op2)->i;
    IRIns *ira = IR(fleft->op2);
    SBuf *sb = lj_buf_tmp_(J->L);
    switch (fins->op2) {  /* Dispatch on the IRCALL id. */
    case IRCALL_lj_strfmt_putfxint:
      sb = lj_strfmt_putfxint(sb, sf, ir_k64(ira)->u64);
      break;
    case IRCALL_lj_strfmt_putfstr:
      sb = lj_strfmt_putfstr(sb, sf, ir_kstr(ira));
      break;
    case IRCALL_lj_strfmt_putfchar:
      sb = lj_strfmt_putfchar(sb, sf, ira->i);
      break;
    case IRCALL_lj_strfmt_putfnum_int:
    case IRCALL_lj_strfmt_putfnum_uint:
    case IRCALL_lj_strfmt_putfnum:
    default: {
      /* All putfnum variants share the (SBuf *, SFormat, lua_Number)
      ** signature, so call through the generic function pointer.
      */
      const CCallInfo *ci = &lj_ir_callinfo[fins->op2];
      sb = ((SBuf * (*)(SBuf *, SFormat, lua_Number))ci->func)(sb, sf,
                                                         ir_knum(ira)->n);
      break;
      }
    }
    fins->o = IR_BUFPUT;
    fins->op1 = irc->op1;
    fins->op2 = lj_ir_kstr(J, lj_buf_tostr(sb));
    return RETRYFOLD;
  }
  return EMITFOLD;  /* Always emit, CSE later. */
}
 708 
 709 /* -- Constant folding of pointer arithmetic ------------------------------ */
 710 
LJFOLD(ADD KGC KINT)
LJFOLD(ADD KGC KINT64)
LJFOLDF(kfold_add_kgc)
{
  /* Fold constant-GC-object + constant-offset into a pointer constant. */
  GCobj *o = ir_kgc(fleft);
#if LJ_64
  ptrdiff_t ofs = (ptrdiff_t)ir_kint64(fright)->u64;
#else
  ptrdiff_t ofs = fright->i;
#endif
#if LJ_HASFFI
  if (irt_iscdata(fleft->t)) {
    /* For cdata with immutable payload types the interior pointer is
    ** constant too, so a KKPTR (constant kernel pointer) can be used.
    */
    CType *ct = ctype_raw(ctype_ctsG(J2G(J)), gco2cd(o)->ctypeid);
    if (ctype_isnum(ct->info) || ctype_isenum(ct->info) ||
        ctype_isptr(ct->info) || ctype_isfunc(ct->info) ||
        ctype_iscomplex(ct->info) || ctype_isvector(ct->info))
      return lj_ir_kkptr(J, (char *)o + ofs);
  }
#endif
  return lj_ir_kptr(J, (char *)o + ofs);
}
 732 
LJFOLD(ADD KPTR KINT)
LJFOLD(ADD KPTR KINT64)
LJFOLD(ADD KKPTR KINT)
LJFOLD(ADD KKPTR KINT64)
LJFOLDF(kfold_add_kptr)
{
  /* Fold constant-pointer + constant-offset, preserving the pointer
  ** constant kind (KPTR vs. KKPTR) via fleft->o.
  */
  void *p = ir_kptr(fleft);
#if LJ_64
  ptrdiff_t ofs = (ptrdiff_t)ir_kint64(fright)->u64;
#else
  ptrdiff_t ofs = fright->i;
#endif
  return lj_ir_kptr_(J, fleft->o, (char *)p + ofs);
}
 747 
LJFOLD(ADD any KGC)
LJFOLD(ADD any KPTR)
LJFOLD(ADD any KKPTR)
LJFOLDF(kfold_add_kright)
{
  /* Canonicalize: if the left operand is the integer constant, swap the
  ** operands so the pointer-constant folds above can match on retry.
  */
  if (fleft->o == IR_KINT || fleft->o == IR_KINT64) {
    IRRef1 tmp = fins->op1; fins->op1 = fins->op2; fins->op2 = tmp;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}
 759 
 760 /* -- Constant folding of conversions ------------------------------------- */
 761 
LJFOLD(TOBIT KNUM KNUM)
LJFOLDF(kfold_tobit)
{
  /* Fold bit.tobit of a number constant to its int32 bit pattern. */
  return INTFOLD(lj_num2bit(knumleft));
}
 767 
LJFOLD(CONV KINT IRCONV_NUM_INT)
LJFOLDF(kfold_conv_kint_num)
{
  /* Fold signed int constant -> number constant (always exact). */
  return lj_ir_knum(J, (lua_Number)fleft->i);
}
 773 
LJFOLD(CONV KINT IRCONV_NUM_U32)
LJFOLDF(kfold_conv_kintu32_num)
{
  /* Fold int constant reinterpreted as unsigned -> number constant. */
  return lj_ir_knum(J, (lua_Number)(uint32_t)fleft->i);
}
 779 
LJFOLD(CONV KINT IRCONV_INT_I8)
LJFOLD(CONV KINT IRCONV_INT_U8)
LJFOLD(CONV KINT IRCONV_INT_I16)
LJFOLD(CONV KINT IRCONV_INT_U16)
LJFOLDF(kfold_conv_kint_ext)
{
  /* Fold narrowing + sign/zero extension of an int constant. The source
  ** sub-type is encoded in the low bits of op2 (IRCONV_SRCMASK).
  */
  int32_t k = fleft->i;
  if ((fins->op2 & IRCONV_SRCMASK) == IRT_I8) k = (int8_t)k;
  else if ((fins->op2 & IRCONV_SRCMASK) == IRT_U8) k = (uint8_t)k;
  else if ((fins->op2 & IRCONV_SRCMASK) == IRT_I16) k = (int16_t)k;
  else k = (uint16_t)k;
  return INTFOLD(k);
}
 793 
LJFOLD(CONV KINT IRCONV_I64_INT)
LJFOLD(CONV KINT IRCONV_U64_INT)
LJFOLD(CONV KINT IRCONV_I64_U32)
LJFOLD(CONV KINT IRCONV_U64_U32)
LJFOLDF(kfold_conv_kint_i64)
{
  /* Fold widening of an int constant to 64 bits. The IRCONV_SEXT flag
  ** selects sign extension; otherwise zero-extend via uint32_t.
  */
  if ((fins->op2 & IRCONV_SEXT))
    return INT64FOLD((uint64_t)(int64_t)fleft->i);
  else
    return INT64FOLD((uint64_t)(int64_t)(uint32_t)fleft->i);
}
 805 
LJFOLD(CONV KINT64 IRCONV_NUM_I64)
LJFOLDF(kfold_conv_kint64_num_i64)
{
  /* Fold signed 64 bit constant -> number constant (may round). */
  return lj_ir_knum(J, (lua_Number)(int64_t)ir_kint64(fleft)->u64);
}
 811 
LJFOLD(CONV KINT64 IRCONV_NUM_U64)
LJFOLDF(kfold_conv_kint64_num_u64)
{
  /* Fold unsigned 64 bit constant -> number constant (may round). */
  return lj_ir_knum(J, (lua_Number)ir_kint64(fleft)->u64);
}
 817 
LJFOLD(CONV KINT64 IRCONV_INT_I64)
LJFOLD(CONV KINT64 IRCONV_U32_I64)
LJFOLDF(kfold_conv_kint64_int_i64)
{
  /* Fold truncation of a 64 bit constant to its low 32 bits. */
  return INTFOLD((int32_t)ir_kint64(fleft)->u64);
}
 824 
LJFOLD(CONV KNUM IRCONV_INT_NUM)
LJFOLDF(kfold_conv_knum_int_num)
{
  /* Fold number constant -> int. For guarded conversions a non-exact
  ** value means the guard always fails -> FAILFOLD.
  */
  lua_Number n = knumleft;
  int32_t k = lj_num2int(n);
  if (irt_isguard(fins->t) && n != (lua_Number)k) {
    /* We're about to create a guard which always fails, like CONV +1.5.
    ** Some pathological loops cause this during LICM, e.g.:
    **   local x,k,t = 0,1.5,{1,[1.5]=2}
    **   for i=1,200 do x = x+ t[k]; k = k == 1 and 1.5 or 1 end
    **   assert(x == 300)
    */
    return FAILFOLD;
  }
  return INTFOLD(k);
}
 841 
LJFOLD(CONV KNUM IRCONV_U32_NUM)
LJFOLDF(kfold_conv_knum_u32_num)
{
  /* Fold number constant -> u32 (result carried as int32 bit pattern). */
#ifdef _MSC_VER
  {  /* Workaround for MSVC bug: force the conversion through memory. */
    volatile uint32_t u = (uint32_t)knumleft;
    return INTFOLD((int32_t)u);
  }
#else
  return INTFOLD((int32_t)(uint32_t)knumleft);
#endif
}
 854 
LJFOLD(CONV KNUM IRCONV_I64_NUM)
LJFOLDF(kfold_conv_knum_i64_num)
{
  /* Fold number constant -> signed 64 bit integer constant. */
  return INT64FOLD((uint64_t)(int64_t)knumleft);
}
 860 
LJFOLD(CONV KNUM IRCONV_U64_NUM)
LJFOLDF(kfold_conv_knum_u64_num)
{
  /* Fold number constant -> unsigned 64 bit integer constant.
  ** lj_num2u64 handles values beyond the int64 range.
  */
  return INT64FOLD(lj_num2u64(knumleft));
}
 866 
 867 LJFOLD(TOSTR KNUM any)
 868 LJFOLDF(kfold_tostr_knum)
 869 {
 870   return lj_ir_kstr(J, lj_strfmt_num(J->L, ir_knum(fleft)));
 871 }
 872 
 873 LJFOLD(TOSTR KINT any)
 874 LJFOLDF(kfold_tostr_kint)
 875 {
 876   return lj_ir_kstr(J, fins->op2 == IRTOSTR_INT ?
 877                        lj_strfmt_int(J->L, fleft->i) :
 878                        lj_strfmt_char(J->L, fleft->i));
 879 }
 880 
 881 LJFOLD(STRTO KGC)
 882 LJFOLDF(kfold_strto)
 883 {
 884   TValue n;
 885   if (lj_strscan_num(ir_kstr(fleft), &n))
 886     return lj_ir_knum(J, numV(&n));
 887   return FAILFOLD;
 888 }
 889 
/* -- Constant folding of equality checks --------------------------------- */

/* Don't constant-fold away FLOAD checks against KNULL. */
LJFOLD(EQ FLOAD KNULL)
LJFOLD(NE FLOAD KNULL)
LJFOLDX(lj_opt_cse)

/* But fold all other KNULL compares, since only KNULL is equal to KNULL. */
LJFOLD(EQ any KNULL)
LJFOLD(NE any KNULL)
LJFOLD(EQ KNULL any)
LJFOLD(NE KNULL any)
LJFOLD(EQ KINT KINT)  /* Constants are unique, so same refs <==> same value. */
LJFOLD(NE KINT KINT)
LJFOLD(EQ KINT64 KINT64)
LJFOLD(NE KINT64 KINT64)
LJFOLD(EQ KGC KGC)
LJFOLD(NE KGC KGC)
LJFOLDF(kfold_kref)
{
  /* Same ref <==> equal value; the XOR inverts the condition for NE. */
  return CONDFOLD((fins->op1 == fins->op2) ^ (fins->o == IR_NE));
}
 912 
/* -- Algebraic shortcuts ------------------------------------------------- */

/* Rounding an already-rounded value is a no-op (result is integral). */
LJFOLD(FPMATH FPMATH IRFPM_FLOOR)
LJFOLD(FPMATH FPMATH IRFPM_CEIL)
LJFOLD(FPMATH FPMATH IRFPM_TRUNC)
LJFOLDF(shortcut_round)
{
  IRFPMathOp op = (IRFPMathOp)fleft->op2;
  if (op == IRFPM_FLOOR || op == IRFPM_CEIL || op == IRFPM_TRUNC)
    return LEFTFOLD;  /* round(round_left(x)) = round_left(x) */
  return NEXTFOLD;
}

/* abs() is idempotent. */
LJFOLD(ABS ABS FLOAD)
LJFOLDF(shortcut_left)
{
  return LEFTFOLD;  /* f(g(x)) ==> g(x) */
}

/* The sign is irrelevant under abs(), so drop the inner negation. */
LJFOLD(ABS NEG FLOAD)
LJFOLDF(shortcut_dropleft)
{
  PHIBARRIER(fleft);
  fins->op1 = fleft->op1;  /* abs(neg(x)) ==> abs(x) */
  return RETRYFOLD;
}

/* Note: no safe shortcuts with STRTO and TOSTR ("1e2" ==> +100 ==> "100"). */
LJFOLD(NEG NEG any)
LJFOLD(BNOT BNOT)
LJFOLD(BSWAP BSWAP)
LJFOLDF(shortcut_leftleft)
{
  PHIBARRIER(fleft);  /* See above. Fold would be ok, but not beneficial. */
  return fleft->op1;  /* f(g(x)) ==> x */
}
 949 
/* -- FP algebraic simplifications ---------------------------------------- */

/* FP arithmetic is tricky -- there's not much to simplify.
** Please note the following common pitfalls before sending "improvements":
**   x+0 ==> x  is INVALID for x=-0
**   0-x ==> -x is INVALID for x=+0
**   x*0 ==> 0  is INVALID for x=-0, x=+-Inf or x=NaN
*/

/* Turn an addition of a negated operand into a subtraction. */
LJFOLD(ADD NEG any)
LJFOLDF(simplify_numadd_negx)
{
  PHIBARRIER(fleft);
  fins->o = IR_SUB;  /* (-a) + b ==> b - a */
  fins->op1 = fins->op2;
  fins->op2 = fleft->op1;
  return RETRYFOLD;
}

LJFOLD(ADD any NEG)
LJFOLDF(simplify_numadd_xneg)
{
  PHIBARRIER(fright);
  fins->o = IR_SUB;  /* a + (-b) ==> a - b */
  fins->op2 = fright->op1;
  return RETRYFOLD;
}

/* Subtracting zero is safe (unlike adding zero, see pitfalls above). */
LJFOLD(SUB any KNUM)
LJFOLDF(simplify_numsub_k)
{
  lua_Number n = knumright;
  if (n == 0.0)  /* x - (+-0) ==> x */
    return LEFTFOLD;
  return NEXTFOLD;
}

/* Move the negation onto the constant operand. */
LJFOLD(SUB NEG KNUM)
LJFOLDF(simplify_numsub_negk)
{
  PHIBARRIER(fleft);
  fins->op2 = fleft->op1;  /* (-x) - k ==> (-k) - x */
  fins->op1 = (IRRef1)lj_ir_knum(J, -knumright);
  return RETRYFOLD;
}

LJFOLD(SUB any NEG)
LJFOLDF(simplify_numsub_xneg)
{
  PHIBARRIER(fright);
  fins->o = IR_ADD;  /* a - (-b) ==> a + b */
  fins->op2 = fright->op1;
  return RETRYFOLD;
}

/* Strength-reduce FP multiply/divide by special constants. */
LJFOLD(MUL any KNUM)
LJFOLD(DIV any KNUM)
LJFOLDF(simplify_nummuldiv_k)
{
  lua_Number n = knumright;
  if (n == 1.0) {  /* x o 1 ==> x */
    return LEFTFOLD;
  } else if (n == -1.0) {  /* x o -1 ==> -x */
    IRRef op1 = fins->op1;
    fins->op2 = (IRRef1)lj_ir_ksimd(J, LJ_KSIMD_NEG);  /* Modifies fins. */
    fins->op1 = op1;
    fins->o = IR_NEG;
    return RETRYFOLD;
  } else if (fins->o == IR_MUL && n == 2.0) {  /* x * 2 ==> x + x */
    fins->o = IR_ADD;
    fins->op2 = fins->op1;
    return RETRYFOLD;
  } else if (fins->o == IR_DIV) {  /* x / 2^k ==> x * 2^-k */
    uint64_t u = ir_knum(fright)->u64;
    uint32_t ex = ((uint32_t)(u >> 52) & 0x7ff);
    /* Zero mantissa means the divisor is an exact power of two. The
    ** unsigned compare ex-1 < 0x7fd rejects ex==0 (zero/subnormal) and
    ** ex >= 0x7fe, so the reciprocal exponent 0x7fe-ex is a normal number
    ** and the multiply is exact.
    */
    if ((u & U64x(000fffff,ffffffff)) == 0 && ex - 1 < 0x7fd) {
      u = (u & ((uint64_t)1 << 63)) | ((uint64_t)(0x7fe - ex) << 52);
      fins->o = IR_MUL;  /* Multiply by exact reciprocal. */
      fins->op2 = lj_ir_knum_u64(J, u);
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

/* Move the negation onto the constant operand. */
LJFOLD(MUL NEG KNUM)
LJFOLD(DIV NEG KNUM)
LJFOLDF(simplify_nummuldiv_negk)
{
  PHIBARRIER(fleft);
  fins->op1 = fleft->op1;  /* (-a) o k ==> a o (-k) */
  fins->op2 = (IRRef1)lj_ir_knum(J, -knumright);
  return RETRYFOLD;
}

/* Two negations cancel under multiply/divide. */
LJFOLD(MUL NEG NEG)
LJFOLD(DIV NEG NEG)
LJFOLDF(simplify_nummuldiv_negneg)
{
  PHIBARRIER(fleft);
  PHIBARRIER(fright);
  fins->op1 = fleft->op1;  /* (-a) o (-b) ==> a o b */
  fins->op2 = fright->op1;
  return RETRYFOLD;
}

/* Unroll x^k into a chain of multiplies (binary exponentiation).
** Emits at most ~2*log2(k) IR_MULs for 1 <= |k| <= 65536.
*/
LJFOLD(POW any KINT)
LJFOLDF(simplify_numpow_xk)
{
  int32_t k = fright->i;
  TRef ref = fins->op1;
  if (k == 0)  /* x ^ 0 ==> 1 */
    return lj_ir_knum_one(J);  /* Result must be a number, not an int. */
  if (k == 1)  /* x ^ 1 ==> x */
    return LEFTFOLD;
  if ((uint32_t)(k+65536) > 2*65536u)  /* Limit code explosion. */
    return NEXTFOLD;
  if (k < 0) {  /* x ^ (-k) ==> (1/x) ^ k. */
    ref = emitir(IRTN(IR_DIV), lj_ir_knum_one(J), ref);
    k = -k;
  }
  /* Unroll x^k for 1 <= k <= 65536. */
  for (; (k & 1) == 0; k >>= 1)  /* Handle leading zeros. */
    ref = emitir(IRTN(IR_MUL), ref, ref);
  if ((k >>= 1) != 0) {  /* Handle trailing bits. */
    TRef tmp = emitir(IRTN(IR_MUL), ref, ref);
    for (; k != 1; k >>= 1) {
      if (k & 1)
        ref = emitir(IRTN(IR_MUL), ref, tmp);
      tmp = emitir(IRTN(IR_MUL), tmp, tmp);
    }
    ref = emitir(IRTN(IR_MUL), ref, tmp);
  }
  return ref;
}

/* Replace a power of two with ldexp. */
LJFOLD(POW KNUM any)
LJFOLDF(simplify_numpow_kx)
{
  lua_Number n = knumleft;
  if (n == 2.0) {  /* 2.0 ^ i ==> ldexp(1.0, tonum(i)) */
    fins->o = IR_CONV;
#if LJ_TARGET_X86ORX64
    /* x86/x64 LDEXP takes a number exponent: fold in an int->num CONV. */
    fins->op1 = fins->op2;
    fins->op2 = IRCONV_NUM_INT;
    fins->op2 = (IRRef1)lj_opt_fold(J);
#endif
    fins->op1 = (IRRef1)lj_ir_knum_one(J);
    fins->o = IR_LDEXP;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}
1103 
/* -- Simplify conversions ------------------------------------------------ */

/* num->int->num round-trips are only removable when the inner conversion
** was guarded, i.e. it proved the number was an exact integer.
*/
LJFOLD(CONV CONV IRCONV_NUM_INT)  /* _NUM */
LJFOLDF(shortcut_conv_num_int)
{
  PHIBARRIER(fleft);
  /* Only safe with a guarded conversion to int. */
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_NUM && irt_isguard(fleft->t))
    return fleft->op1;  /* f(g(x)) ==> x */
  return NEXTFOLD;
}

/* int->num->int (or u32->num->u32) cancels out. */
LJFOLD(CONV CONV IRCONV_INT_NUM)  /* _INT */
LJFOLD(CONV CONV IRCONV_U32_NUM)  /* _U32*/
LJFOLDF(simplify_conv_int_num)
{
  /* Fold even across PHI to avoid expensive num->int conversions in loop. */
  if ((fleft->op2 & IRCONV_SRCMASK) ==
      ((fins->op2 & IRCONV_DSTMASK) >> IRCONV_DSH))
    return fleft->op1;
  return NEXTFOLD;
}

/* int->num->i64/u64 reduces to a direct widening of the original int. */
LJFOLD(CONV CONV IRCONV_I64_NUM)  /* _INT or _U32 */
LJFOLD(CONV CONV IRCONV_U64_NUM)  /* _INT or _U32 */
LJFOLDF(simplify_conv_i64_num)
{
  PHIBARRIER(fleft);
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT) {
    /* Reduce to a sign-extension. */
    fins->op1 = fleft->op1;
    fins->op2 = ((IRT_I64<<5)|IRT_INT|IRCONV_SEXT);
    return RETRYFOLD;
  } else if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) {
#if LJ_TARGET_X64
    /* 32 bit ops implicitly zero-extend on x64, so no conversion needed. */
    return fleft->op1;
#else
    /* Reduce to a zero-extension. */
    fins->op1 = fleft->op1;
    fins->op2 = (IRT_I64<<5)|IRT_U32;
    return RETRYFOLD;
#endif
  }
  return NEXTFOLD;
}

/* int/u32 -> 64 bit -> int/u32: either cancels (same 32 bit type) or
** becomes a direct 32-to-32 bit conversion.
*/
LJFOLD(CONV CONV IRCONV_INT_I64)  /* _INT or _U32 */
LJFOLD(CONV CONV IRCONV_INT_U64)  /* _INT or _U32 */
LJFOLD(CONV CONV IRCONV_U32_I64)  /* _INT or _U32 */
LJFOLD(CONV CONV IRCONV_U32_U64)  /* _INT or _U32 */
LJFOLDF(simplify_conv_int_i64)
{
  int src;
  PHIBARRIER(fleft);
  src = (fleft->op2 & IRCONV_SRCMASK);
  if (src == IRT_INT || src == IRT_U32) {
    if (src == ((fins->op2 & IRCONV_DSTMASK) >> IRCONV_DSH)) {
      return fleft->op1;
    } else {
      fins->op2 = ((fins->op2 & IRCONV_DSTMASK) | src);
      fins->op1 = fleft->op1;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

/* float->num->float cancels out. */
LJFOLD(CONV CONV IRCONV_FLOAT_NUM)  /* _FLOAT */
LJFOLDF(simplify_conv_flt_num)
{
  PHIBARRIER(fleft);
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_FLOAT)
    return fleft->op1;
  return NEXTFOLD;
}

/* Shortcut TOBIT + IRT_NUM <- IRT_INT/IRT_U32 conversion. */
LJFOLD(TOBIT CONV KNUM)
LJFOLDF(simplify_tobit_conv)
{
  /* Fold even across PHI to avoid expensive num->int conversions in loop. */
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT) {
    lua_assert(irt_isnum(fleft->t));
    return fleft->op1;
  } else if ((fleft->op2 & IRCONV_SRCMASK) == IRT_U32) {
    lua_assert(irt_isnum(fleft->t));
    fins->o = IR_CONV;
    fins->op1 = fleft->op1;
    fins->op2 = (IRT_INT<<5)|IRT_U32;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* Shortcut floor/ceil/round + IRT_NUM <- IRT_INT/IRT_U32 conversion. */
LJFOLD(FPMATH CONV IRFPM_FLOOR)
LJFOLD(FPMATH CONV IRFPM_CEIL)
LJFOLD(FPMATH CONV IRFPM_TRUNC)
LJFOLDF(simplify_floor_conv)
{
  /* A number converted from an integer is already integral. */
  if ((fleft->op2 & IRCONV_SRCMASK) == IRT_INT ||
      (fleft->op2 & IRCONV_SRCMASK) == IRT_U32)
    return LEFTFOLD;
  return NEXTFOLD;
}

/* Strength reduction of widening. */
LJFOLD(CONV any IRCONV_I64_INT)
LJFOLD(CONV any IRCONV_U64_INT)
LJFOLDF(simplify_conv_sext)
{
  IRRef ref = fins->op1;
  int64_t ofs = 0;
  if (!(fins->op2 & IRCONV_SEXT))
    return NEXTFOLD;
  PHIBARRIER(fleft);
  /* u8/u16 XLOADs are provably non-negative, so sign- == zero-extension. */
  if (fleft->o == IR_XLOAD && (irt_isu8(fleft->t) || irt_isu16(fleft->t)))
    goto ok_reduce;
  if (fleft->o == IR_ADD && irref_isk(fleft->op2)) {
    ofs = (int64_t)IR(fleft->op2)->i;
    ref = fleft->op1;
  }
  /* Use scalar evolution analysis results to strength-reduce sign-extension. */
  if (ref == J->scev.idx) {
    IRRef lo = J->scev.dir ? J->scev.start : J->scev.stop;
    lua_assert(irt_isint(J->scev.t));
    if (lo && IR(lo)->o == IR_KINT && IR(lo)->i + ofs >= 0) {
    ok_reduce:
#if LJ_TARGET_X64
      /* Eliminate widening. All 32 bit ops do an implicit zero-extension. */
      return LEFTFOLD;
#else
      /* Reduce to a (cheaper) zero-extension. */
      fins->op2 &= ~IRCONV_SEXT;
      return RETRYFOLD;
#endif
    }
  }
  return NEXTFOLD;
}

/* Strength reduction of narrowing: push the truncation below the
** arithmetic op (valid for +,-,* since truncation is a ring homomorphism).
*/
LJFOLD(CONV ADD IRCONV_INT_I64)
LJFOLD(CONV SUB IRCONV_INT_I64)
LJFOLD(CONV MUL IRCONV_INT_I64)
LJFOLD(CONV ADD IRCONV_INT_U64)
LJFOLD(CONV SUB IRCONV_INT_U64)
LJFOLD(CONV MUL IRCONV_INT_U64)
LJFOLD(CONV ADD IRCONV_U32_I64)
LJFOLD(CONV SUB IRCONV_U32_I64)
LJFOLD(CONV MUL IRCONV_U32_I64)
LJFOLD(CONV ADD IRCONV_U32_U64)
LJFOLD(CONV SUB IRCONV_U32_U64)
LJFOLD(CONV MUL IRCONV_U32_U64)
LJFOLDF(simplify_conv_narrow)
{
  IROp op = (IROp)fleft->o;
  IRType t = irt_type(fins->t);
  IRRef op1 = fleft->op1, op2 = fleft->op2, mode = fins->op2;
  PHIBARRIER(fleft);
  op1 = emitir(IRTI(IR_CONV), op1, mode);
  op2 = emitir(IRTI(IR_CONV), op2, mode);
  fins->ot = IRT(op, t);
  fins->op1 = op1;
  fins->op2 = op2;
  return RETRYFOLD;
}

/* Special CSE rule for CONV. */
LJFOLD(CONV any any)
LJFOLDF(cse_conv)
{
  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
    IRRef op1 = fins->op1, op2 = (fins->op2 & IRCONV_MODEMASK);
    uint8_t guard = irt_isguard(fins->t);
    IRRef ref = J->chain[IR_CONV];
    while (ref > op1) {
      IRIns *ir = IR(ref);
      /* Commoning with stronger checks is ok. */
      if (ir->op1 == op1 && (ir->op2 & IRCONV_MODEMASK) == op2 &&
          irt_isguard(ir->t) >= guard)
        return ref;
      ref = ir->prev;
    }
  }
  return EMITFOLD;  /* No fallthrough to regular CSE. */
}

/* FP conversion narrowing. */
LJFOLD(TOBIT ADD KNUM)
LJFOLD(TOBIT SUB KNUM)
LJFOLD(CONV ADD IRCONV_INT_NUM)
LJFOLD(CONV SUB IRCONV_INT_NUM)
LJFOLD(CONV ADD IRCONV_I64_NUM)
LJFOLD(CONV SUB IRCONV_I64_NUM)
LJFOLDF(narrow_convert)
{
  PHIBARRIER(fleft);
  /* Narrowing ignores PHIs and repeating it inside the loop is not useful. */
  if (J->chain[IR_LOOP])
    return NEXTFOLD;
  lua_assert(fins->o != IR_CONV || (fins->op2&IRCONV_CONVMASK) != IRCONV_TOBIT);
  return lj_opt_narrow_convert(J);
}
1308 
/* -- Integer algebraic simplifications ----------------------------------- */

/* Adding/subtracting integer zero is a no-op (exact, unlike FP). */
LJFOLD(ADD any KINT)
LJFOLD(ADDOV any KINT)
LJFOLD(SUBOV any KINT)
LJFOLDF(simplify_intadd_k)
{
  if (fright->i == 0)  /* i o 0 ==> i */
    return LEFTFOLD;
  return NEXTFOLD;
}

/* Overflow-checked multiply by small constants.
** NOTE(review): name matches the static helper simplify_intmul_k below;
** LJFOLDF presumably prefixes the generated function name (upstream names
** this rule simplify_intmulov_k) -- confirm there is no collision.
*/
LJFOLD(MULOV any KINT)
LJFOLDF(simplify_intmul_k)
{
  if (fright->i == 0)  /* i * 0 ==> 0 */
    return RIGHTFOLD;
  if (fright->i == 1)  /* i * 1 ==> i */
    return LEFTFOLD;
  if (fright->i == 2) {  /* i * 2 ==> i + i */
    fins->o = IR_ADDOV;
    fins->op2 = fins->op1;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* Canonicalize subtraction of a constant into addition of its negation. */
LJFOLD(SUB any KINT)
LJFOLDF(simplify_intsub_k)
{
  if (fright->i == 0)  /* i - 0 ==> i */
    return LEFTFOLD;
  fins->o = IR_ADD;  /* i - k ==> i + (-k) */
  fins->op2 = (IRRef1)lj_ir_kint(J, -fright->i);  /* Overflow for -2^31 ok. */
  return RETRYFOLD;
}

LJFOLD(SUB KINT any)
LJFOLD(SUB KINT64 any)
LJFOLDF(simplify_intsub_kleft)
{
  if (fleft->o == IR_KINT ? (fleft->i == 0) : (ir_kint64(fleft)->u64 == 0)) {
    fins->o = IR_NEG;  /* 0 - i ==> -i */
    fins->op1 = fins->op2;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

LJFOLD(ADD any KINT64)
LJFOLDF(simplify_intadd_k64)
{
  if (ir_kint64(fright)->u64 == 0)  /* i + 0 ==> i */
    return LEFTFOLD;
  return NEXTFOLD;
}

LJFOLD(SUB any KINT64)
LJFOLDF(simplify_intsub_k64)
{
  uint64_t k = ir_kint64(fright)->u64;
  if (k == 0)  /* i - 0 ==> i */
    return LEFTFOLD;
  fins->o = IR_ADD;  /* i - k ==> i + (-k) */
  fins->op2 = (IRRef1)lj_ir_kint64(J, (uint64_t)-(int64_t)k);
  return RETRYFOLD;
}

/* Shared helper: strength-reduce an unchecked multiply by a non-negative
** 32 bit constant (used by both the 32 and 64 bit MUL rules below).
*/
static TRef simplify_intmul_k(jit_State *J, int32_t k)
{
  /* Note: many more simplifications are possible, e.g. 2^k1 +- 2^k2.
  ** But this is mainly intended for simple address arithmetic.
  ** Also it's easier for the backend to optimize the original multiplies.
  */
  if (k == 0) {  /* i * 0 ==> 0 */
    return RIGHTFOLD;
  } else if (k == 1) {  /* i * 1 ==> i */
    return LEFTFOLD;
  } else if ((k & (k-1)) == 0) {  /* i * 2^k ==> i << k */
    fins->o = IR_BSHL;
    fins->op2 = lj_ir_kint(J, lj_fls((uint32_t)k));
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

LJFOLD(MUL any KINT)
LJFOLDF(simplify_intmul_k32)
{
  if (fright->i >= 0)
    return simplify_intmul_k(J, fright->i);
  return NEXTFOLD;
}

LJFOLD(MUL any KINT64)
LJFOLDF(simplify_intmul_k64)
{
#if LJ_HASFFI
  /* Only constants < 2^31 fit the 32 bit helper. */
  if (ir_kint64(fright)->u64 < 0x80000000u)
    return simplify_intmul_k(J, (int32_t)ir_kint64(fright)->u64);
  return NEXTFOLD;
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

/* Positive power-of-two modulus becomes a bitwise AND. */
LJFOLD(MOD any KINT)
LJFOLDF(simplify_intmod_k)
{
  int32_t k = fright->i;
  lua_assert(k != 0);
  if (k > 0 && (k & (k-1)) == 0) {  /* i % (2^k) ==> i & (2^k-1) */
    fins->o = IR_BAND;
    fins->op2 = lj_ir_kint(J, k-1);
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* 0 % i ==> 0. */
LJFOLD(MOD KINT any)
LJFOLDF(simplify_intmod_kleft)
{
  if (fleft->i == 0)
    return INTFOLD(0);
  return NEXTFOLD;
}

/* i - i ==> 0 (integer only; fires for SUBOV too since 0 never overflows). */
LJFOLD(SUB any any)
LJFOLD(SUBOV any any)
LJFOLDF(simplify_intsub)
{
  if (fins->op1 == fins->op2 && !irt_isnum(fins->t))  /* i - i ==> 0 */
    return irt_is64(fins->t) ? INT64FOLD(0) : INTFOLD(0);
  return NEXTFOLD;
}

LJFOLD(SUB ADD any)
LJFOLDF(simplify_intsubadd_leftcancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fleft);
    if (fins->op2 == fleft->op1)  /* (i + j) - i ==> j */
      return fleft->op2;
    if (fins->op2 == fleft->op2)  /* (i + j) - j ==> i */
      return fleft->op1;
  }
  return NEXTFOLD;
}

LJFOLD(SUB SUB any)
LJFOLDF(simplify_intsubsub_leftcancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fleft);
    if (fins->op2 == fleft->op1) {  /* (i - j) - i ==> 0 - j */
      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
      fins->op2 = fleft->op2;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

LJFOLD(SUB any SUB)
LJFOLDF(simplify_intsubsub_rightcancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fright);
    if (fins->op1 == fright->op1)  /* i - (i - j) ==> j */
      return fright->op2;
  }
  return NEXTFOLD;
}

LJFOLD(SUB any ADD)
LJFOLDF(simplify_intsubadd_rightcancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fright);
    if (fins->op1 == fright->op1) {  /* i - (i + j) ==> 0 - j */
      fins->op2 = fright->op2;
      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
      return RETRYFOLD;
    }
    if (fins->op1 == fright->op2) {  /* i - (j + i) ==> 0 - j */
      fins->op2 = fright->op1;
      fins->op1 = (IRRef1)lj_ir_kint(J, 0);
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

/* Cancel a shared addend between two ADDs (all four operand positions). */
LJFOLD(SUB ADD ADD)
LJFOLDF(simplify_intsubaddadd_cancel)
{
  if (!irt_isnum(fins->t)) {
    PHIBARRIER(fleft);
    PHIBARRIER(fright);
    if (fleft->op1 == fright->op1) {  /* (i + j1) - (i + j2) ==> j1 - j2 */
      fins->op1 = fleft->op2;
      fins->op2 = fright->op2;
      return RETRYFOLD;
    }
    if (fleft->op1 == fright->op2) {  /* (i + j1) - (j2 + i) ==> j1 - j2 */
      fins->op1 = fleft->op2;
      fins->op2 = fright->op1;
      return RETRYFOLD;
    }
    if (fleft->op2 == fright->op1) {  /* (j1 + i) - (i + j2) ==> j1 - j2 */
      fins->op1 = fleft->op1;
      fins->op2 = fright->op2;
      return RETRYFOLD;
    }
    if (fleft->op2 == fright->op2) {  /* (j1 + i) - (j2 + i) ==> j1 - j2 */
      fins->op1 = fleft->op1;
      fins->op2 = fright->op1;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}
1531 
/* Identity/absorbing element for AND: 0 absorbs, all-ones is identity. */
LJFOLD(BAND any KINT)
LJFOLD(BAND any KINT64)
LJFOLDF(simplify_band_k)
{
  int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
                                     (int64_t)ir_k64(fright)->u64;
  if (k == 0)  /* i & 0 ==> 0 */
    return RIGHTFOLD;
  if (k == -1)  /* i & -1 ==> i */
    return LEFTFOLD;
  return NEXTFOLD;
}

/* Identity/absorbing element for OR: 0 is identity, all-ones absorbs. */
LJFOLD(BOR any KINT)
LJFOLD(BOR any KINT64)
LJFOLDF(simplify_bor_k)
{
  int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
                                     (int64_t)ir_k64(fright)->u64;
  if (k == 0)  /* i | 0 ==> i */
    return LEFTFOLD;
  if (k == -1)  /* i | -1 ==> -1 */
    return RIGHTFOLD;
  return NEXTFOLD;
}

/* XOR with 0 is identity; XOR with all-ones is bitwise NOT. */
LJFOLD(BXOR any KINT)
LJFOLD(BXOR any KINT64)
LJFOLDF(simplify_bxor_k)
{
  int64_t k = fright->o == IR_KINT ? (int64_t)fright->i :
                                     (int64_t)ir_k64(fright)->u64;
  if (k == 0)  /* i xor 0 ==> i */
    return LEFTFOLD;
  if (k == -1) {  /* i xor -1 ==> ~i */
    fins->o = IR_BNOT;
    fins->op2 = 0;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* Shifts/rotates by a constant: drop no-ops, canonicalize the count
** modulo the type width, and unify rotate direction where the target
** has only one rotate instruction.
*/
LJFOLD(BSHL any KINT)
LJFOLD(BSHR any KINT)
LJFOLD(BSAR any KINT)
LJFOLD(BROL any KINT)
LJFOLD(BROR any KINT)
LJFOLDF(simplify_shift_ik)
{
  int32_t mask = irt_is64(fins->t) ? 63 : 31;
  int32_t k = (fright->i & mask);
  if (k == 0)  /* i o 0 ==> i */
    return LEFTFOLD;
  if (k == 1 && fins->o == IR_BSHL) {  /* i << 1 ==> i + i */
    fins->o = IR_ADD;
    fins->op2 = fins->op1;
    return RETRYFOLD;
  }
  if (k != fright->i) {  /* i o k ==> i o (k & mask) */
    fins->op2 = (IRRef1)lj_ir_kint(J, k);
    return RETRYFOLD;
  }
#ifndef LJ_TARGET_UNIFYROT
  if (fins->o == IR_BROR) {  /* bror(i, k) ==> brol(i, (-k)&mask) */
    fins->o = IR_BROL;
    fins->op2 = (IRRef1)lj_ir_kint(J, (-k)&mask);
    return RETRYFOLD;
  }
#endif
  return NEXTFOLD;
}

/* Drop a redundant masking of the shift count when the target's shift
** instruction masks the count itself (LJ_TARGET_MASKSHIFT/MASKROT).
*/
LJFOLD(BSHL any BAND)
LJFOLD(BSHR any BAND)
LJFOLD(BSAR any BAND)
LJFOLD(BROL any BAND)
LJFOLD(BROR any BAND)
LJFOLDF(simplify_shift_andk)
{
  IRIns *irk = IR(fright->op2);
  PHIBARRIER(fright);
  if ((fins->o < IR_BROL ? LJ_TARGET_MASKSHIFT : LJ_TARGET_MASKROT) &&
      irk->o == IR_KINT) {  /* i o (j & mask) ==> i o j */
    int32_t mask = irt_is64(fins->t) ? 63 : 31;
    int32_t k = irk->i & mask;
    if (k == mask) {
      fins->op2 = fright->op1;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

/* Shifting a zero constant yields zero for logical shifts. */
LJFOLD(BSHL KINT any)
LJFOLD(BSHR KINT any)
LJFOLD(BSHL KINT64 any)
LJFOLD(BSHR KINT64 any)
LJFOLDF(simplify_shift1_ki)
{
  int64_t k = fleft->o == IR_KINT ? (int64_t)fleft->i :
                                    (int64_t)ir_k64(fleft)->u64;
  if (k == 0)  /* 0 o i ==> 0 */
    return LEFTFOLD;
  return NEXTFOLD;
}

/* Arithmetic shifts/rotates of 0 and -1 are fixed points. */
LJFOLD(BSAR KINT any)
LJFOLD(BROL KINT any)
LJFOLD(BROR KINT any)
LJFOLD(BSAR KINT64 any)
LJFOLD(BROL KINT64 any)
LJFOLD(BROR KINT64 any)
LJFOLDF(simplify_shift2_ki)
{
  int64_t k = fleft->o == IR_KINT ? (int64_t)fleft->i :
                                    (int64_t)ir_k64(fleft)->u64;
  if (k == 0 || k == -1)  /* 0 o i ==> 0; -1 o i ==> -1 */
    return LEFTFOLD;
  return NEXTFOLD;
}

/* Distribute a constant shift over a constant AND mask. */
LJFOLD(BSHL BAND KINT)
LJFOLD(BSHR BAND KINT)
LJFOLD(BROL BAND KINT)
LJFOLD(BROR BAND KINT)
LJFOLDF(simplify_shiftk_andk)
{
  IRIns *irk = IR(fleft->op2);
  PHIBARRIER(fleft);
  if (irk->o == IR_KINT) {  /* (i & k1) o k2 ==> (i o k2) & (k1 o k2) */
    int32_t k = kfold_intop(irk->i, fright->i, (IROp)fins->o);
    fins->op1 = fleft->op1;
    fins->op1 = (IRRef1)lj_opt_fold(J);  /* Fold the inner shift first. */
    fins->op2 = (IRRef1)lj_ir_kint(J, k);
    fins->ot = IRTI(IR_BAND);
    return RETRYFOLD;
  } else if (irk->o == IR_KINT64) {
    uint64_t k = kfold_int64arith(ir_k64(irk)->u64, fright->i, (IROp)fins->o);
    IROpT ot = fleft->ot;
    fins->op1 = fleft->op1;
    fins->op1 = (IRRef1)lj_opt_fold(J);
    fins->op2 = (IRRef1)lj_ir_kint64(J, k);
    fins->ot = ot;
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* Drop a mask that cannot clear any bit left over by the shift. */
LJFOLD(BAND BSHL KINT)
LJFOLD(BAND BSHR KINT)
LJFOLDF(simplify_andk_shiftk)
{
  IRIns *irk = IR(fleft->op2);
  if (irk->o == IR_KINT &&
      kfold_intop(-1, irk->i, (IROp)fleft->o) == fright->i)
    return LEFTFOLD;  /* (i o k1) & k2 ==> i, if (-1 o k1) == k2 */
  return NEXTFOLD;
}

LJFOLD(BAND BOR KINT)
LJFOLD(BOR BAND KINT)
LJFOLDF(simplify_andor_k)
{
  IRIns *irk = IR(fleft->op2);
  PHIBARRIER(fleft);
  if (irk->o == IR_KINT) {
    int32_t k = kfold_intop(irk->i, fright->i, (IROp)fins->o);
    /* (i | k1) & k2 ==> i & k2, if (k1 & k2) == 0. */
    /* (i & k1) | k2 ==> i | k2, if (k1 | k2) == -1. */
    if (k == (fins->o == IR_BAND ? 0 : -1)) {
      fins->op1 = fleft->op1;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
}

/* 64 bit variant of simplify_andor_k (FFI only). */
LJFOLD(BAND BOR KINT64)
LJFOLD(BOR BAND KINT64)
LJFOLDF(simplify_andor_k64)
{
#if LJ_HASFFI
  IRIns *irk = IR(fleft->op2);
  PHIBARRIER(fleft);
  if (irk->o == IR_KINT64) {
    uint64_t k = kfold_int64arith(ir_k64(irk)->u64,
                                  ir_k64(fright)->u64, (IROp)fins->o);
    /* (i | k1) & k2 ==> i & k2, if (k1 & k2) == 0. */
    /* (i & k1) | k2 ==> i | k2, if (k1 | k2) == -1. */
    if (k == (fins->o == IR_BAND ? (uint64_t)0 : ~(uint64_t)0)) {
      fins->op1 = fleft->op1;
      return RETRYFOLD;
    }
  }
  return NEXTFOLD;
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}
1731 
/* -- Reassociation ------------------------------------------------------- */

/* (i o k1) o k2 ==> i o (k1 o k2) -- combine two constant operands. */
LJFOLD(ADD ADD KINT)
LJFOLD(MUL MUL KINT)
LJFOLD(BAND BAND KINT)
LJFOLD(BOR BOR KINT)
LJFOLD(BXOR BXOR KINT)
LJFOLDF(reassoc_intarith_k)
{
  IRIns *irk = IR(fleft->op2);
  if (irk->o == IR_KINT) {
    int32_t k = kfold_intop(irk->i, fright->i, (IROp)fins->o);
    if (k == irk->i)  /* (i o k1) o k2 ==> i o k1, if (k1 o k2) == k1. */
      return LEFTFOLD;
    PHIBARRIER(fleft);
    fins->op1 = fleft->op1;
    fins->op2 = (IRRef1)lj_ir_kint(J, k);
    return RETRYFOLD;  /* (i o k1) o k2 ==> i o (k1 o k2) */
  }
  return NEXTFOLD;
}

/* 64 bit variant of reassoc_intarith_k (FFI only). */
LJFOLD(ADD ADD KINT64)
LJFOLD(MUL MUL KINT64)
LJFOLD(BAND BAND KINT64)
LJFOLD(BOR BOR KINT64)
LJFOLD(BXOR BXOR KINT64)
LJFOLDF(reassoc_intarith_k64)
{
#if LJ_HASFFI
  IRIns *irk = IR(fleft->op2);
  if (irk->o == IR_KINT64) {
    uint64_t k = kfold_int64arith(ir_k64(irk)->u64,
                                  ir_k64(fright)->u64, (IROp)fins->o);
    PHIBARRIER(fleft);
    fins->op1 = fleft->op1;
    fins->op2 = (IRRef1)lj_ir_kint64(J, k);
    return RETRYFOLD;  /* (i o k1) o k2 ==> i o (k1 o k2) */
  }
  return NEXTFOLD;
#else
  UNUSED(J); lua_assert(0); return FAILFOLD;
#endif
}

/* Idempotent ops: reapplying an operand already in the expression. */
LJFOLD(MIN MIN any)
LJFOLD(MAX MAX any)
LJFOLD(BAND BAND any)
LJFOLD(BOR BOR any)
LJFOLDF(reassoc_dup)
{
  if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
    return LEFTFOLD;  /* (a o b) o a ==> a o b; (a o b) o b ==> a o b */
  return NEXTFOLD;
}

/* XOR is its own inverse: a repeated operand cancels. */
LJFOLD(BXOR BXOR any)
LJFOLDF(reassoc_bxor)
{
  PHIBARRIER(fleft);
  if (fins->op2 == fleft->op1)  /* (a xor b) xor a ==> b */
    return fleft->op2;
  if (fins->op2 == fleft->op2)  /* (a xor b) xor b ==> a */
    return fleft->op1;
  return NEXTFOLD;
}

/* Combine two constant shift counts, saturating/wrapping as each op
** requires when the sum exceeds the type width.
*/
LJFOLD(BSHL BSHL KINT)
LJFOLD(BSHR BSHR KINT)
LJFOLD(BSAR BSAR KINT)
LJFOLD(BROL BROL KINT)
LJFOLD(BROR BROR KINT)
LJFOLDF(reassoc_shift)
{
  IRIns *irk = IR(fleft->op2);
  PHIBARRIER(fleft);  /* The (shift any KINT) rule covers k2 == 0 and more. */
  if (irk->o == IR_KINT) {  /* (i o k1) o k2 ==> i o (k1 + k2) */
    int32_t mask = irt_is64(fins->t) ? 63 : 31;
    int32_t k = (irk->i & mask) + (fright->i & mask);
    if (k > mask) {  /* Combined shift too wide? */
      if (fins->o == IR_BSHL || fins->o == IR_BSHR)
        return mask == 31 ? INTFOLD(0) : INT64FOLD(0);  /* All bits gone. */
      else if (fins->o == IR_BSAR)
        k = mask;  /* Saturate: result is all sign bits. */
      else
        k &= mask;  /* Rotates wrap around. */
    }
    fins->op1 = fleft->op1;
    fins->op2 = (IRRef1)lj_ir_kint(J, k);
    return RETRYFOLD;
  }
  return NEXTFOLD;
}

/* Combine two constant min/max bounds (num and int variants). */
LJFOLD(MIN MIN KNUM)
LJFOLD(MAX MAX KNUM)
LJFOLD(MIN MIN KINT)
LJFOLD(MAX MAX KINT)
LJFOLDF(reassoc_minmax_k)
{
  IRIns *irk = IR(fleft->op2);
  if (irk->o == IR_KNUM) {
    lua_Number a = ir_knum(irk)->n;
    lua_Number y = lj_vm_foldarith(a, knumright, fins->o - IR_ADD);
    if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
      return LEFTFOLD;
    PHIBARRIER(fleft);
    fins->op1 = fleft->op1;
    fins->op2 = (IRRef1)lj_ir_knum(J, y);
    return RETRYFOLD;  /* (x o k1) o k2 ==> x o (k1 o k2) */
  } else if (irk->o == IR_KINT) {
    int32_t a = irk->i;
    int32_t y = kfold_intop(a, fright->i, fins->o);
    if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
      return LEFTFOLD;
    PHIBARRIER(fleft);
    fins->op1 = fleft->op1;
    fins->op2 = (IRRef1)lj_ir_kint(J, y);
    return RETRYFOLD;  /* (x o k1) o k2 ==> x o (k1 o k2) */
  }
  return NEXTFOLD;
}

/* min(max(a,b),b) ==> b and the mirrored forms. */
LJFOLD(MIN MAX any)
LJFOLD(MAX MIN any)
LJFOLDF(reassoc_minmax_left)
{
  if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
    return RIGHTFOLD;  /* (b o1 a) o2 b ==> b; (a o1 b) o2 b ==> b */
  return NEXTFOLD;
}

LJFOLD(MIN any MAX)
LJFOLD(MAX any MIN)
LJFOLDF(reassoc_minmax_right)
{
  if (fins->op1 == fright->op1 || fins->op1 == fright->op2)
    return LEFTFOLD;  /* a o2 (a o1 b) ==> a; a o2 (b o1 a) ==> a */
  return NEXTFOLD;
}
1872 
1873 /* -- Array bounds check elimination -------------------------------------- */
1874 
/* Eliminate ABC across PHIs to handle t[i-1] forwarding case.
** ABC(asize, (i+k)+(-k)) ==> ABC(asize, i), but only if it already exists.
** Could be generalized to (i+k1)+k2 ==> i+(k1+k2), but needs better disambig.
*/
LJFOLD(ABC any ADD)
LJFOLDF(abc_fwd)
{
  if (LJ_LIKELY(J->flags & JIT_F_OPT_ABC)) {
    if (irref_isk(fright->op2)) {
      IRIns *add2 = IR(fright->op1);
      /* Match the (i+k)+(-k) shape: inner ADD whose constant is the exact
      ** negation of the outer constant, i.e. the index nets out to i.
      */
      if (add2->o == IR_ADD && irref_isk(add2->op2) &&
          IR(fright->op2)->i == -IR(add2->op2)->i) {
        IRRef ref = J->chain[IR_ABC];
        IRRef lim = add2->op1;
        if (fins->op1 > lim) lim = fins->op1;
        /* Search the ABC chain for an existing ABC(asize, i). The search
        ** stops below both operand refs; only a pre-existing check makes
        ** dropping this one safe.
        */
        while (ref > lim) {
          IRIns *ir = IR(ref);
          if (ir->op1 == fins->op1 && ir->op2 == add2->op1)
            return DROPFOLD;  /* Covered by the earlier bounds check. */
          ref = ir->prev;
        }
      }
    }
  }
  return NEXTFOLD;
}
1901 
1902 /* Eliminate ABC for constants.
1903 ** ABC(asize, k1), ABC(asize k2) ==> ABC(asize, max(k1, k2))
1904 ** Drop second ABC if k2 is lower. Otherwise patch first ABC with k2.
1905 */
1906 LJFOLD(ABC any KINT)
1907 LJFOLDF(abc_k)
1908 {
1909   if (LJ_LIKELY(J->flags & JIT_F_OPT_ABC)) {
1910     IRRef ref = J->chain[IR_ABC];
1911     IRRef asize = fins->op1;
1912     while (ref > asize) {
1913       IRIns *ir = IR(ref);
1914       if (ir->op1 == asize && irref_isk(ir->op2)) {
1915         int32_t k = IR(ir->op2)->i;
1916         if (fright->i > k)
1917           ir->op2 = fins->op2;
1918         return DROPFOLD;
1919       }
1920       ref = ir->prev;
1921     }
1922     return EMITFOLD;  /* Already performed CSE. */
1923   }
1924   return NEXTFOLD;
1925 }
1926 
/* Eliminate invariant ABC inside loop. */
LJFOLD(ABC any any)
LJFOLDF(abc_invar)
{
  /* Invariant ABC marked as PTR. Drop if op1 is invariant, too. */
  /* NOTE(review): a non-int type tags the hoisted (invariant) variant of
  ** the check; op1 referenced below the LOOP marker and not a PHI means
  ** the array size is loop-invariant as well, so the duplicate is dead.
  */
  if (!irt_isint(fins->t) && fins->op1 < J->chain[IR_LOOP] &&
      !irt_isphi(IR(fins->op1)->t))
    return DROPFOLD;
  return NEXTFOLD;
}
1937 
1938 /* -- Commutativity ------------------------------------------------------- */
1939 
1940 /* The refs of commutative ops are canonicalized. Lower refs go to the right.
1941 ** Rationale behind this:
1942 ** - It (also) moves constants to the right.
1943 ** - It reduces the number of FOLD rules (e.g. (BOR any KINT) suffices).
1944 ** - It helps CSE to find more matches.
1945 ** - The assembler generates better code with constants at the right.
1946 */
1947 
1948 LJFOLD(ADD any any)
1949 LJFOLD(MUL any any)
1950 LJFOLD(ADDOV any any)
1951 LJFOLD(MULOV any any)
1952 LJFOLDF(comm_swap)
1953 {
1954   if (fins->op1 < fins->op2) {  /* Move lower ref to the right. */
1955     IRRef1 tmp = fins->op1;
1956     fins->op1 = fins->op2;
1957     fins->op2 = tmp;
1958     return RETRYFOLD;
1959   }
1960   return NEXTFOLD;
1961 }
1962 
LJFOLD(EQ any any)
LJFOLD(NE any any)
LJFOLDF(comm_equal)
{
  /* For non-numbers only: x == x ==> drop; x ~= x ==> fail */
  /* Numbers are excluded because x == x is false when x is NaN. */
  if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
    return CONDFOLD(fins->o == IR_EQ);
  return fold_comm_swap(J);  /* Otherwise canonicalize operand order. */
}
1972 
LJFOLD(LT any any)
LJFOLD(GE any any)
LJFOLD(LE any any)
LJFOLD(GT any any)
LJFOLD(ULT any any)
LJFOLD(UGE any any)
LJFOLD(ULE any any)
LJFOLD(UGT any any)
LJFOLDF(comm_comp)
{
  /* For non-numbers only: x <=> x ==> drop; x <> x ==> fail */
  /* Relies on the consecutive IR_LT..IR_UGT opcode encoding:
  ** ((o ^ (o>>1)) & 1) is 1 exactly for the reflexive comparisons
  ** (GE, LE, UGE, ULE), which are true for x vs. x.
  */
  if (fins->op1 == fins->op2 && !irt_isnum(fins->t))
    return CONDFOLD((fins->o ^ (fins->o >> 1)) & 1);
  if (fins->op1 < fins->op2) {  /* Move lower ref to the right. */
    IRRef1 tmp = fins->op1;
    fins->op1 = fins->op2;
    fins->op2 = tmp;
    fins->o ^= 3; /* GT <-> LT, GE <-> LE, does not affect U */
    return RETRYFOLD;
  }
  return NEXTFOLD;
}
1995 
LJFOLD(BAND any any)
LJFOLD(BOR any any)
LJFOLD(MIN any any)
LJFOLD(MAX any any)
LJFOLDF(comm_dup)
{
  /* These ops are idempotent: applying them to identical operands is a
  ** no-op, so either operand is the result.
  */
  if (fins->op1 == fins->op2)  /* x o x ==> x */
    return LEFTFOLD;
  return fold_comm_swap(J);  /* Otherwise canonicalize operand order. */
}
2006 
LJFOLD(BXOR any any)
LJFOLDF(comm_bxor)
{
  if (fins->op1 == fins->op2)  /* i xor i ==> 0 */
    return irt_is64(fins->t) ? INT64FOLD(0) : INTFOLD(0);  /* Width-correct zero. */
  return fold_comm_swap(J);  /* Otherwise canonicalize operand order. */
}
2014 
2015 /* -- Simplification of compound expressions ------------------------------ */
2016 
2017 static TRef kfold_xload(jit_State *J, IRIns *ir, const void *p)
2018 {
2019   int32_t k;
2020   switch (irt_type(ir->t)) {
2021   case IRT_NUM: return lj_ir_knum_u64(J, *(uint64_t *)p);
2022   case IRT_I8: k = (int32_t)*(int8_t *)p; break;
2023   case IRT_U8: k = (int32_t)*(uint8_t *)p; break;
2024   case IRT_I16: k = (int32_t)(int16_t)lj_getu16(p); break;
2025   case IRT_U16: k = (int32_t)(uint16_t)lj_getu16(p); break;
2026   case IRT_INT: case IRT_U32: k = (int32_t)lj_getu32(p); break;
2027   case IRT_I64: case IRT_U64: return lj_ir_kint64(J, *(uint64_t *)p);
2028   default: return 0;
2029   }
2030   return lj_ir_kint(J, k);
2031 }
2032 
/* Turn: string.sub(str, a, b) == kstr
** into: string.byte(str, a) == string.byte(kstr, 1) etc.
** Note: this creates unaligned XLOADs on x86/x64.
*/
LJFOLD(EQ SNEW KGC)
LJFOLD(NE SNEW KGC)
LJFOLDF(merge_eqne_snew_kgc)
{
  GCstr *kstr = ir_kstr(fright);
  int32_t len = (int32_t)kstr->len;
  lua_assert(irt_isstr(fins->t));

#if LJ_TARGET_UNALIGNED
#define FOLD_SNEW_MAX_LEN       4  /* Handle string lengths 0, 1, 2, 3, 4. */
#define FOLD_SNEW_TYPE8         IRT_I8  /* Creates shorter immediates. */
#else
#define FOLD_SNEW_MAX_LEN       1  /* Handle string lengths 0 or 1. */
#define FOLD_SNEW_TYPE8         IRT_U8  /* Prefer unsigned loads. */
#endif

  PHIBARRIER(fleft);
  if (len <= FOLD_SNEW_MAX_LEN) {
    IROp op = (IROp)fins->o;
    IRRef strref = fleft->op1;
    if (IR(strref)->o != IR_STRREF)
      return NEXTFOLD;
    if (op == IR_EQ) {
      /* Equality additionally needs a guarded check that the lengths match. */
      emitir(IRTGI(IR_EQ), fleft->op2, lj_ir_kint(J, len));
      /* Caveat: fins/fleft/fright is no longer valid after emitir. */
    } else {
      /* NE is not expanded since this would need an OR of two conds. */
      if (!irref_isk(fleft->op2))  /* Only handle the constant length case. */
        return NEXTFOLD;
      if (IR(fleft->op2)->i != len)
        return DROPFOLD;  /* Different lengths: strings are always unequal. */
    }
    if (len > 0) {
      /* A 4 byte load for length 3 is ok -- all strings have an extra NUL. */
      uint16_t ot = (uint16_t)(len == 1 ? IRT(IR_XLOAD, FOLD_SNEW_TYPE8) :
                               len == 2 ? IRT(IR_XLOAD, IRT_U16) :
                               IRTI(IR_XLOAD));
      TRef tmp = emitir(ot, strref,
                        IRXLOAD_READONLY | (len > 1 ? IRXLOAD_UNALIGNED : 0));
      TRef val = kfold_xload(J, IR(tref_ref(tmp)), strdata(kstr));
      if (len == 3)
        /* Mask off the extra NUL byte; its position depends on endianness. */
        tmp = emitir(IRTI(IR_BAND), tmp,
                     lj_ir_kint(J, LJ_ENDIAN_SELECT(0x00ffffff, 0xffffff00)));
      fins->op1 = (IRRef1)tmp;
      fins->op2 = (IRRef1)val;
      fins->ot = (IROpT)IRTGI(op);
      return RETRYFOLD;  /* Retry as a plain integer comparison. */
    } else {
      return DROPFOLD;  /* Empty string: the length check alone suffices. */
    }
  }
  return NEXTFOLD;
}
2090 
2091 /* -- Loads --------------------------------------------------------------- */
2092 
2093 /* Loads cannot be folded or passed on to CSE in general.
2094 ** Alias analysis is needed to check for forwarding opportunities.
2095 **
2096 ** Caveat: *all* loads must be listed here or they end up at CSE!
2097 */
2098 
/* Array loads go through alias analysis / store-to-load forwarding. */
LJFOLD(ALOAD any)
LJFOLDX(lj_opt_fwd_aload)
2101 
/* From HREF fwd (see below). Must eliminate, not supported by fwd/backend. */
LJFOLD(HLOAD KKPTR)
LJFOLDF(kfold_hload_kkptr)
{
  UNUSED(J);
  /* The only KKPTR ever fed to HLOAD is the global nil TValue. */
  lua_assert(ir_kptr(fleft) == niltvg(J2G(J)));
  return TREF_NIL;
}
2110 
/* Hash/upvalue loads and table length go through alias analysis, too. */
LJFOLD(HLOAD any)
LJFOLDX(lj_opt_fwd_hload)

LJFOLD(ULOAD any)
LJFOLDX(lj_opt_fwd_uload)

LJFOLD(CALLL any IRCALL_lj_tab_len)
LJFOLDX(lj_opt_fwd_tab_len)
2119 
/* Upvalue refs are really loads, but there are no corresponding stores.
** So CSE is ok for them, except for UREFO across a GC step (see below).
** If the referenced function is const, its upvalue addresses are const, too.
** This can be used to improve CSE by looking for the same address,
** even if the upvalues originate from a different function.
*/
LJFOLD(UREFO KGC any)
LJFOLD(UREFC KGC any)
LJFOLDF(cse_uref)
{
  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
    IRRef ref = J->chain[fins->o];
    GCfunc *fn = ir_kfunc(fleft);
    /* op2's high byte holds the upvalue index into the closure. */
    GCupval *uv = gco2uv(gcref(fn->l.uvptr[(fins->op2 >> 8)]));
    while (ref > 0) {
      IRIns *ir = IR(ref);
      if (irref_isk(ir->op1)) {
        GCfunc *fn2 = ir_kfunc(IR(ir->op1));
        /* Match on the upvalue *object*, not the function constant. */
        if (gco2uv(gcref(fn2->l.uvptr[(ir->op2 >> 8)])) == uv) {
          /* Open upvalue addresses may change across a GC step (stack
          ** resize); then the earlier ref must not be reused.
          */
          if (fins->o == IR_UREFO && gcstep_barrier(J, ref))
            break;
          return ref;
        }
      }
      ref = ir->prev;
    }
  }
  return EMITFOLD;
}
2149 
/* Constant-key hash refs go through their own forwarding pass. */
LJFOLD(HREFK any any)
LJFOLDX(lj_opt_fwd_hrefk)
2152 
LJFOLD(HREF TNEW any)
LJFOLDF(fwd_href_tnew)
{
  /* A fresh table has an empty hash part; if no aliasing NEWREF could have
  ** added the key, the lookup must yield the global nil TValue.
  */
  if (lj_opt_fwd_href_nokey(J))
    return lj_ir_kkptr(J, niltvg(J2G(J)));
  return NEXTFOLD;
}
2160 
LJFOLD(HREF TDUP KPRI)
LJFOLD(HREF TDUP KGC)
LJFOLD(HREF TDUP KNUM)
LJFOLDF(fwd_href_tdup)
{
  TValue keyv;
  lj_ir_kvalue(J->L, &keyv, fright);
  /* TDUP op1 is the template table constant. If the constant key is nil in
  ** the template and no aliasing NEWREF intervened, the lookup yields nil.
  */
  if (lj_tab_get(J->L, ir_ktab(IR(fleft->op1)), &keyv) == niltvg(J2G(J)) &&
      lj_opt_fwd_href_nokey(J))
    return lj_ir_kkptr(J, niltvg(J2G(J)));
  return NEXTFOLD;
}
2173 
/* We can safely FOLD/CSE array/hash refs and field loads, since there
** are no corresponding stores. But we need to check for any NEWREF with
** an aliased table, as it may invalidate all of the pointers and fields.
** Only HREF needs the NEWREF check -- AREF and HREFK already depend on
** FLOADs. And NEWREF itself is treated like a store (see below).
** LREF is constant (per trace) since coroutine switches are not inlined.
*/
LJFOLD(FLOAD TNEW IRFL_TAB_ASIZE)
LJFOLDF(fload_tab_tnew_asize)
{
  /* TNEW op1 is the literal array size the table was created with. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
    return INTFOLD(fleft->op1);
  return NEXTFOLD;
}
2188 
LJFOLD(FLOAD TNEW IRFL_TAB_HMASK)
LJFOLDF(fload_tab_tnew_hmask)
{
  /* TNEW op2 is the log2 hash size; the mask is 2^op2 - 1. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
    return INTFOLD((1 << fleft->op2)-1);
  return NEXTFOLD;
}
2196 
LJFOLD(FLOAD TDUP IRFL_TAB_ASIZE)
LJFOLDF(fload_tab_tdup_asize)
{
  /* Read the array size directly from the TDUP template table constant. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
    return INTFOLD((int32_t)ir_ktab(IR(fleft->op1))->asize);
  return NEXTFOLD;
}
2204 
LJFOLD(FLOAD TDUP IRFL_TAB_HMASK)
LJFOLDF(fload_tab_tdup_hmask)
{
  /* Read the hash mask directly from the TDUP template table constant. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && lj_opt_fwd_tptr(J, fins->op1))
    return INTFOLD((int32_t)ir_ktab(IR(fleft->op1))->hmask);
  return NEXTFOLD;
}
2212 
LJFOLD(HREF any any)
LJFOLD(FLOAD any IRFL_TAB_ARRAY)
LJFOLD(FLOAD any IRFL_TAB_NODE)
LJFOLD(FLOAD any IRFL_TAB_ASIZE)
LJFOLD(FLOAD any IRFL_TAB_HMASK)
LJFOLDF(fload_tab_ah)
{
  /* CSE first, then verify no conflicting NEWREF invalidated the result. */
  TRef tr = lj_opt_cse(J);
  return lj_opt_fwd_tptr(J, tref_ref(tr)) ? tr : EMITFOLD;
}
2223 
/* Strings are immutable, so we can safely FOLD/CSE the related FLOAD. */
LJFOLD(FLOAD KGC IRFL_STR_LEN)
LJFOLDF(fload_str_len_kgc)
{
  /* Length of a constant string is itself a constant. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
    return INTFOLD((int32_t)ir_kstr(fleft)->len);
  return NEXTFOLD;
}
2232 
LJFOLD(FLOAD SNEW IRFL_STR_LEN)
LJFOLDF(fload_str_len_snew)
{
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
    PHIBARRIER(fleft);  /* Don't look through a PHI'd SNEW. */
    return fleft->op2;  /* SNEW op2 is the length the string was built with. */
  }
  return NEXTFOLD;
}
2242 
LJFOLD(FLOAD TOSTR IRFL_STR_LEN)
LJFOLDF(fload_str_len_tostr)
{
  /* A TOSTR in char mode always produces a one-character string. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD) && fleft->op2 == IRTOSTR_CHAR)
    return INTFOLD(1);
  return NEXTFOLD;
}
2250 
/* The C type ID of cdata objects is immutable. */
LJFOLD(FLOAD KGC IRFL_CDATA_CTYPEID)
LJFOLDF(fload_cdata_typeid_kgc)
{
  /* Read the type ID directly from the cdata constant. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
    return INTFOLD((int32_t)ir_kcdata(fleft)->ctypeid);
  return NEXTFOLD;
}
2259 
/* Get the contents of immutable cdata objects. */
LJFOLD(FLOAD KGC IRFL_CDATA_PTR)
LJFOLD(FLOAD KGC IRFL_CDATA_INT)
LJFOLD(FLOAD KGC IRFL_CDATA_INT64)
LJFOLDF(fload_cdata_int64_kgc)
{
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD)) {
    void *p = cdataptr(ir_kcdata(fleft));
    /* Pick the constant width from the result type of the load. */
    if (irt_is64(fins->t))
      return INT64FOLD(*(uint64_t *)p);
    else
      return INTFOLD(*(int32_t *)p);
  }
  return NEXTFOLD;
}
2275 
LJFOLD(FLOAD CNEW IRFL_CDATA_CTYPEID)
LJFOLD(FLOAD CNEWI IRFL_CDATA_CTYPEID)
LJFOLDF(fload_cdata_typeid_cnew)
{
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
    return fleft->op1;  /* No PHI barrier needed. CNEW/CNEWI op1 is const. */
  return NEXTFOLD;
}
2284 
/* Pointer, int and int64 cdata objects are immutable. */
LJFOLD(FLOAD CNEWI IRFL_CDATA_PTR)
LJFOLD(FLOAD CNEWI IRFL_CDATA_INT)
LJFOLD(FLOAD CNEWI IRFL_CDATA_INT64)
LJFOLDF(fload_cdata_ptr_int64_cnew)
{
  /* CNEWI op2 is the initializer value, which is also the loaded value. */
  if (LJ_LIKELY(J->flags & JIT_F_OPT_FOLD))
    return fleft->op2;  /* Fold even across PHI to avoid allocations. */
  return NEXTFOLD;
}
2295 
/* Immutable or store-free fields: plain CSE suffices. */
LJFOLD(FLOAD any IRFL_STR_LEN)
LJFOLD(FLOAD any IRFL_FUNC_ENV)
LJFOLD(FLOAD any IRFL_THREAD_ENV)
LJFOLD(FLOAD any IRFL_CDATA_CTYPEID)
LJFOLD(FLOAD any IRFL_CDATA_PTR)
LJFOLD(FLOAD any IRFL_CDATA_INT)
LJFOLD(FLOAD any IRFL_CDATA_INT64)
LJFOLD(VLOAD any any)  /* Vararg loads have no corresponding stores. */
LJFOLDX(lj_opt_cse)

/* All other field loads need alias analysis. */
LJFOLD(FLOAD any any)
LJFOLDX(lj_opt_fwd_fload)
2309 
/* This is for LOOP only. Recording handles SLOADs internally. */
LJFOLD(SLOAD any any)
LJFOLDF(fwd_sload)
{
  if ((fins->op2 & IRSLOAD_FRAME)) {
    /* Frame slots: CSE is only valid above the last RETF (base change). */
    TRef tr = lj_opt_cse(J);
    return tref_ref(tr) < J->chain[IR_RETF] ? EMITFOLD : tr;
  } else {
    /* Regular slots are tracked by the recorder in J->slot[]. */
    lua_assert(J->slot[fins->op1] != 0);
    return J->slot[fins->op1];
  }
}
2322 
2323 /* Only fold for KKPTR. The pointer _and_ the contents must be const. */
2324 LJFOLD(XLOAD KKPTR any)
2325 LJFOLDF(xload_kptr)
2326 {
2327   TRef tr = kfold_xload(J, fins, ir_kptr(fleft));
2328   return tr ? tr : NEXTFOLD;
2329 }
2330 
/* All other XLOADs go through alias analysis / forwarding. */
LJFOLD(XLOAD any any)
LJFOLDX(lj_opt_fwd_xload)
2333 
/* -- Write barriers ------------------------------------------------------ */

/* Write barriers are amenable to CSE, but not across any incremental
** GC steps.
**
** The same logic applies to open upvalue references, because a stack
** may be resized during a GC step (not the current stack, but maybe that
** of a coroutine).
*/
LJFOLD(TBAR any)
LJFOLD(OBAR any any)
LJFOLD(UREFO any any)
LJFOLDF(barrier_tab)
{
  TRef tr = lj_opt_cse(J);
  if (gcstep_barrier(J, tref_ref(tr)))  /* CSE across GC step? */
    return EMITFOLD;  /* Raw emit. Assumes fins is left intact by CSE. */
  return tr;
}
2353 
LJFOLD(TBAR TNEW)
LJFOLD(TBAR TDUP)
LJFOLDF(barrier_tnew_tdup)
{
  /* New tables are always white and never need a barrier. */
  if (fins->op1 < J->chain[IR_LOOP])  /* Except across a GC step. */
    return NEXTFOLD;
  return DROPFOLD;  /* Table allocated in this iteration: barrier is dead. */
}
2363 
2364 /* -- Profiling ----------------------------------------------------------- */
2365 
LJFOLD(PROF any any)
LJFOLDF(prof)
{
  /* Adjacent profiling markers are redundant; keep only one. */
  IRRef ref = J->chain[IR_PROF];
  if (ref+1 == J->cur.nins)  /* Drop neighbouring IR_PROF. */
    return ref;
  return EMITFOLD;
}
2374 
2375 /* -- Stores and allocations ---------------------------------------------- */
2376 
2377 /* Stores and allocations cannot be folded or passed on to CSE in general.
2378 ** But some stores can be eliminated with dead-store elimination (DSE).
2379 **
2380 ** Caveat: *all* stores and allocs must be listed here or they end up at CSE!
2381 */
2382 
/* Array/hash stores go through dead-store elimination. */
LJFOLD(ASTORE any any)
LJFOLD(HSTORE any any)
LJFOLDX(lj_opt_dse_ahstore)

LJFOLD(USTORE any any)
LJFOLDX(lj_opt_dse_ustore)

LJFOLD(FSTORE any any)
LJFOLDX(lj_opt_dse_fstore)

LJFOLD(XSTORE any any)
LJFOLDX(lj_opt_dse_xstore)

/* Everything below has side effects and must always be emitted verbatim. */
LJFOLD(NEWREF any any)  /* Treated like a store. */
LJFOLD(CALLA any any)
LJFOLD(CALLL any any)  /* Safeguard fallback. */
LJFOLD(CALLS any any)
LJFOLD(CALLXS any any)
LJFOLD(XBAR)
LJFOLD(RETF any any)  /* Modifies BASE. */
LJFOLD(TNEW any any)
LJFOLD(TDUP any)
LJFOLD(CNEW any any)
LJFOLD(XSNEW any any)
LJFOLD(BUFHDR any any)
LJFOLDX(lj_ir_emit)
2409 
2410 /* ------------------------------------------------------------------------ */
2411 
2412 /* Every entry in the generated hash table is a 32 bit pattern:
2413 **
2414 ** xxxxxxxx iiiiiii lllllll rrrrrrrrrr
2415 **
2416 **   xxxxxxxx = 8 bit index into fold function table
2417 **    iiiiiii = 7 bit folded instruction opcode
2418 **    lllllll = 7 bit left instruction opcode
** rrrrrrrrrr = 10 bit field: right instruction opcode or literal
2420 */
2421 
2422 #include "lj_folddef.h"
2423 
2424 /* ------------------------------------------------------------------------ */
2425 
2426 /* Fold IR instruction. */
/* Fold IR instruction. */
TRef LJ_FASTCALL lj_opt_fold(jit_State *J)
{
  uint32_t key, any;
  IRRef ref;

  if (LJ_UNLIKELY((J->flags & JIT_F_OPT_MASK) != JIT_F_OPT_DEFAULT)) {
    lua_assert(((JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE|JIT_F_OPT_DSE) |
                JIT_F_OPT_DEFAULT) == JIT_F_OPT_DEFAULT);
    /* Folding disabled? Chain to CSE, but not for loads/stores/allocs. */
    if (!(J->flags & JIT_F_OPT_FOLD) && irm_kind(lj_ir_mode[fins->o]) == IRM_N)
      return lj_opt_cse(J);

    /* No FOLD, forwarding or CSE? Emit raw IR for loads, except for SLOAD. */
    if ((J->flags & (JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE)) !=
                    (JIT_F_OPT_FOLD|JIT_F_OPT_FWD|JIT_F_OPT_CSE) &&
        irm_kind(lj_ir_mode[fins->o]) == IRM_L && fins->o != IR_SLOAD)
      return lj_ir_emit(J);

    /* No FOLD or DSE? Emit raw IR for stores. */
    if ((J->flags & (JIT_F_OPT_FOLD|JIT_F_OPT_DSE)) !=
                    (JIT_F_OPT_FOLD|JIT_F_OPT_DSE) &&
        irm_kind(lj_ir_mode[fins->o]) == IRM_S)
      return lj_ir_emit(J);
  }

  /* Fold engine start/retry point. */
retry:
  /* Construct key from opcode and operand opcodes (unless literal/none). */
  key = ((uint32_t)fins->o << 17);
  if (fins->op1 >= J->cur.nk) {
    key += (uint32_t)IR(fins->op1)->o << 10;
    /* Copy the operand instruction so rules can inspect it via fleft. */
    *fleft = *IR(fins->op1);
    if (fins->op1 < REF_TRUE)  /* 64 bit constants occupy two IR slots. */
      fleft[1] = IR(fins->op1)[1];
  }
  if (fins->op2 >= J->cur.nk) {
    key += (uint32_t)IR(fins->op2)->o;
    *fright = *IR(fins->op2);
    if (fins->op2 < REF_TRUE)  /* 64 bit constants occupy two IR slots. */
      fright[1] = IR(fins->op2)[1];
  } else {
    key += (fins->op2 & 0x3ffu);  /* Literal mask. Must include IRCONV_*MASK. */
  }

  /* Check for a match in order from most specific to least specific. */
  any = 0;
  for (;;) {
    /* 'any' progressively wildcards the right, then left operand bits. */
    uint32_t k = key | (any & 0x1ffff);
    uint32_t h = fold_hashkey(k);
    uint32_t fh = fold_hash[h];  /* Lookup key in semi-perfect hash table. */
    if ((fh & 0xffffff) == k || (fh = fold_hash[h+1], (fh & 0xffffff) == k)) {
      ref = (IRRef)tref_ref(fold_func[fh >> 24](J));
      if (ref != NEXTFOLD)
        break;
    }
    if (any == 0xfffff)  /* Exhausted folding. Pass on to CSE. */
      return lj_opt_cse(J);
    any = (any | (any >> 10)) ^ 0xffc00;
  }

  /* Return value processing, ordered by frequency. */
  if (LJ_LIKELY(ref >= MAX_FOLD))
    return TREF(ref, irt_t(IR(ref)->t));  /* Plain ref result. */
  if (ref == RETRYFOLD)
    goto retry;  /* Rule rewrote fins; rerun the whole engine. */
  if (ref == KINTFOLD)
    return lj_ir_kint(J, fins->i);
  if (ref == FAILFOLD)
    lj_trace_err(J, LJ_TRERR_GFAIL);  /* Guard is always false: abort trace. */
  lua_assert(ref == DROPFOLD);
  return REF_DROP;
}
2499 
2500 /* -- Common-Subexpression Elimination ------------------------------------ */
2501 
2502 /* CSE an IR instruction. This is very fast due to the skip-list chains. */
/* CSE an IR instruction. This is very fast due to the skip-list chains. */
TRef LJ_FASTCALL lj_opt_cse(jit_State *J)
{
  /* Avoid narrow to wide store-to-load forwarding stall */
  IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);
  IROp op = fins->o;
  if (LJ_LIKELY(J->flags & JIT_F_OPT_CSE)) {
    /* Limited search for same operands in per-opcode chain. */
    IRRef ref = J->chain[op];
    /* A match can only exist above both operand refs. */
    IRRef lim = fins->op1;
    if (fins->op2 > lim) lim = fins->op2;  /* Relies on lit < REF_BIAS. */
    while (ref > lim) {
      if (IR(ref)->op12 == op12)
        return TREF(ref, irt_t(IR(ref)->t));  /* Common subexpression found. */
      ref = IR(ref)->prev;
    }
  }
  /* Otherwise emit IR (inlined for speed). */
  {
    IRRef ref = lj_ir_nextins(J);
    IRIns *ir = IR(ref);
    ir->prev = J->chain[op];  /* Link into the per-opcode skip-list chain. */
    ir->op12 = op12;
    J->chain[op] = (IRRef1)ref;
    ir->o = fins->o;
    J->guardemit.irt |= fins->t.irt;  /* Track emitted guard types. */
    return TREF(ref, irt_t((ir->t = fins->t)));
  }
}
2531 
2532 /* CSE with explicit search limit. */
2533 TRef LJ_FASTCALL lj_opt_cselim(jit_State *J, IRRef lim)
2534 {
2535   IRRef ref = J->chain[fins->o];
2536   IRRef2 op12 = (IRRef2)fins->op1 + ((IRRef2)fins->op2 << 16);
2537   while (ref > lim) {
2538     if (IR(ref)->op12 == op12)
2539       return ref;
2540     ref = IR(ref)->prev;
2541   }
2542   return lj_ir_emit(J);
2543 }
2544 
2545 /* ------------------------------------------------------------------------ */
2546 
2547 #undef IR
2548 #undef fins
2549 #undef fleft
2550 #undef fright
2551 #undef knumleft
2552 #undef knumright
2553 #undef emitir
2554 
2555 #endif

/* [<][>][^][v][top][bottom][index][help] */