root/lj_lex.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. fillbuf
  2. save_grow
  3. save
  4. inclinenumber
  5. lex_number
  6. skip_sep
  7. read_long_string
  8. read_string
  9. llex
  10. lj_lex_setup
  11. lj_lex_cleanup
  12. lj_lex_next
  13. lj_lex_lookahead
  14. lj_lex_token2str
  15. lj_lex_error
  16. lj_lex_init

   1 /*
   2 ** Lexical analyzer.
   3 ** Copyright (C) 2005-2017 Mike Pall. See Copyright Notice in luajit.h
   4 **
   5 ** Major portions taken verbatim or adapted from the Lua interpreter.
   6 ** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
   7 */
   8 
   9 #define lj_lex_c
  10 #define LUA_CORE
  11 
  12 #include "lj_obj.h"
  13 #include "lj_gc.h"
  14 #include "lj_err.h"
  15 #include "lj_str.h"
  16 #if LJ_HASFFI
  17 #include "lj_tab.h"
  18 #include "lj_ctype.h"
  19 #include "lj_cdata.h"
  20 #include "lualib.h"
  21 #endif
  22 #include "lj_state.h"
  23 #include "lj_lex.h"
  24 #include "lj_parse.h"
  25 #include "lj_char.h"
  26 #include "lj_strscan.h"
  27 
  28 /* Lua lexer token names. */
  29 static const char *const tokennames[] = {
  30 #define TKSTR1(name)            #name,
  31 #define TKSTR2(name, sym)       #sym,
  32 TKDEF(TKSTR1, TKSTR2)
  33 #undef TKSTR1
  34 #undef TKSTR2
  35   NULL
  36 };
  37 
  38 /* -- Buffer handling ----------------------------------------------------- */
  39 
  40 #define char2int(c)             ((int)(uint8_t)(c))
  41 #define next(ls) \
  42   (ls->current = (ls->n--) > 0 ? char2int(*ls->p++) : fillbuf(ls))
  43 #define save_and_next(ls)       (save(ls, ls->current), next(ls))
  44 #define currIsNewline(ls)       (ls->current == '\n' || ls->current == '\r')
  45 #define END_OF_STREAM           (-1)
  46 
  47 static int fillbuf(LexState *ls)
  48 {
  49   size_t sz;
  50   const char *buf = ls->rfunc(ls->L, ls->rdata, &sz);
  51   if (buf == NULL || sz == 0) return END_OF_STREAM;
  52   ls->n = (MSize)sz - 1;
  53   ls->p = buf;
  54   return char2int(*(ls->p++));
  55 }
  56 
  57 static LJ_NOINLINE void save_grow(LexState *ls, int c)
  58 {
  59   MSize newsize;
  60   if (ls->sb.sz >= LJ_MAX_STR/2)
  61     lj_lex_error(ls, 0, LJ_ERR_XELEM);
  62   newsize = ls->sb.sz * 2;
  63   lj_str_resizebuf(ls->L, &ls->sb, newsize);
  64   ls->sb.buf[ls->sb.n++] = (char)c;
  65 }
  66 
  67 static LJ_AINLINE void save(LexState *ls, int c)
  68 {
  69   if (LJ_UNLIKELY(ls->sb.n + 1 > ls->sb.sz))
  70     save_grow(ls, c);
  71   else
  72     ls->sb.buf[ls->sb.n++] = (char)c;
  73 }
  74 
  75 static void inclinenumber(LexState *ls)
  76 {
  77   int old = ls->current;
  78   lua_assert(currIsNewline(ls));
  79   next(ls);  /* skip `\n' or `\r' */
  80   if (currIsNewline(ls) && ls->current != old)
  81     next(ls);  /* skip `\n\r' or `\r\n' */
  82   if (++ls->linenumber >= LJ_MAX_LINE)
  83     lj_lex_error(ls, ls->token, LJ_ERR_XLINES);
  84 }
  85 
  86 /* -- Scanner for terminals ----------------------------------------------- */
  87 
  88 /* Parse a number literal. */
  89 static void lex_number(LexState *ls, TValue *tv)
  90 {
  91   StrScanFmt fmt;
  92   int c, xp = 'e';
  93   lua_assert(lj_char_isdigit(ls->current));
  94   if ((c = ls->current) == '0') {
  95     save_and_next(ls);
  96     if ((ls->current | 0x20) == 'x') xp = 'p';
  97   }
  98   while (lj_char_isident(ls->current) || ls->current == '.' ||
  99          ((ls->current == '-' || ls->current == '+') && (c | 0x20) == xp)) {
 100     c = ls->current;
 101     save_and_next(ls);
 102   }
 103   save(ls, '\0');
 104   fmt = lj_strscan_scan((const uint8_t *)ls->sb.buf, tv,
 105           (LJ_DUALNUM ? STRSCAN_OPT_TOINT : STRSCAN_OPT_TONUM) |
 106           (LJ_HASFFI ? (STRSCAN_OPT_LL|STRSCAN_OPT_IMAG) : 0));
 107   if (LJ_DUALNUM && fmt == STRSCAN_INT) {
 108     setitype(tv, LJ_TISNUM);
 109   } else if (fmt == STRSCAN_NUM) {
 110     /* Already in correct format. */
 111 #if LJ_HASFFI
 112   } else if (fmt != STRSCAN_ERROR) {
 113     lua_State *L = ls->L;
 114     GCcdata *cd;
 115     lua_assert(fmt == STRSCAN_I64 || fmt == STRSCAN_U64 || fmt == STRSCAN_IMAG);
 116     if (!ctype_ctsG(G(L))) {
 117       ptrdiff_t oldtop = savestack(L, L->top);
 118       luaopen_ffi(L);  /* Load FFI library on-demand. */
 119       L->top = restorestack(L, oldtop);
 120     }
 121     if (fmt == STRSCAN_IMAG) {
 122       cd = lj_cdata_new_(L, CTID_COMPLEX_DOUBLE, 2*sizeof(double));
 123       ((double *)cdataptr(cd))[0] = 0;
 124       ((double *)cdataptr(cd))[1] = numV(tv);
 125     } else {
 126       cd = lj_cdata_new_(L, fmt==STRSCAN_I64 ? CTID_INT64 : CTID_UINT64, 8);
 127       *(uint64_t *)cdataptr(cd) = tv->u64;
 128     }
 129     lj_parse_keepcdata(ls, tv, cd);
 130 #endif
 131   } else {
 132     lua_assert(fmt == STRSCAN_ERROR);
 133     lj_lex_error(ls, TK_number, LJ_ERR_XNUMBER);
 134   }
 135 }
 136 
 137 static int skip_sep(LexState *ls)
 138 {
 139   int count = 0;
 140   int s = ls->current;
 141   lua_assert(s == '[' || s == ']');
 142   save_and_next(ls);
 143   while (ls->current == '=') {
 144     save_and_next(ls);
 145     count++;
 146   }
 147   return (ls->current == s) ? count : (-count) - 1;
 148 }
 149 
 150 static void read_long_string(LexState *ls, TValue *tv, int sep)
 151 {
 152   save_and_next(ls);  /* skip 2nd `[' */
 153   if (currIsNewline(ls))  /* string starts with a newline? */
 154     inclinenumber(ls);  /* skip it */
 155   for (;;) {
 156     switch (ls->current) {
 157     case END_OF_STREAM:
 158       lj_lex_error(ls, TK_eof, tv ? LJ_ERR_XLSTR : LJ_ERR_XLCOM);
 159       break;
 160     case ']':
 161       if (skip_sep(ls) == sep) {
 162         save_and_next(ls);  /* skip 2nd `]' */
 163         goto endloop;
 164       }
 165       break;
 166     case '\n':
 167     case '\r':
 168       save(ls, '\n');
 169       inclinenumber(ls);
 170       if (!tv) lj_str_resetbuf(&ls->sb);  /* avoid wasting space */
 171       break;
 172     default:
 173       if (tv) save_and_next(ls);
 174       else next(ls);
 175       break;
 176     }
 177   } endloop:
 178   if (tv) {
 179     GCstr *str = lj_parse_keepstr(ls, ls->sb.buf + (2 + (MSize)sep),
 180                                       ls->sb.n - 2*(2 + (MSize)sep));
 181     setstrV(ls->L, tv, str);
 182   }
 183 }
 184 
 185 static void read_string(LexState *ls, int delim, TValue *tv)
 186 {
 187   save_and_next(ls);
 188   while (ls->current != delim) {
 189     switch (ls->current) {
 190     case END_OF_STREAM:
 191       lj_lex_error(ls, TK_eof, LJ_ERR_XSTR);
 192       continue;
 193     case '\n':
 194     case '\r':
 195       lj_lex_error(ls, TK_string, LJ_ERR_XSTR);
 196       continue;
 197     case '\\': {
 198       int c = next(ls);  /* Skip the '\\'. */
 199       switch (c) {
 200       case 'a': c = '\a'; break;
 201       case 'b': c = '\b'; break;
 202       case 'f': c = '\f'; break;
 203       case 'n': c = '\n'; break;
 204       case 'r': c = '\r'; break;
 205       case 't': c = '\t'; break;
 206       case 'v': c = '\v'; break;
 207       case 'x':  /* Hexadecimal escape '\xXX'. */
 208         c = (next(ls) & 15u) << 4;
 209         if (!lj_char_isdigit(ls->current)) {
 210           if (!lj_char_isxdigit(ls->current)) goto err_xesc;
 211           c += 9 << 4;
 212         }
 213         c += (next(ls) & 15u);
 214         if (!lj_char_isdigit(ls->current)) {
 215           if (!lj_char_isxdigit(ls->current)) goto err_xesc;
 216           c += 9;
 217         }
 218         break;
 219       case 'z':  /* Skip whitespace. */
 220         next(ls);
 221         while (lj_char_isspace(ls->current))
 222           if (currIsNewline(ls)) inclinenumber(ls); else next(ls);
 223         continue;
 224       case '\n': case '\r': save(ls, '\n'); inclinenumber(ls); continue;
 225       case '\\': case '\"': case '\'': break;
 226       case END_OF_STREAM: continue;
 227       default:
 228         if (!lj_char_isdigit(c))
 229           goto err_xesc;
 230         c -= '0';  /* Decimal escape '\ddd'. */
 231         if (lj_char_isdigit(next(ls))) {
 232           c = c*10 + (ls->current - '0');
 233           if (lj_char_isdigit(next(ls))) {
 234             c = c*10 + (ls->current - '0');
 235             if (c > 255) {
 236             err_xesc:
 237               lj_lex_error(ls, TK_string, LJ_ERR_XESC);
 238             }
 239             next(ls);
 240           }
 241         }
 242         save(ls, c);
 243         continue;
 244       }
 245       save(ls, c);
 246       next(ls);
 247       continue;
 248       }
 249     default:
 250       save_and_next(ls);
 251       break;
 252     }
 253   }
 254   save_and_next(ls);  /* skip delimiter */
 255   setstrV(ls->L, tv, lj_parse_keepstr(ls, ls->sb.buf + 1, ls->sb.n - 2));
 256 }
 257 
 258 /* -- Main lexical scanner ------------------------------------------------ */
 259 
 260 static int llex(LexState *ls, TValue *tv)
 261 {
 262   lj_str_resetbuf(&ls->sb);
 263   for (;;) {
 264     if (lj_char_isident(ls->current)) {
 265       GCstr *s;
 266       if (lj_char_isdigit(ls->current)) {  /* Numeric literal. */
 267         lex_number(ls, tv);
 268         return TK_number;
 269       }
 270       /* Identifier or reserved word. */
 271       do {
 272         save_and_next(ls);
 273       } while (lj_char_isident(ls->current));
 274       s = lj_parse_keepstr(ls, ls->sb.buf, ls->sb.n);
 275       setstrV(ls->L, tv, s);
 276       if (s->reserved > 0)  /* Reserved word? */
 277         return TK_OFS + s->reserved;
 278       return TK_name;
 279     }
 280     switch (ls->current) {
 281     case '\n':
 282     case '\r':
 283       inclinenumber(ls);
 284       continue;
 285     case ' ':
 286     case '\t':
 287     case '\v':
 288     case '\f':
 289       next(ls);
 290       continue;
 291     case '-':
 292       next(ls);
 293       if (ls->current != '-') return '-';
 294       /* else is a comment */
 295       next(ls);
 296       if (ls->current == '[') {
 297         int sep = skip_sep(ls);
 298         lj_str_resetbuf(&ls->sb);  /* `skip_sep' may dirty the buffer */
 299         if (sep >= 0) {
 300           read_long_string(ls, NULL, sep);  /* long comment */
 301           lj_str_resetbuf(&ls->sb);
 302           continue;
 303         }
 304       }
 305       /* else short comment */
 306       while (!currIsNewline(ls) && ls->current != END_OF_STREAM)
 307         next(ls);
 308       continue;
 309     case '[': {
 310       int sep = skip_sep(ls);
 311       if (sep >= 0) {
 312         read_long_string(ls, tv, sep);
 313         return TK_string;
 314       } else if (sep == -1) {
 315         return '[';
 316       } else {
 317         lj_lex_error(ls, TK_string, LJ_ERR_XLDELIM);
 318         continue;
 319       }
 320       }
 321     case '=':
 322       next(ls);
 323       if (ls->current != '=') return '='; else { next(ls); return TK_eq; }
 324     case '<':
 325       next(ls);
 326       if (ls->current != '=') return '<'; else { next(ls); return TK_le; }
 327     case '>':
 328       next(ls);
 329       if (ls->current != '=') return '>'; else { next(ls); return TK_ge; }
 330     case '~':
 331       next(ls);
 332       if (ls->current != '=') return '~'; else { next(ls); return TK_ne; }
 333     case ':':
 334       next(ls);
 335       if (ls->current != ':') return ':'; else { next(ls); return TK_label; }
 336     case '"':
 337     case '\'':
 338       read_string(ls, ls->current, tv);
 339       return TK_string;
 340     case '.':
 341       save_and_next(ls);
 342       if (ls->current == '.') {
 343         next(ls);
 344         if (ls->current == '.') {
 345           next(ls);
 346           return TK_dots;   /* ... */
 347         }
 348         return TK_concat;   /* .. */
 349       } else if (!lj_char_isdigit(ls->current)) {
 350         return '.';
 351       } else {
 352         lex_number(ls, tv);
 353         return TK_number;
 354       }
 355     case END_OF_STREAM:
 356       return TK_eof;
 357     default: {
 358       int c = ls->current;
 359       next(ls);
 360       return c;  /* Single-char tokens (+ - / ...). */
 361     }
 362     }
 363   }
 364 }
 365 
 366 /* -- Lexer API ----------------------------------------------------------- */
 367 
 368 /* Setup lexer state. */
 369 int lj_lex_setup(lua_State *L, LexState *ls)
 370 {
 371   int header = 0;
 372   ls->L = L;
 373   ls->fs = NULL;
 374   ls->n = 0;
 375   ls->p = NULL;
 376   ls->vstack = NULL;
 377   ls->sizevstack = 0;
 378   ls->vtop = 0;
 379   ls->bcstack = NULL;
 380   ls->sizebcstack = 0;
 381   ls->token = 0;
 382   ls->lookahead = TK_eof;  /* No look-ahead token. */
 383   ls->linenumber = 1;
 384   ls->lastline = 1;
 385   lj_str_resizebuf(ls->L, &ls->sb, LJ_MIN_SBUF);
 386   next(ls);  /* Read-ahead first char. */
 387   if (ls->current == 0xef && ls->n >= 2 && char2int(ls->p[0]) == 0xbb &&
 388       char2int(ls->p[1]) == 0xbf) {  /* Skip UTF-8 BOM (if buffered). */
 389     ls->n -= 2;
 390     ls->p += 2;
 391     next(ls);
 392     header = 1;
 393   }
 394   if (ls->current == '#') {  /* Skip POSIX #! header line. */
 395     do {
 396       next(ls);
 397       if (ls->current == END_OF_STREAM) return 0;
 398     } while (!currIsNewline(ls));
 399     inclinenumber(ls);
 400     header = 1;
 401   }
 402   if (ls->current == LUA_SIGNATURE[0]) {  /* Bytecode dump. */
 403     if (header) {
 404       /*
 405       ** Loading bytecode with an extra header is disabled for security
 406       ** reasons. This may circumvent the usual check for bytecode vs.
 407       ** Lua code by looking at the first char. Since this is a potential
 408       ** security violation no attempt is made to echo the chunkname either.
 409       */
 410       setstrV(L, L->top++, lj_err_str(L, LJ_ERR_BCBAD));
 411       lj_err_throw(L, LUA_ERRSYNTAX);
 412     }
 413     return 1;
 414   }
 415   return 0;
 416 }
 417 
 418 /* Cleanup lexer state. */
 419 void lj_lex_cleanup(lua_State *L, LexState *ls)
 420 {
 421   global_State *g = G(L);
 422   lj_mem_freevec(g, ls->bcstack, ls->sizebcstack, BCInsLine);
 423   lj_mem_freevec(g, ls->vstack, ls->sizevstack, VarInfo);
 424   lj_str_freebuf(g, &ls->sb);
 425 }
 426 
 427 void lj_lex_next(LexState *ls)
 428 {
 429   ls->lastline = ls->linenumber;
 430   if (LJ_LIKELY(ls->lookahead == TK_eof)) {  /* No lookahead token? */
 431     ls->token = llex(ls, &ls->tokenval);  /* Get next token. */
 432   } else {  /* Otherwise return lookahead token. */
 433     ls->token = ls->lookahead;
 434     ls->lookahead = TK_eof;
 435     ls->tokenval = ls->lookaheadval;
 436   }
 437 }
 438 
 439 LexToken lj_lex_lookahead(LexState *ls)
 440 {
 441   lua_assert(ls->lookahead == TK_eof);
 442   ls->lookahead = llex(ls, &ls->lookaheadval);
 443   return ls->lookahead;
 444 }
 445 
 446 const char *lj_lex_token2str(LexState *ls, LexToken token)
 447 {
 448   if (token > TK_OFS)
 449     return tokennames[token-TK_OFS-1];
 450   else if (!lj_char_iscntrl(token))
 451     return lj_str_pushf(ls->L, "%c", token);
 452   else
 453     return lj_str_pushf(ls->L, "char(%d)", token);
 454 }
 455 
 456 void lj_lex_error(LexState *ls, LexToken token, ErrMsg em, ...)
 457 {
 458   const char *tok;
 459   va_list argp;
 460   if (token == 0) {
 461     tok = NULL;
 462   } else if (token == TK_name || token == TK_string || token == TK_number) {
 463     save(ls, '\0');
 464     tok = ls->sb.buf;
 465   } else {
 466     tok = lj_lex_token2str(ls, token);
 467   }
 468   va_start(argp, em);
 469   lj_err_lex(ls->L, ls->chunkname, tok, ls->linenumber, em, argp);
 470   va_end(argp);
 471 }
 472 
 473 void lj_lex_init(lua_State *L)
 474 {
 475   uint32_t i;
 476   for (i = 0; i < TK_RESERVED; i++) {
 477     GCstr *s = lj_str_newz(L, tokennames[i]);
 478     fixstring(s);  /* Reserved words are never collected. */
 479     s->reserved = (uint8_t)(i+1);
 480   }
 481 }
 482 

/* [<][>][^][v][top][bottom][index][help] */