Sweden-Number/dlls/jscript/regexp.c

3987 lines
128 KiB
C

/*
* Copyright 2008 Jacek Caban for CodeWeavers
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
*/
/*
* Code in this file is based on files:
* js/src/jsregexp.h
* js/src/jsregexp.c
* from Mozilla project, released under LGPL 2.1 or later.
*
* The Original Code is Mozilla Communicator client code, released
* March 31, 1998.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*/
#include <assert.h>
#include <math.h>
#include "jscript.h"
#include "wine/debug.h"
WINE_DEFAULT_DEBUG_CHANNEL(jscript);
#define JSREG_FOLD 0x01 /* fold uppercase to lowercase */
#define JSREG_GLOB 0x02 /* global exec, creates array of matches */
#define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */
#define JSREG_STICKY 0x08 /* only match starting at lastIndex */
typedef BYTE JSPackedBool;
typedef BYTE jsbytecode;
/*
* This struct holds a bitmap representation of a class from a regexp.
* There's a list of these referenced by the classList field in the JSRegExp
* struct below. The initial state has startIndex set to the offset in the
* original regexp source of the beginning of the class contents. The first
* use of the class converts the source representation into a bitmap.
*
*/
typedef struct RECharSet {
JSPackedBool converted;
JSPackedBool sense;
WORD length;
union {
BYTE *bits;
struct {
size_t startIndex;
size_t length;
} src;
} u;
} RECharSet;
typedef struct {
WORD flags; /* flags, see jsapi.h's JSREG_* defines */
size_t parenCount; /* number of parenthesized submatches */
size_t classCount; /* count [...] bitmaps */
RECharSet *classList; /* list of [...] bitmaps */
BSTR source; /* locked source string, sans // */
jsbytecode program[1]; /* regular expression bytecode */
} JSRegExp;
typedef struct {
DispatchEx dispex;
JSRegExp *jsregexp;
BSTR str;
INT last_index;
VARIANT last_index_var;
} RegExpInstance;
static const WCHAR sourceW[] = {'s','o','u','r','c','e',0};
static const WCHAR globalW[] = {'g','l','o','b','a','l',0};
static const WCHAR ignoreCaseW[] = {'i','g','n','o','r','e','C','a','s','e',0};
static const WCHAR multilineW[] = {'m','u','l','t','i','l','i','n','e',0};
static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
static const WCHAR toStringW[] = {'t','o','S','t','r','i','n','g',0};
static const WCHAR execW[] = {'e','x','e','c',0};
static const WCHAR testW[] = {'t','e','s','t',0};
static const WCHAR emptyW[] = {0};
/* FIXME: Better error handling */
#define ReportRegExpError(a,b,c)
#define ReportRegExpErrorHelper(a,b,c,d)
#define JS_ReportErrorNumber(a,b,c,d)
#define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
#define js_ReportOutOfScriptQuota(a)
#define JS_ReportOutOfMemory(a)
#define JS_COUNT_OPERATION(a,b)
#define JSMSG_MIN_TOO_BIG 47
#define JSMSG_MAX_TOO_BIG 48
#define JSMSG_OUT_OF_ORDER 49
#define JSMSG_OUT_OF_MEMORY 137
#define LINE_SEPARATOR 0x2028
#define PARA_SEPARATOR 0x2029
#define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
((c >= 'a') && (c <= 'z')) )
#define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
(c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
#define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
#define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
#define JS7_UNDEC(c) ((c) - '0')
typedef enum REOp {
REOP_EMPTY,
REOP_BOL,
REOP_EOL,
REOP_WBDRY,
REOP_WNONBDRY,
REOP_DOT,
REOP_DIGIT,
REOP_NONDIGIT,
REOP_ALNUM,
REOP_NONALNUM,
REOP_SPACE,
REOP_NONSPACE,
REOP_BACKREF,
REOP_FLAT,
REOP_FLAT1,
REOP_FLATi,
REOP_FLAT1i,
REOP_UCFLAT1,
REOP_UCFLAT1i,
REOP_UCFLAT,
REOP_UCFLATi,
REOP_CLASS,
REOP_NCLASS,
REOP_ALT,
REOP_QUANT,
REOP_STAR,
REOP_PLUS,
REOP_OPT,
REOP_LPAREN,
REOP_RPAREN,
REOP_JUMP,
REOP_DOTSTAR,
REOP_LPARENNON,
REOP_ASSERT,
REOP_ASSERT_NOT,
REOP_ASSERTTEST,
REOP_ASSERTNOTTEST,
REOP_MINIMALSTAR,
REOP_MINIMALPLUS,
REOP_MINIMALOPT,
REOP_MINIMALQUANT,
REOP_ENDCHILD,
REOP_REPEAT,
REOP_MINIMALREPEAT,
REOP_ALTPREREQ,
REOP_ALTPREREQ2,
REOP_ENDALT,
REOP_CONCAT,
REOP_END,
REOP_LIMIT /* META: no operator >= to this */
} REOp;
#define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
static const char *reop_names[] = {
"empty",
"bol",
"eol",
"wbdry",
"wnonbdry",
"dot",
"digit",
"nondigit",
"alnum",
"nonalnum",
"space",
"nonspace",
"backref",
"flat",
"flat1",
"flati",
"flat1i",
"ucflat1",
"ucflat1i",
"ucflat",
"ucflati",
"class",
"nclass",
"alt",
"quant",
"star",
"plus",
"opt",
"lparen",
"rparen",
"jump",
"dotstar",
"lparennon",
"assert",
"assert_not",
"asserttest",
"assertnottest",
"minimalstar",
"minimalplus",
"minimalopt",
"minimalquant",
"endchild",
"repeat",
"minimalrepeat",
"altprereq",
"alrprereq2",
"endalt",
"concat",
"end",
NULL
};
typedef struct RECapture {
ptrdiff_t index; /* start of contents, -1 for empty */
size_t length; /* length of capture */
} RECapture;
typedef struct REMatchState {
const WCHAR *cp;
RECapture parens[1]; /* first of 're->parenCount' captures,
allocated at end of this struct */
} REMatchState;
typedef struct REProgState {
jsbytecode *continue_pc; /* current continuation data */
jsbytecode continue_op;
ptrdiff_t index; /* progress in text */
size_t parenSoFar; /* highest indexed paren started */
union {
struct {
UINT min; /* current quantifier limits */
UINT max;
} quantifier;
struct {
size_t top; /* backtrack stack state */
size_t sz;
} assertion;
} u;
} REProgState;
typedef struct REBackTrackData {
size_t sz; /* size of previous stack entry */
jsbytecode *backtrack_pc; /* where to backtrack to */
jsbytecode backtrack_op;
const WCHAR *cp; /* index in text of match at backtrack */
size_t parenIndex; /* start index of saved paren contents */
size_t parenCount; /* # of saved paren contents */
size_t saveStateStackTop; /* number of parent states */
/* saved parent states follow */
/* saved paren contents follow */
} REBackTrackData;
#define INITIAL_STATESTACK 100
#define INITIAL_BACKTRACK 8000
typedef struct REGlobalData {
script_ctx_t *cx;
JSRegExp *regexp; /* the RE in execution */
BOOL ok; /* runtime error (out_of_memory only?) */
size_t start; /* offset to start at */
ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
const WCHAR *cpbegin; /* text base address */
const WCHAR *cpend; /* text limit address */
REProgState *stateStack; /* stack of state of current parents */
size_t stateStackTop;
size_t stateStackLimit;
REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
REBackTrackData *backTrackSP;
size_t backTrackStackSize;
size_t cursz; /* size of current stack entry */
size_t backTrackCount; /* how many times we've backtracked */
size_t backTrackLimit; /* upper limit on backtrack states */
jsheap_t *pool; /* It's faster to use one malloc'd pool
than to malloc/free the three items
that are allocated from this pool */
} REGlobalData;
typedef struct RENode RENode;
struct RENode {
REOp op; /* r.e. op bytecode */
RENode *next; /* next in concatenation order */
void *kid; /* first operand */
union {
void *kid2; /* second operand */
INT num; /* could be a number */
size_t parenIndex; /* or a parenthesis index */
struct { /* or a quantifier range */
UINT min;
UINT max;
JSPackedBool greedy;
} range;
struct { /* or a character class */
size_t startIndex;
size_t kidlen; /* length of string at kid, in jschars */
size_t index; /* index into class list */
WORD bmsize; /* bitmap size, based on max char code */
JSPackedBool sense;
} ucclass;
struct { /* or a literal sequence */
WCHAR chr; /* of one character */
size_t length; /* or many (via the kid) */
} flat;
struct {
RENode *kid2; /* second operand from ALT */
WCHAR ch1; /* match char for ALTPREREQ */
WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
} altprereq;
} u;
};
#define CLASS_CACHE_SIZE 4
typedef struct CompilerState {
script_ctx_t *context;
const WCHAR *cpbegin;
const WCHAR *cpend;
const WCHAR *cp;
size_t parenCount;
size_t classCount; /* number of [] encountered */
size_t treeDepth; /* maximum depth of parse tree */
size_t progLength; /* estimated bytecode length */
RENode *result;
size_t classBitmapsMem; /* memory to hold all class bitmaps */
struct {
const WCHAR *start; /* small cache of class strings */
size_t length; /* since they're often the same */
size_t index;
} classCache[CLASS_CACHE_SIZE];
WORD flags;
} CompilerState;
typedef struct EmitStateStackEntry {
jsbytecode *altHead; /* start of REOP_ALT* opcode */
jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
RENode *continueNode; /* original REOP_ALT* node being stacked */
jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
avoid 16-bit unsigned offset overflow */
} EmitStateStackEntry;
/*
* Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
* the getters and setters take the pc of the offset, not of the opcode before
* the offset.
*/
#define ARG_LEN 2
#define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
#define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
(pc)[1] = (jsbytecode) (arg))
#define OFFSET_LEN ARG_LEN
#define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
#define GET_OFFSET(pc) GET_ARG(pc)
static BOOL ParseRegExp(CompilerState*);
/*
* Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
* For sanity, we limit it to 2^24 bytes.
*/
#define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
/*
* The maximum memory that can be allocated for class bitmaps.
* For sanity, we limit it to 2^24 bytes.
*/
#define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
/*
* Functions to get size and write/read bytecode that represent small indexes
* compactly.
* Each byte in the code represent 7-bit chunk of the index. 8th bit when set
* indicates that the following byte brings more bits to the index. Otherwise
* this is the last byte in the index bytecode representing highest index bits.
*/
static size_t
GetCompactIndexWidth(size_t index)
{
size_t width;
for (width = 1; (index >>= 7) != 0; ++width) { }
return width;
}
static inline jsbytecode *
WriteCompactIndex(jsbytecode *pc, size_t index)
{
size_t next;
while ((next = index >> 7) != 0) {
*pc++ = (jsbytecode)(index | 0x80);
index = next;
}
*pc++ = (jsbytecode)index;
return pc;
}
static inline jsbytecode *
ReadCompactIndex(jsbytecode *pc, size_t *result)
{
size_t nextByte;
nextByte = *pc++;
if ((nextByte & 0x80) == 0) {
/*
* Short-circuit the most common case when compact index <= 127.
*/
*result = nextByte;
} else {
size_t shift = 7;
*result = 0x7F & nextByte;
do {
nextByte = *pc++;
*result |= (nextByte & 0x7F) << shift;
shift += 7;
} while ((nextByte & 0x80) != 0);
}
return pc;
}
/* Construct and initialize an RENode, returning NULL for out-of-memory */
static RENode *
NewRENode(CompilerState *state, REOp op)
{
RENode *ren;
ren = jsheap_alloc(&state->context->tmp_heap, sizeof(*ren));
if (!ren) {
/* js_ReportOutOfScriptQuota(cx); */
return NULL;
}
ren->op = op;
ren->next = NULL;
ren->kid = NULL;
return ren;
}
/*
* Validates and converts hex ascii value.
*/
static BOOL
isASCIIHexDigit(WCHAR c, UINT *digit)
{
UINT cv = c;
if (cv < '0')
return FALSE;
if (cv <= '9') {
*digit = cv - '0';
return TRUE;
}
cv |= 0x20;
if (cv >= 'a' && cv <= 'f') {
*digit = cv - 'a' + 10;
return TRUE;
}
return FALSE;
}
typedef struct {
REOp op;
const WCHAR *errPos;
size_t parenIndex;
} REOpData;
#define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
#define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
static BOOL
SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
{
ptrdiff_t offset = target - jump;
/* Check that target really points forward. */
assert(offset >= 2);
if ((size_t)offset > OFFSET_MAX)
return FALSE;
jump[0] = JUMP_OFFSET_HI(offset);
jump[1] = JUMP_OFFSET_LO(offset);
return TRUE;
}
/*
* Generate bytecode for the tree rooted at t using an explicit stack instead
* of recursion.
*/
static jsbytecode *
EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
jsbytecode *pc, RENode *t)
{
EmitStateStackEntry *emitStateSP, *emitStateStack;
RECharSet *charSet;
REOp op;
if (treeDepth == 0) {
emitStateStack = NULL;
} else {
emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
if (!emitStateStack)
return NULL;
}
emitStateSP = emitStateStack;
op = t->op;
assert(op < REOP_LIMIT);
for (;;) {
*pc++ = op;
switch (op) {
case REOP_EMPTY:
--pc;
break;
case REOP_ALTPREREQ2:
case REOP_ALTPREREQ:
assert(emitStateSP);
emitStateSP->altHead = pc - 1;
emitStateSP->endTermFixup = pc;
pc += OFFSET_LEN;
SET_ARG(pc, t->u.altprereq.ch1);
pc += ARG_LEN;
SET_ARG(pc, t->u.altprereq.ch2);
pc += ARG_LEN;
emitStateSP->nextAltFixup = pc; /* offset to next alternate */
pc += OFFSET_LEN;
emitStateSP->continueNode = t;
emitStateSP->continueOp = REOP_JUMP;
emitStateSP->jumpToJumpFlag = FALSE;
++emitStateSP;
assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
t = t->kid;
op = t->op;
assert(op < REOP_LIMIT);
continue;
case REOP_JUMP:
emitStateSP->nextTermFixup = pc; /* offset to following term */
pc += OFFSET_LEN;
if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
goto jump_too_big;
emitStateSP->continueOp = REOP_ENDALT;
++emitStateSP;
assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
t = t->u.kid2;
op = t->op;
assert(op < REOP_LIMIT);
continue;
case REOP_ENDALT:
/*
* If we already patched emitStateSP->nextTermFixup to jump to
* a nearer jump, to avoid 16-bit immediate offset overflow, we
* are done here.
*/
if (emitStateSP->jumpToJumpFlag)
break;
/*
* Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
* REOP_ENDALT is executed only on successful match of the last
* alternate in a group.
*/
if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
goto jump_too_big;
if (t->op != REOP_ALT) {
if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
goto jump_too_big;
}
/*
* If the program is bigger than the REOP_JUMP offset range, then
* we must check for alternates before this one that are part of
* the same group, and fix up their jump offsets to target jumps
* close enough to fit in a 16-bit unsigned offset immediate.
*/
if ((size_t)(pc - re->program) > OFFSET_MAX &&
emitStateSP > emitStateStack) {
EmitStateStackEntry *esp, *esp2;
jsbytecode *alt, *jump;
ptrdiff_t span, header;
esp2 = emitStateSP;
alt = esp2->altHead;
for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
if (esp->continueOp == REOP_ENDALT &&
!esp->jumpToJumpFlag &&
esp->nextTermFixup + OFFSET_LEN == alt &&
(size_t)(pc - ((esp->continueNode->op != REOP_ALT)
? esp->endTermFixup
: esp->nextTermFixup)) > OFFSET_MAX) {
alt = esp->altHead;
jump = esp->nextTermFixup;
/*
* The span must be 1 less than the distance from
* jump offset to jump offset, so we actually jump
* to a REOP_JUMP bytecode, not to its offset!
*/
for (;;) {
assert(jump < esp2->nextTermFixup);
span = esp2->nextTermFixup - jump - 1;
if ((size_t)span <= OFFSET_MAX)
break;
do {
if (--esp2 == esp)
goto jump_too_big;
} while (esp2->continueOp != REOP_ENDALT);
}
jump[0] = JUMP_OFFSET_HI(span);
jump[1] = JUMP_OFFSET_LO(span);
if (esp->continueNode->op != REOP_ALT) {
/*
* We must patch the offset at esp->endTermFixup
* as well, for the REOP_ALTPREREQ{,2} opcodes.
* If we're unlucky and endTermFixup is more than
* OFFSET_MAX bytes from its target, we cheat by
* jumping 6 bytes to the jump whose offset is at
* esp->nextTermFixup, which has the same target.
*/
jump = esp->endTermFixup;
header = esp->nextTermFixup - jump;
span += header;
if ((size_t)span > OFFSET_MAX)
span = header;
jump[0] = JUMP_OFFSET_HI(span);
jump[1] = JUMP_OFFSET_LO(span);
}
esp->jumpToJumpFlag = TRUE;
}
}
}
break;
case REOP_ALT:
assert(emitStateSP);
emitStateSP->altHead = pc - 1;
emitStateSP->nextAltFixup = pc; /* offset to next alternate */
pc += OFFSET_LEN;
emitStateSP->continueNode = t;
emitStateSP->continueOp = REOP_JUMP;
emitStateSP->jumpToJumpFlag = FALSE;
++emitStateSP;
assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
t = t->kid;
op = t->op;
assert(op < REOP_LIMIT);
continue;
case REOP_FLAT:
/*
* Coalesce FLATs if possible and if it would not increase bytecode
* beyond preallocated limit. The latter happens only when bytecode
* size for coalesced string with offset p and length 2 exceeds 6
* bytes preallocated for 2 single char nodes, i.e. when
* 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
* GetCompactIndexWidth(p) > 4.
* Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
* nodes strictly decreases bytecode size, the check has to be
* done only for the first coalescing.
*/
if (t->kid &&
GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
{
while (t->next &&
t->next->op == REOP_FLAT &&
(WCHAR*)t->kid + t->u.flat.length ==
t->next->kid) {
t->u.flat.length += t->next->u.flat.length;
t->next = t->next->next;
}
}
if (t->kid && t->u.flat.length > 1) {
pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
pc = WriteCompactIndex(pc, t->u.flat.length);
} else if (t->u.flat.chr < 256) {
pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
*pc++ = (jsbytecode) t->u.flat.chr;
} else {
pc[-1] = (state->flags & JSREG_FOLD)
? REOP_UCFLAT1i
: REOP_UCFLAT1;
SET_ARG(pc, t->u.flat.chr);
pc += ARG_LEN;
}
break;
case REOP_LPAREN:
assert(emitStateSP);
pc = WriteCompactIndex(pc, t->u.parenIndex);
emitStateSP->continueNode = t;
emitStateSP->continueOp = REOP_RPAREN;
++emitStateSP;
assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
t = t->kid;
op = t->op;
continue;
case REOP_RPAREN:
pc = WriteCompactIndex(pc, t->u.parenIndex);
break;
case REOP_BACKREF:
pc = WriteCompactIndex(pc, t->u.parenIndex);
break;
case REOP_ASSERT:
assert(emitStateSP);
emitStateSP->nextTermFixup = pc;
pc += OFFSET_LEN;
emitStateSP->continueNode = t;
emitStateSP->continueOp = REOP_ASSERTTEST;
++emitStateSP;
assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
t = t->kid;
op = t->op;
continue;
case REOP_ASSERTTEST:
case REOP_ASSERTNOTTEST:
if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
goto jump_too_big;
break;
case REOP_ASSERT_NOT:
assert(emitStateSP);
emitStateSP->nextTermFixup = pc;
pc += OFFSET_LEN;
emitStateSP->continueNode = t;
emitStateSP->continueOp = REOP_ASSERTNOTTEST;
++emitStateSP;
assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
t = t->kid;
op = t->op;
continue;
case REOP_QUANT:
assert(emitStateSP);
if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
} else if (t->u.range.min == 0 && t->u.range.max == 1) {
pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
} else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
} else {
if (!t->u.range.greedy)
pc[-1] = REOP_MINIMALQUANT;
pc = WriteCompactIndex(pc, t->u.range.min);
/*
* Write max + 1 to avoid using size_t(max) + 1 bytes
* for (UINT)-1 sentinel.
*/
pc = WriteCompactIndex(pc, t->u.range.max + 1);
}
emitStateSP->nextTermFixup = pc;
pc += OFFSET_LEN;
emitStateSP->continueNode = t;
emitStateSP->continueOp = REOP_ENDCHILD;
++emitStateSP;
assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
t = t->kid;
op = t->op;
continue;
case REOP_ENDCHILD:
if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
goto jump_too_big;
break;
case REOP_CLASS:
if (!t->u.ucclass.sense)
pc[-1] = REOP_NCLASS;
pc = WriteCompactIndex(pc, t->u.ucclass.index);
charSet = &re->classList[t->u.ucclass.index];
charSet->converted = FALSE;
charSet->length = t->u.ucclass.bmsize;
charSet->u.src.startIndex = t->u.ucclass.startIndex;
charSet->u.src.length = t->u.ucclass.kidlen;
charSet->sense = t->u.ucclass.sense;
break;
default:
break;
}
t = t->next;
if (t) {
op = t->op;
} else {
if (emitStateSP == emitStateStack)
break;
--emitStateSP;
t = emitStateSP->continueNode;
op = (REOp) emitStateSP->continueOp;
}
}
cleanup:
heap_free(emitStateStack);
return pc;
jump_too_big:
ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
pc = NULL;
goto cleanup;
}
/*
* Process the op against the two top operands, reducing them to a single
* operand in the penultimate slot. Update progLength and treeDepth.
*/
static BOOL
ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
INT operandSP)
{
RENode *result;
switch (opData->op) {
case REOP_ALT:
result = NewRENode(state, REOP_ALT);
if (!result)
return FALSE;
result->kid = operandStack[operandSP - 2];
result->u.kid2 = operandStack[operandSP - 1];
operandStack[operandSP - 2] = result;
if (state->treeDepth == TREE_DEPTH_MAX) {
ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
return FALSE;
}
++state->treeDepth;
/*
* Look at both alternates to see if there's a FLAT or a CLASS at
* the start of each. If so, use a prerequisite match.
*/
if (((RENode *) result->kid)->op == REOP_FLAT &&
((RENode *) result->u.kid2)->op == REOP_FLAT &&
(state->flags & JSREG_FOLD) == 0) {
result->op = REOP_ALTPREREQ;
result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
/* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
JUMP, <end> ... ENDALT */
state->progLength += 13;
}
else
if (((RENode *) result->kid)->op == REOP_CLASS &&
((RENode *) result->kid)->u.ucclass.index < 256 &&
((RENode *) result->u.kid2)->op == REOP_FLAT &&
(state->flags & JSREG_FOLD) == 0) {
result->op = REOP_ALTPREREQ2;
result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
/* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
JUMP, <end> ... ENDALT */
state->progLength += 13;
}
else
if (((RENode *) result->kid)->op == REOP_FLAT &&
((RENode *) result->u.kid2)->op == REOP_CLASS &&
((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
(state->flags & JSREG_FOLD) == 0) {
result->op = REOP_ALTPREREQ2;
result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
result->u.altprereq.ch2 =
((RENode *) result->u.kid2)->u.ucclass.index;
/* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
JUMP, <end> ... ENDALT */
state->progLength += 13;
}
else {
/* ALT, <next>, ..., JUMP, <end> ... ENDALT */
state->progLength += 7;
}
break;
case REOP_CONCAT:
result = operandStack[operandSP - 2];
while (result->next)
result = result->next;
result->next = operandStack[operandSP - 1];
break;
case REOP_ASSERT:
case REOP_ASSERT_NOT:
case REOP_LPARENNON:
case REOP_LPAREN:
/* These should have been processed by a close paren. */
ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
opData->errPos);
return FALSE;
default:;
}
return TRUE;
}
/*
* Hack two bits in CompilerState.flags, for use within FindParenCount to flag
* its being on the stack, and to propagate errors to its callers.
*/
#define JSREG_FIND_PAREN_COUNT 0x8000
#define JSREG_FIND_PAREN_ERROR 0x4000
/*
* Magic return value from FindParenCount and GetDecimalValue, to indicate
* overflow beyond GetDecimalValue's max parameter, or a computed maximum if
* its findMax parameter is non-null.
*/
#define OVERFLOW_VALUE ((UINT)-1)
static UINT
FindParenCount(CompilerState *state)
{
CompilerState temp;
int i;
if (state->flags & JSREG_FIND_PAREN_COUNT)
return OVERFLOW_VALUE;
/*
* Copy state into temp, flag it so we never report an invalid backref,
* and reset its members to parse the entire regexp. This is obviously
* suboptimal, but GetDecimalValue calls us only if a backref appears to
* refer to a forward parenthetical, which is rare.
*/
temp = *state;
temp.flags |= JSREG_FIND_PAREN_COUNT;
temp.cp = temp.cpbegin;
temp.parenCount = 0;
temp.classCount = 0;
temp.progLength = 0;
temp.treeDepth = 0;
temp.classBitmapsMem = 0;
for (i = 0; i < CLASS_CACHE_SIZE; i++)
temp.classCache[i].start = NULL;
if (!ParseRegExp(&temp)) {
state->flags |= JSREG_FIND_PAREN_ERROR;
return OVERFLOW_VALUE;
}
return temp.parenCount;
}
/*
* Extract and return a decimal value at state->cp. The initial character c
* has already been read. Return OVERFLOW_VALUE if the result exceeds max.
* Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
* state->flags to discover whether an error occurred under findMax.
*/
static UINT
GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
CompilerState *state)
{
UINT value = JS7_UNDEC(c);
BOOL overflow = (value > max && (!findMax || value > findMax(state)));
/* The following restriction allows simpler overflow checks. */
assert(max <= ((UINT)-1 - 9) / 10);
while (state->cp < state->cpend) {
c = *state->cp;
if (!JS7_ISDEC(c))
break;
value = 10 * value + JS7_UNDEC(c);
if (!overflow && value > max && (!findMax || value > findMax(state)))
overflow = TRUE;
++state->cp;
}
return overflow ? OVERFLOW_VALUE : value;
}
/*
* Calculate the total size of the bitmap required for a class expression.
*/
static BOOL
CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
const WCHAR *end)
{
UINT max = 0;
BOOL inRange = FALSE;
WCHAR c, rangeStart = 0;
UINT n, digit, nDigits, i;
target->u.ucclass.bmsize = 0;
target->u.ucclass.sense = TRUE;
if (src == end)
return TRUE;
if (*src == '^') {
++src;
target->u.ucclass.sense = FALSE;
}
while (src != end) {
BOOL canStartRange = TRUE;
UINT localMax = 0;
switch (*src) {
case '\\':
++src;
c = *src++;
switch (c) {
case 'b':
localMax = 0x8;
break;
case 'f':
localMax = 0xC;
break;
case 'n':
localMax = 0xA;
break;
case 'r':
localMax = 0xD;
break;
case 't':
localMax = 0x9;
break;
case 'v':
localMax = 0xB;
break;
case 'c':
if (src < end && RE_IS_LETTER(*src)) {
localMax = (UINT) (*src++) & 0x1F;
} else {
--src;
localMax = '\\';
}
break;
case 'x':
nDigits = 2;
goto lexHex;
case 'u':
nDigits = 4;
lexHex:
n = 0;
for (i = 0; (i < nDigits) && (src < end); i++) {
c = *src++;
if (!isASCIIHexDigit(c, &digit)) {
/*
* Back off to accepting the original
*'\' as a literal.
*/
src -= i + 1;
n = '\\';
break;
}
n = (n << 4) | digit;
}
localMax = n;
break;
case 'd':
canStartRange = FALSE;
if (inRange) {
JS_ReportErrorNumber(state->context,
js_GetErrorMessage, NULL,
JSMSG_BAD_CLASS_RANGE);
return FALSE;
}
localMax = '9';
break;
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
canStartRange = FALSE;
if (inRange) {
JS_ReportErrorNumber(state->context,
js_GetErrorMessage, NULL,
JSMSG_BAD_CLASS_RANGE);
return FALSE;
}
max = 65535;
/*
* If this is the start of a range, ensure that it's less than
* the end.
*/
localMax = 0;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
/*
* This is a non-ECMA extension - decimal escapes (in this
* case, octal!) are supposed to be an error inside class
* ranges, but supported here for backwards compatibility.
*
*/
n = JS7_UNDEC(c);
c = *src;
if ('0' <= c && c <= '7') {
src++;
n = 8 * n + JS7_UNDEC(c);
c = *src;
if ('0' <= c && c <= '7') {
src++;
i = 8 * n + JS7_UNDEC(c);
if (i <= 0377)
n = i;
else
src--;
}
}
localMax = n;
break;
default:
localMax = c;
break;
}
break;
default:
localMax = *src++;
break;
}
if (inRange) {
/* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
if (rangeStart > localMax) {
JS_ReportErrorNumber(state->context,
js_GetErrorMessage, NULL,
JSMSG_BAD_CLASS_RANGE);
return FALSE;
}
inRange = FALSE;
} else {
if (canStartRange && src < end - 1) {
if (*src == '-') {
++src;
inRange = TRUE;
rangeStart = (WCHAR)localMax;
continue;
}
}
if (state->flags & JSREG_FOLD)
rangeStart = localMax; /* one run of the uc/dc loop below */
}
if (state->flags & JSREG_FOLD) {
WCHAR maxch = localMax;
for (i = rangeStart; i <= localMax; i++) {
WCHAR uch, dch;
uch = toupperW(i);
dch = tolowerW(i);
if(maxch < uch)
maxch = uch;
if(maxch < dch)
maxch = dch;
}
localMax = maxch;
}
if (localMax > max)
max = localMax;
}
target->u.ucclass.bmsize = max;
return TRUE;
}
static INT
ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
{
UINT min, max;
WCHAR c;
const WCHAR *errp = state->cp++;
c = *state->cp;
if (JS7_ISDEC(c)) {
++state->cp;
min = GetDecimalValue(c, 0xFFFF, NULL, state);
c = *state->cp;
if (!ignoreValues && min == OVERFLOW_VALUE)
return JSMSG_MIN_TOO_BIG;
if (c == ',') {
c = *++state->cp;
if (JS7_ISDEC(c)) {
++state->cp;
max = GetDecimalValue(c, 0xFFFF, NULL, state);
c = *state->cp;
if (!ignoreValues && max == OVERFLOW_VALUE)
return JSMSG_MAX_TOO_BIG;
if (!ignoreValues && min > max)
return JSMSG_OUT_OF_ORDER;
} else {
max = (UINT)-1;
}
} else {
max = min;
}
if (c == '}') {
state->result = NewRENode(state, REOP_QUANT);
if (!state->result)
return JSMSG_OUT_OF_MEMORY;
state->result->u.range.min = min;
state->result->u.range.max = max;
/*
* QUANT, <min>, <max>, <next> ... <ENDCHILD>
* where <max> is written as compact(max+1) to make
* (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
*/
state->progLength += (1 + GetCompactIndexWidth(min)
+ GetCompactIndexWidth(max + 1)
+3);
return 0;
}
}
state->cp = errp;
return -1;
}
static BOOL
ParseQuantifier(CompilerState *state)
{
RENode *term;
term = state->result;
if (state->cp < state->cpend) {
switch (*state->cp) {
case '+':
state->result = NewRENode(state, REOP_QUANT);
if (!state->result)
return FALSE;
state->result->u.range.min = 1;
state->result->u.range.max = (UINT)-1;
/* <PLUS>, <next> ... <ENDCHILD> */
state->progLength += 4;
goto quantifier;
case '*':
state->result = NewRENode(state, REOP_QUANT);
if (!state->result)
return FALSE;
state->result->u.range.min = 0;
state->result->u.range.max = (UINT)-1;
/* <STAR>, <next> ... <ENDCHILD> */
state->progLength += 4;
goto quantifier;
case '?':
state->result = NewRENode(state, REOP_QUANT);
if (!state->result)
return FALSE;
state->result->u.range.min = 0;
state->result->u.range.max = 1;
/* <OPT>, <next> ... <ENDCHILD> */
state->progLength += 4;
goto quantifier;
case '{': /* balance '}' */
{
INT err;
err = ParseMinMaxQuantifier(state, FALSE);
if (err == 0)
goto quantifier;
if (err == -1)
return TRUE;
ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
return FALSE;
}
default:;
}
}
return TRUE;
quantifier:
if (state->treeDepth == TREE_DEPTH_MAX) {
ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
return FALSE;
}
++state->treeDepth;
++state->cp;
state->result->kid = term;
if (state->cp < state->cpend && *state->cp == '?') {
++state->cp;
state->result->u.range.greedy = FALSE;
} else {
state->result->u.range.greedy = TRUE;
}
return TRUE;
}
/*
* item: assertion An item is either an assertion or
* quantatom a quantified atom.
*
* assertion: '^' Assertions match beginning of string
* (or line if the class static property
* RegExp.multiline is true).
* '$' End of string (or line if the class
* static property RegExp.multiline is
* true).
* '\b' Word boundary (between \w and \W).
* '\B' Word non-boundary.
*
* quantatom: atom An unquantified atom.
* quantatom '{' n ',' m '}'
* Atom must occur between n and m times.
* quantatom '{' n ',' '}' Atom must occur at least n times.
* quantatom '{' n '}' Atom must occur exactly n times.
* quantatom '*' Zero or more times (same as {0,}).
* quantatom '+' One or more times (same as {1,}).
* quantatom '?' Zero or one time (same as {0,1}).
*
* any of which can be optionally followed by '?' for ungreedy
*
* atom: '(' regexp ')' A parenthesized regexp (what matched
* can be addressed using a backreference,
* see '\' n below).
* '.' Matches any char except '\n'.
* '[' classlist ']' A character class.
* '[' '^' classlist ']' A negated character class.
* '\f' Form Feed.
* '\n' Newline (Line Feed).
* '\r' Carriage Return.
* '\t' Horizontal Tab.
* '\v' Vertical Tab.
* '\d' A digit (same as [0-9]).
* '\D' A non-digit.
* '\w' A word character, [0-9a-z_A-Z].
* '\W' A non-word character.
* '\s' A whitespace character, [ \b\f\n\r\t\v].
* '\S' A non-whitespace character.
* '\' n A backreference to the nth (n decimal
* and positive) parenthesized expression.
* '\' octal An octal escape sequence (octal must be
* two or three digits long, unless it is
* 0 for the null character).
* '\x' hex A hex escape (hex must be two digits).
* '\u' unicode A unicode escape (must be four digits).
* '\c' ctrl A control character, ctrl is a letter.
* '\' literalatomchar Any character except one of the above
* that follow '\' in an atom.
* otheratomchar Any character not first among the other
* atom right-hand sides.
*/
static BOOL
ParseTerm(CompilerState *state)
{
WCHAR c = *state->cp++;
UINT nDigits;
UINT num, tmp, n, i;
const WCHAR *termStart;
switch (c) {
/* assertions and atoms */
case '^':
state->result = NewRENode(state, REOP_BOL);
if (!state->result)
return FALSE;
state->progLength++;
return TRUE;
case '$':
state->result = NewRENode(state, REOP_EOL);
if (!state->result)
return FALSE;
state->progLength++;
return TRUE;
case '\\':
if (state->cp >= state->cpend) {
/* a trailing '\' is an error */
ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
return FALSE;
}
c = *state->cp++;
switch (c) {
/* assertion escapes */
case 'b' :
state->result = NewRENode(state, REOP_WBDRY);
if (!state->result)
return FALSE;
state->progLength++;
return TRUE;
case 'B':
state->result = NewRENode(state, REOP_WNONBDRY);
if (!state->result)
return FALSE;
state->progLength++;
return TRUE;
/* Decimal escape */
case '0':
/* Give a strict warning. See also the note below. */
WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
doOctal:
num = 0;
while (state->cp < state->cpend) {
c = *state->cp;
if (c < '0' || '7' < c)
break;
state->cp++;
tmp = 8 * num + (UINT)JS7_UNDEC(c);
if (tmp > 0377)
break;
num = tmp;
}
c = (WCHAR)num;
doFlat:
state->result = NewRENode(state, REOP_FLAT);
if (!state->result)
return FALSE;
state->result->u.flat.chr = c;
state->result->u.flat.length = 1;
state->progLength += 3;
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
termStart = state->cp - 1;
num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
if (state->flags & JSREG_FIND_PAREN_ERROR)
return FALSE;
if (num == OVERFLOW_VALUE) {
/* Give a strict mode warning. */
WARN("back-reference exceeds number of capturing parentheses\n");
/*
* Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
* error here. However, for compatibility with IE, we treat the
* whole backref as flat if the first character in it is not a
* valid octal character, and as an octal escape otherwise.
*/
state->cp = termStart;
if (c >= '8') {
/* Treat this as flat. termStart - 1 is the \. */
c = '\\';
goto asFlat;
}
/* Treat this as an octal escape. */
goto doOctal;
}
assert(1 <= num && num <= 0x10000);
state->result = NewRENode(state, REOP_BACKREF);
if (!state->result)
return FALSE;
state->result->u.parenIndex = num - 1;
state->progLength
+= 1 + GetCompactIndexWidth(state->result->u.parenIndex);
break;
/* Control escape */
case 'f':
c = 0xC;
goto doFlat;
case 'n':
c = 0xA;
goto doFlat;
case 'r':
c = 0xD;
goto doFlat;
case 't':
c = 0x9;
goto doFlat;
case 'v':
c = 0xB;
goto doFlat;
/* Control letter */
case 'c':
if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
c = (WCHAR) (*state->cp++ & 0x1F);
} else {
/* back off to accepting the original '\' as a literal */
--state->cp;
c = '\\';
}
goto doFlat;
/* HexEscapeSequence */
case 'x':
nDigits = 2;
goto lexHex;
/* UnicodeEscapeSequence */
case 'u':
nDigits = 4;
lexHex:
n = 0;
for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
UINT digit;
c = *state->cp++;
if (!isASCIIHexDigit(c, &digit)) {
/*
* Back off to accepting the original 'u' or 'x' as a
* literal.
*/
state->cp -= i + 2;
n = *state->cp++;
break;
}
n = (n << 4) | digit;
}
c = (WCHAR) n;
goto doFlat;
/* Character class escapes */
case 'd':
state->result = NewRENode(state, REOP_DIGIT);
doSimple:
if (!state->result)
return FALSE;
state->progLength++;
break;
case 'D':
state->result = NewRENode(state, REOP_NONDIGIT);
goto doSimple;
case 's':
state->result = NewRENode(state, REOP_SPACE);
goto doSimple;
case 'S':
state->result = NewRENode(state, REOP_NONSPACE);
goto doSimple;
case 'w':
state->result = NewRENode(state, REOP_ALNUM);
goto doSimple;
case 'W':
state->result = NewRENode(state, REOP_NONALNUM);
goto doSimple;
/* IdentityEscape */
default:
state->result = NewRENode(state, REOP_FLAT);
if (!state->result)
return FALSE;
state->result->u.flat.chr = c;
state->result->u.flat.length = 1;
state->result->kid = (void *) (state->cp - 1);
state->progLength += 3;
break;
}
break;
case '[':
state->result = NewRENode(state, REOP_CLASS);
if (!state->result)
return FALSE;
termStart = state->cp;
state->result->u.ucclass.startIndex = termStart - state->cpbegin;
for (;;) {
if (state->cp == state->cpend) {
ReportRegExpErrorHelper(state, JSREPORT_ERROR,
JSMSG_UNTERM_CLASS, termStart);
return FALSE;
}
if (*state->cp == '\\') {
state->cp++;
if (state->cp != state->cpend)
state->cp++;
continue;
}
if (*state->cp == ']') {
state->result->u.ucclass.kidlen = state->cp - termStart;
break;
}
state->cp++;
}
for (i = 0; i < CLASS_CACHE_SIZE; i++) {
if (!state->classCache[i].start) {
state->classCache[i].start = termStart;
state->classCache[i].length = state->result->u.ucclass.kidlen;
state->classCache[i].index = state->classCount;
break;
}
if (state->classCache[i].length ==
state->result->u.ucclass.kidlen) {
for (n = 0; ; n++) {
if (n == state->classCache[i].length) {
state->result->u.ucclass.index
= state->classCache[i].index;
goto claim;
}
if (state->classCache[i].start[n] != termStart[n])
break;
}
}
}
state->result->u.ucclass.index = state->classCount++;
claim:
/*
* Call CalculateBitmapSize now as we want any errors it finds
* to be reported during the parse phase, not at execution.
*/
if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
return FALSE;
/*
* Update classBitmapsMem with number of bytes to hold bmsize bits,
* which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
* or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
*/
n = (state->result->u.ucclass.bmsize >> 3) + 1;
if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
return FALSE;
}
state->classBitmapsMem += n;
/* CLASS, <index> */
state->progLength
+= 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
break;
case '.':
state->result = NewRENode(state, REOP_DOT);
goto doSimple;
case '{':
{
const WCHAR *errp = state->cp--;
INT err;
err = ParseMinMaxQuantifier(state, TRUE);
state->cp = errp;
if (err < 0)
goto asFlat;
/* FALL THROUGH */
}
case '*':
case '+':
case '?':
ReportRegExpErrorHelper(state, JSREPORT_ERROR,
JSMSG_BAD_QUANTIFIER, state->cp - 1);
return FALSE;
default:
asFlat:
state->result = NewRENode(state, REOP_FLAT);
if (!state->result)
return FALSE;
state->result->u.flat.chr = c;
state->result->u.flat.length = 1;
state->result->kid = (void *) (state->cp - 1);
state->progLength += 3;
break;
}
return ParseQuantifier(state);
}
/*
* Top-down regular expression grammar, based closely on Perl4.
*
* regexp: altern A regular expression is one or more
* altern '|' regexp alternatives separated by vertical bar.
*/
#define INITIAL_STACK_SIZE 128
static BOOL
ParseRegExp(CompilerState *state)
{
size_t parenIndex;
RENode *operand;
REOpData *operatorStack;
RENode **operandStack;
REOp op;
INT i;
BOOL result = FALSE;
INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
/* Watch out for empty regexp */
if (state->cp == state->cpend) {
state->result = NewRENode(state, REOP_EMPTY);
return (state->result != NULL);
}
operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
if (!operatorStack)
return FALSE;
operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
if (!operandStack)
goto out;
for (;;) {
parenIndex = state->parenCount;
if (state->cp == state->cpend) {
/*
* If we are at the end of the regexp and we're short one or more
* operands, the regexp must have the form /x|/ or some such, with
* left parentheses making us short more than one operand.
*/
if (operatorSP >= operandSP) {
operand = NewRENode(state, REOP_EMPTY);
if (!operand)
goto out;
goto pushOperand;
}
} else {
switch (*state->cp) {
case '(':
++state->cp;
if (state->cp + 1 < state->cpend &&
*state->cp == '?' &&
(state->cp[1] == '=' ||
state->cp[1] == '!' ||
state->cp[1] == ':')) {
switch (state->cp[1]) {
case '=':
op = REOP_ASSERT;
/* ASSERT, <next>, ... ASSERTTEST */
state->progLength += 4;
break;
case '!':
op = REOP_ASSERT_NOT;
/* ASSERTNOT, <next>, ... ASSERTNOTTEST */
state->progLength += 4;
break;
default:
op = REOP_LPARENNON;
break;
}
state->cp += 2;
} else {
op = REOP_LPAREN;
/* LPAREN, <index>, ... RPAREN, <index> */
state->progLength
+= 2 * (1 + GetCompactIndexWidth(parenIndex));
state->parenCount++;
if (state->parenCount == 65535) {
ReportRegExpError(state, JSREPORT_ERROR,
JSMSG_TOO_MANY_PARENS);
goto out;
}
}
goto pushOperator;
case ')':
/*
* If there's no stacked open parenthesis, throw syntax error.
*/
for (i = operatorSP - 1; ; i--) {
if (i < 0) {
ReportRegExpError(state, JSREPORT_ERROR,
JSMSG_UNMATCHED_RIGHT_PAREN);
goto out;
}
if (operatorStack[i].op == REOP_ASSERT ||
operatorStack[i].op == REOP_ASSERT_NOT ||
operatorStack[i].op == REOP_LPARENNON ||
operatorStack[i].op == REOP_LPAREN) {
break;
}
}
/* FALL THROUGH */
case '|':
/* Expected an operand before these, so make an empty one */
operand = NewRENode(state, REOP_EMPTY);
if (!operand)
goto out;
goto pushOperand;
default:
if (!ParseTerm(state))
goto out;
operand = state->result;
pushOperand:
if (operandSP == operandStackSize) {
RENode **tmp;
operandStackSize += operandStackSize;
tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
if (!tmp)
goto out;
operandStack = tmp;
}
operandStack[operandSP++] = operand;
break;
}
}
/* At the end; process remaining operators. */
restartOperator:
if (state->cp == state->cpend) {
while (operatorSP) {
--operatorSP;
if (!ProcessOp(state, &operatorStack[operatorSP],
operandStack, operandSP))
goto out;
--operandSP;
}
assert(operandSP == 1);
state->result = operandStack[0];
result = TRUE;
goto out;
}
switch (*state->cp) {
case '|':
/* Process any stacked 'concat' operators */
++state->cp;
while (operatorSP &&
operatorStack[operatorSP - 1].op == REOP_CONCAT) {
--operatorSP;
if (!ProcessOp(state, &operatorStack[operatorSP],
operandStack, operandSP)) {
goto out;
}
--operandSP;
}
op = REOP_ALT;
goto pushOperator;
case ')':
/*
* If there's no stacked open parenthesis, throw syntax error.
*/
for (i = operatorSP - 1; ; i--) {
if (i < 0) {
ReportRegExpError(state, JSREPORT_ERROR,
JSMSG_UNMATCHED_RIGHT_PAREN);
goto out;
}
if (operatorStack[i].op == REOP_ASSERT ||
operatorStack[i].op == REOP_ASSERT_NOT ||
operatorStack[i].op == REOP_LPARENNON ||
operatorStack[i].op == REOP_LPAREN) {
break;
}
}
++state->cp;
/* Process everything on the stack until the open parenthesis. */
for (;;) {
assert(operatorSP);
--operatorSP;
switch (operatorStack[operatorSP].op) {
case REOP_ASSERT:
case REOP_ASSERT_NOT:
case REOP_LPAREN:
operand = NewRENode(state, operatorStack[operatorSP].op);
if (!operand)
goto out;
operand->u.parenIndex =
operatorStack[operatorSP].parenIndex;
assert(operandSP);
operand->kid = operandStack[operandSP - 1];
operandStack[operandSP - 1] = operand;
if (state->treeDepth == TREE_DEPTH_MAX) {
ReportRegExpError(state, JSREPORT_ERROR,
JSMSG_REGEXP_TOO_COMPLEX);
goto out;
}
++state->treeDepth;
/* FALL THROUGH */
case REOP_LPARENNON:
state->result = operandStack[operandSP - 1];
if (!ParseQuantifier(state))
goto out;
operandStack[operandSP - 1] = state->result;
goto restartOperator;
default:
if (!ProcessOp(state, &operatorStack[operatorSP],
operandStack, operandSP))
goto out;
--operandSP;
break;
}
}
break;
case '{':
{
const WCHAR *errp = state->cp;
if (ParseMinMaxQuantifier(state, TRUE) < 0) {
/*
* This didn't even scan correctly as a quantifier, so we should
* treat it as flat.
*/
op = REOP_CONCAT;
goto pushOperator;
}
state->cp = errp;
/* FALL THROUGH */
}
case '+':
case '*':
case '?':
ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
state->cp);
result = FALSE;
goto out;
default:
/* Anything else is the start of the next term. */
op = REOP_CONCAT;
pushOperator:
if (operatorSP == operatorStackSize) {
REOpData *tmp;
operatorStackSize += operatorStackSize;
tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
if (!tmp)
goto out;
operatorStack = tmp;
}
operatorStack[operatorSP].op = op;
operatorStack[operatorSP].errPos = state->cp;
operatorStack[operatorSP++].parenIndex = parenIndex;
break;
}
}
out:
heap_free(operatorStack);
heap_free(operandStack);
return result;
}
/*
* Save the current state of the match - the position in the input
* text as well as the position in the bytecode. The state of any
* parent expressions is also saved (preceding state).
* Contents of parenCount parentheses from parenIndex are also saved.
*/
static REBackTrackData *
PushBackTrackState(REGlobalData *gData, REOp op,
jsbytecode *target, REMatchState *x, const WCHAR *cp,
size_t parenIndex, size_t parenCount)
{
size_t i;
REBackTrackData *result =
(REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
size_t sz = sizeof(REBackTrackData) +
gData->stateStackTop * sizeof(REProgState) +
parenCount * sizeof(RECapture);
ptrdiff_t btsize = gData->backTrackStackSize;
ptrdiff_t btincr = ((char *)result + sz) -
((char *)gData->backTrackStack + btsize);
TRACE("\tBT_Push: %lu,%lu\n", (unsigned long) parenIndex, (unsigned long) parenCount);
JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
if (btincr > 0) {
ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
btincr = ((btincr+btsize-1)/btsize)*btsize;
gData->backTrackStack = jsheap_grow(gData->pool, gData->backTrackStack, btsize, btincr);
if (!gData->backTrackStack) {
js_ReportOutOfScriptQuota(gData->cx);
gData->ok = FALSE;
return NULL;
}
gData->backTrackStackSize = btsize + btincr;
result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
}
gData->backTrackSP = result;
result->sz = gData->cursz;
gData->cursz = sz;
result->backtrack_op = op;
result->backtrack_pc = target;
result->cp = cp;
result->parenCount = parenCount;
result->parenIndex = parenIndex;
result->saveStateStackTop = gData->stateStackTop;
assert(gData->stateStackTop);
memcpy(result + 1, gData->stateStack,
sizeof(REProgState) * result->saveStateStackTop);
if (parenCount != 0) {
memcpy((char *)(result + 1) +
sizeof(REProgState) * result->saveStateStackTop,
&x->parens[parenIndex],
sizeof(RECapture) * parenCount);
for (i = 0; i != parenCount; i++)
x->parens[parenIndex + i].index = -1;
}
return result;
}
static inline REMatchState *
FlatNIMatcher(REGlobalData *gData, REMatchState *x, WCHAR *matchChars,
size_t length)
{
size_t i;
assert(gData->cpend >= x->cp);
if (length > (size_t)(gData->cpend - x->cp))
return NULL;
for (i = 0; i != length; i++) {
if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
return NULL;
}
x->cp += length;
return x;
}
/*
* 1. Evaluate DecimalEscape to obtain an EscapeValue E.
* 2. If E is not a character then go to step 6.
* 3. Let ch be E's character.
* 4. Let A be a one-element RECharSet containing the character ch.
* 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
* 6. E must be an integer. Let n be that integer.
* 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
* 8. Return an internal Matcher closure that takes two arguments, a State x
* and a Continuation c, and performs the following:
* 1. Let cap be x's captures internal array.
* 2. Let s be cap[n].
* 3. If s is undefined, then call c(x) and return its result.
* 4. Let e be x's endIndex.
* 5. Let len be s's length.
* 6. Let f be e+len.
* 7. If f>InputLength, return failure.
* 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
* such that Canonicalize(s[i]) is not the same character as
* Canonicalize(Input [e+i]), then return failure.
* 9. Let y be the State (f, cap).
* 10. Call c(y) and return its result.
*/
static REMatchState *
BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
{
size_t len, i;
const WCHAR *parenContent;
RECapture *cap = &x->parens[parenIndex];
if (cap->index == -1)
return x;
len = cap->length;
if (x->cp + len > gData->cpend)
return NULL;
parenContent = &gData->cpbegin[cap->index];
if (gData->regexp->flags & JSREG_FOLD) {
for (i = 0; i < len; i++) {
if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
return NULL;
}
} else {
for (i = 0; i < len; i++) {
if (parenContent[i] != x->cp[i])
return NULL;
}
}
x->cp += len;
return x;
}
/* Add a single character to the RECharSet */
static void
AddCharacterToCharSet(RECharSet *cs, WCHAR c)
{
UINT byteIndex = (UINT)(c >> 3);
assert(c <= cs->length);
cs->u.bits[byteIndex] |= 1 << (c & 0x7);
}
/* Add a character range, c1 to c2 (inclusive) to the RECharSet */
static void
AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
{
UINT i;
UINT byteIndex1 = c1 >> 3;
UINT byteIndex2 = c2 >> 3;
assert(c2 <= cs->length && c1 <= c2);
c1 &= 0x7;
c2 &= 0x7;
if (byteIndex1 == byteIndex2) {
cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
} else {
cs->u.bits[byteIndex1] |= 0xFF << c1;
for (i = byteIndex1 + 1; i < byteIndex2; i++)
cs->u.bits[i] = 0xFF;
cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
}
}
/* Compile the source of the class into a RECharSet */
static BOOL
ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
{
const WCHAR *src, *end;
BOOL inRange = FALSE;
WCHAR rangeStart = 0;
UINT byteLength, n;
WCHAR c, thisCh;
INT nDigits, i;
assert(!charSet->converted);
/*
* Assert that startIndex and length points to chars inside [] inside
* source string.
*/
assert(1 <= charSet->u.src.startIndex);
assert(charSet->u.src.startIndex
< SysStringLen(gData->regexp->source));
assert(charSet->u.src.length <= SysStringLen(gData->regexp->source)
- 1 - charSet->u.src.startIndex);
charSet->converted = TRUE;
src = gData->regexp->source + charSet->u.src.startIndex;
end = src + charSet->u.src.length;
assert(src[-1] == '[' && end[0] == ']');
byteLength = (charSet->length >> 3) + 1;
charSet->u.bits = heap_alloc(byteLength);
if (!charSet->u.bits) {
JS_ReportOutOfMemory(gData->cx);
gData->ok = FALSE;
return FALSE;
}
memset(charSet->u.bits, 0, byteLength);
if (src == end)
return TRUE;
if (*src == '^') {
assert(charSet->sense == FALSE);
++src;
} else {
assert(charSet->sense == TRUE);
}
while (src != end) {
switch (*src) {
case '\\':
++src;
c = *src++;
switch (c) {
case 'b':
thisCh = 0x8;
break;
case 'f':
thisCh = 0xC;
break;
case 'n':
thisCh = 0xA;
break;
case 'r':
thisCh = 0xD;
break;
case 't':
thisCh = 0x9;
break;
case 'v':
thisCh = 0xB;
break;
case 'c':
if (src < end && JS_ISWORD(*src)) {
thisCh = (WCHAR)(*src++ & 0x1F);
} else {
--src;
thisCh = '\\';
}
break;
case 'x':
nDigits = 2;
goto lexHex;
case 'u':
nDigits = 4;
lexHex:
n = 0;
for (i = 0; (i < nDigits) && (src < end); i++) {
UINT digit;
c = *src++;
if (!isASCIIHexDigit(c, &digit)) {
/*
* Back off to accepting the original '\'
* as a literal
*/
src -= i + 1;
n = '\\';
break;
}
n = (n << 4) | digit;
}
thisCh = (WCHAR)n;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
/*
* This is a non-ECMA extension - decimal escapes (in this
* case, octal!) are supposed to be an error inside class
* ranges, but supported here for backwards compatibility.
*/
n = JS7_UNDEC(c);
c = *src;
if ('0' <= c && c <= '7') {
src++;
n = 8 * n + JS7_UNDEC(c);
c = *src;
if ('0' <= c && c <= '7') {
src++;
i = 8 * n + JS7_UNDEC(c);
if (i <= 0377)
n = i;
else
src--;
}
}
thisCh = (WCHAR)n;
break;
case 'd':
AddCharacterRangeToCharSet(charSet, '0', '9');
continue; /* don't need range processing */
case 'D':
AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
AddCharacterRangeToCharSet(charSet,
(WCHAR)('9' + 1),
(WCHAR)charSet->length);
continue;
case 's':
for (i = (INT)charSet->length; i >= 0; i--)
if (isspaceW(i))
AddCharacterToCharSet(charSet, (WCHAR)i);
continue;
case 'S':
for (i = (INT)charSet->length; i >= 0; i--)
if (!isspaceW(i))
AddCharacterToCharSet(charSet, (WCHAR)i);
continue;
case 'w':
for (i = (INT)charSet->length; i >= 0; i--)
if (JS_ISWORD(i))
AddCharacterToCharSet(charSet, (WCHAR)i);
continue;
case 'W':
for (i = (INT)charSet->length; i >= 0; i--)
if (!JS_ISWORD(i))
AddCharacterToCharSet(charSet, (WCHAR)i);
continue;
default:
thisCh = c;
break;
}
break;
default:
thisCh = *src++;
break;
}
if (inRange) {
if (gData->regexp->flags & JSREG_FOLD) {
int i;
assert(rangeStart <= thisCh);
for (i = rangeStart; i <= thisCh; i++) {
WCHAR uch, dch;
AddCharacterToCharSet(charSet, i);
uch = toupperW(i);
dch = tolowerW(i);
if (i != uch)
AddCharacterToCharSet(charSet, uch);
if (i != dch)
AddCharacterToCharSet(charSet, dch);
}
} else {
AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
}
inRange = FALSE;
} else {
if (gData->regexp->flags & JSREG_FOLD) {
AddCharacterToCharSet(charSet, toupperW(thisCh));
AddCharacterToCharSet(charSet, tolowerW(thisCh));
} else {
AddCharacterToCharSet(charSet, thisCh);
}
if (src < end - 1) {
if (*src == '-') {
++src;
inRange = TRUE;
rangeStart = thisCh;
}
}
}
}
return TRUE;
}
static BOOL
ReallocStateStack(REGlobalData *gData)
{
size_t limit = gData->stateStackLimit;
size_t sz = sizeof(REProgState) * limit;
gData->stateStack = jsheap_grow(gData->pool, gData->stateStack, sz, sz);
if (!gData->stateStack) {
js_ReportOutOfScriptQuota(gData->cx);
gData->ok = FALSE;
return FALSE;
}
gData->stateStackLimit = limit + limit;
return TRUE;
}
#define PUSH_STATE_STACK(data) \
do { \
++(data)->stateStackTop; \
if ((data)->stateStackTop == (data)->stateStackLimit && \
!ReallocStateStack((data))) { \
return NULL; \
} \
}while(0)
/*
* Apply the current op against the given input to see if it's going to match
* or fail. Return false if we don't get a match, true if we do. If updatecp is
* true, then update the current state's cp. Always update startpc to the next
* op.
*/
static inline REMatchState *
SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
jsbytecode **startpc, BOOL updatecp)
{
REMatchState *result = NULL;
WCHAR matchCh;
size_t parenIndex;
size_t offset, length, index;
jsbytecode *pc = *startpc; /* pc has already been incremented past op */
WCHAR *source;
const WCHAR *startcp = x->cp;
WCHAR ch;
RECharSet *charSet;
const char *opname = reop_names[op];
TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
(int)gData->stateStackTop * 2, "", opname);
switch (op) {
case REOP_EMPTY:
result = x;
break;
case REOP_BOL:
if (x->cp != gData->cpbegin) {
if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
!(gData->regexp->flags & JSREG_MULTILINE)) {
break;
}
if (!RE_IS_LINE_TERM(x->cp[-1]))
break;
}
result = x;
break;
case REOP_EOL:
if (x->cp != gData->cpend) {
if (/*!gData->cx->regExpStatics.multiline &&*/
!(gData->regexp->flags & JSREG_MULTILINE)) {
break;
}
if (!RE_IS_LINE_TERM(*x->cp))
break;
}
result = x;
break;
case REOP_WBDRY:
if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
!(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
result = x;
}
break;
case REOP_WNONBDRY:
if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
result = x;
}
break;
case REOP_DOT:
if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
result = x;
result->cp++;
}
break;
case REOP_DIGIT:
if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
result = x;
result->cp++;
}
break;
case REOP_NONDIGIT:
if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
result = x;
result->cp++;
}
break;
case REOP_ALNUM:
if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
result = x;
result->cp++;
}
break;
case REOP_NONALNUM:
if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
result = x;
result->cp++;
}
break;
case REOP_SPACE:
if (x->cp != gData->cpend && isspaceW(*x->cp)) {
result = x;
result->cp++;
}
break;
case REOP_NONSPACE:
if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
result = x;
result->cp++;
}
break;
case REOP_BACKREF:
pc = ReadCompactIndex(pc, &parenIndex);
assert(parenIndex < gData->regexp->parenCount);
result = BackrefMatcher(gData, x, parenIndex);
break;
case REOP_FLAT:
pc = ReadCompactIndex(pc, &offset);
assert(offset < SysStringLen(gData->regexp->source));
pc = ReadCompactIndex(pc, &length);
assert(1 <= length);
assert(length <= SysStringLen(gData->regexp->source) - offset);
if (length <= (size_t)(gData->cpend - x->cp)) {
source = gData->regexp->source + offset;
TRACE("%s\n", debugstr_wn(source, length));
for (index = 0; index != length; index++) {
if (source[index] != x->cp[index])
return NULL;
}
x->cp += length;
result = x;
}
break;
case REOP_FLAT1:
matchCh = *pc++;
TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
if (x->cp != gData->cpend && *x->cp == matchCh) {
result = x;
result->cp++;
}
break;
case REOP_FLATi:
pc = ReadCompactIndex(pc, &offset);
assert(offset < SysStringLen(gData->regexp->source));
pc = ReadCompactIndex(pc, &length);
assert(1 <= length);
assert(length <= SysStringLen(gData->regexp->source) - offset);
source = gData->regexp->source;
result = FlatNIMatcher(gData, x, source + offset, length);
break;
case REOP_FLAT1i:
matchCh = *pc++;
if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
result = x;
result->cp++;
}
break;
case REOP_UCFLAT1:
matchCh = GET_ARG(pc);
TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
pc += ARG_LEN;
if (x->cp != gData->cpend && *x->cp == matchCh) {
result = x;
result->cp++;
}
break;
case REOP_UCFLAT1i:
matchCh = GET_ARG(pc);
pc += ARG_LEN;
if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
result = x;
result->cp++;
}
break;
case REOP_CLASS:
pc = ReadCompactIndex(pc, &index);
assert(index < gData->regexp->classCount);
if (x->cp != gData->cpend) {
charSet = &gData->regexp->classList[index];
assert(charSet->converted);
ch = *x->cp;
index = ch >> 3;
if (charSet->length != 0 &&
ch <= charSet->length &&
(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
result = x;
result->cp++;
}
}
break;
case REOP_NCLASS:
pc = ReadCompactIndex(pc, &index);
assert(index < gData->regexp->classCount);
if (x->cp != gData->cpend) {
charSet = &gData->regexp->classList[index];
assert(charSet->converted);
ch = *x->cp;
index = ch >> 3;
if (charSet->length == 0 ||
ch > charSet->length ||
!(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
result = x;
result->cp++;
}
}
break;
default:
assert(FALSE);
}
if (result) {
if (!updatecp)
x->cp = startcp;
*startpc = pc;
TRACE(" *\n");
return result;
}
x->cp = startcp;
return NULL;
}
static inline REMatchState *
ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
{
REMatchState *result = NULL;
REBackTrackData *backTrackData;
jsbytecode *nextpc, *testpc;
REOp nextop;
RECapture *cap;
REProgState *curState;
const WCHAR *startcp;
size_t parenIndex, k;
size_t parenSoFar = 0;
WCHAR matchCh1, matchCh2;
RECharSet *charSet;
BOOL anchor;
jsbytecode *pc = gData->regexp->program;
REOp op = (REOp) *pc++;
/*
* If the first node is a simple match, step the index into the string
* until that match is made, or fail if it can't be found at all.
*/
if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
anchor = FALSE;
while (x->cp <= gData->cpend) {
nextpc = pc; /* reset back to start each time */
result = SimpleMatch(gData, x, op, &nextpc, TRUE);
if (result) {
anchor = TRUE;
x = result;
pc = nextpc; /* accept skip to next opcode */
op = (REOp) *pc++;
assert(op < REOP_LIMIT);
break;
}
gData->skipped++;
x->cp++;
}
if (!anchor)
goto bad;
}
for (;;) {
const char *opname = reop_names[op];
TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
(int)gData->stateStackTop * 2, "", opname);
if (REOP_IS_SIMPLE(op)) {
result = SimpleMatch(gData, x, op, &pc, TRUE);
} else {
curState = &gData->stateStack[gData->stateStackTop];
switch (op) {
case REOP_END:
goto good;
case REOP_ALTPREREQ2:
nextpc = pc + GET_OFFSET(pc); /* start of next op */
pc += ARG_LEN;
matchCh2 = GET_ARG(pc);
pc += ARG_LEN;
k = GET_ARG(pc);
pc += ARG_LEN;
if (x->cp != gData->cpend) {
if (*x->cp == matchCh2)
goto doAlt;
charSet = &gData->regexp->classList[k];
if (!charSet->converted && !ProcessCharSet(gData, charSet))
goto bad;
matchCh1 = *x->cp;
k = matchCh1 >> 3;
if ((charSet->length == 0 ||
matchCh1 > charSet->length ||
!(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
charSet->sense) {
goto doAlt;
}
}
result = NULL;
break;
case REOP_ALTPREREQ:
nextpc = pc + GET_OFFSET(pc); /* start of next op */
pc += ARG_LEN;
matchCh1 = GET_ARG(pc);
pc += ARG_LEN;
matchCh2 = GET_ARG(pc);
pc += ARG_LEN;
if (x->cp == gData->cpend ||
(*x->cp != matchCh1 && *x->cp != matchCh2)) {
result = NULL;
break;
}
/* else false thru... */
case REOP_ALT:
doAlt:
nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
pc += ARG_LEN; /* start of this alternate */
curState->parenSoFar = parenSoFar;
PUSH_STATE_STACK(gData);
op = (REOp) *pc++;
startcp = x->cp;
if (REOP_IS_SIMPLE(op)) {
if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
op = (REOp) *nextpc++;
pc = nextpc;
continue;
}
result = x;
op = (REOp) *pc++;
}
nextop = (REOp) *nextpc++;
if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
goto bad;
continue;
/*
* Occurs at (successful) end of REOP_ALT,
*/
case REOP_JUMP:
/*
* If we have not gotten a result here, it is because of an
* empty match. Do the same thing REOP_EMPTY would do.
*/
if (!result)
result = x;
--gData->stateStackTop;
pc += GET_OFFSET(pc);
op = (REOp) *pc++;
continue;
/*
* Occurs at last (successful) end of REOP_ALT,
*/
case REOP_ENDALT:
/*
* If we have not gotten a result here, it is because of an
* empty match. Do the same thing REOP_EMPTY would do.
*/
if (!result)
result = x;
--gData->stateStackTop;
op = (REOp) *pc++;
continue;
case REOP_LPAREN:
pc = ReadCompactIndex(pc, &parenIndex);
TRACE("[ %lu ]\n", (unsigned long) parenIndex);
assert(parenIndex < gData->regexp->parenCount);
if (parenIndex + 1 > parenSoFar)
parenSoFar = parenIndex + 1;
x->parens[parenIndex].index = x->cp - gData->cpbegin;
x->parens[parenIndex].length = 0;
op = (REOp) *pc++;
continue;
case REOP_RPAREN:
{
ptrdiff_t delta;
pc = ReadCompactIndex(pc, &parenIndex);
assert(parenIndex < gData->regexp->parenCount);
cap = &x->parens[parenIndex];
delta = x->cp - (gData->cpbegin + cap->index);
cap->length = (delta < 0) ? 0 : (size_t) delta;
op = (REOp) *pc++;
if (!result)
result = x;
continue;
}
case REOP_ASSERT:
nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
pc += ARG_LEN; /* start of ASSERT child */
op = (REOp) *pc++;
testpc = pc;
if (REOP_IS_SIMPLE(op) &&
!SimpleMatch(gData, x, op, &testpc, FALSE)) {
result = NULL;
break;
}
curState->u.assertion.top =
(char *)gData->backTrackSP - (char *)gData->backTrackStack;
curState->u.assertion.sz = gData->cursz;
curState->index = x->cp - gData->cpbegin;
curState->parenSoFar = parenSoFar;
PUSH_STATE_STACK(gData);
if (!PushBackTrackState(gData, REOP_ASSERTTEST,
nextpc, x, x->cp, 0, 0)) {
goto bad;
}
continue;
case REOP_ASSERT_NOT:
nextpc = pc + GET_OFFSET(pc);
pc += ARG_LEN;
op = (REOp) *pc++;
testpc = pc;
if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
SimpleMatch(gData, x, op, &testpc, FALSE) &&
*testpc == REOP_ASSERTNOTTEST) {
result = NULL;
break;
}
curState->u.assertion.top
= (char *)gData->backTrackSP -
(char *)gData->backTrackStack;
curState->u.assertion.sz = gData->cursz;
curState->index = x->cp - gData->cpbegin;
curState->parenSoFar = parenSoFar;
PUSH_STATE_STACK(gData);
if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
nextpc, x, x->cp, 0, 0)) {
goto bad;
}
continue;
case REOP_ASSERTTEST:
--gData->stateStackTop;
--curState;
x->cp = gData->cpbegin + curState->index;
gData->backTrackSP =
(REBackTrackData *) ((char *)gData->backTrackStack +
curState->u.assertion.top);
gData->cursz = curState->u.assertion.sz;
if (result)
result = x;
break;
case REOP_ASSERTNOTTEST:
--gData->stateStackTop;
--curState;
x->cp = gData->cpbegin + curState->index;
gData->backTrackSP =
(REBackTrackData *) ((char *)gData->backTrackStack +
curState->u.assertion.top);
gData->cursz = curState->u.assertion.sz;
result = (!result) ? x : NULL;
break;
case REOP_STAR:
curState->u.quantifier.min = 0;
curState->u.quantifier.max = (UINT)-1;
goto quantcommon;
case REOP_PLUS:
curState->u.quantifier.min = 1;
curState->u.quantifier.max = (UINT)-1;
goto quantcommon;
case REOP_OPT:
curState->u.quantifier.min = 0;
curState->u.quantifier.max = 1;
goto quantcommon;
case REOP_QUANT:
pc = ReadCompactIndex(pc, &k);
curState->u.quantifier.min = k;
pc = ReadCompactIndex(pc, &k);
/* max is k - 1 to use one byte for (UINT)-1 sentinel. */
curState->u.quantifier.max = k - 1;
assert(curState->u.quantifier.min <= curState->u.quantifier.max);
quantcommon:
if (curState->u.quantifier.max == 0) {
pc = pc + GET_OFFSET(pc);
op = (REOp) *pc++;
result = x;
continue;
}
/* Step over <next> */
nextpc = pc + ARG_LEN;
op = (REOp) *nextpc++;
startcp = x->cp;
if (REOP_IS_SIMPLE(op)) {
if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
if (curState->u.quantifier.min == 0)
result = x;
else
result = NULL;
pc = pc + GET_OFFSET(pc);
break;
}
op = (REOp) *nextpc++;
result = x;
}
curState->index = startcp - gData->cpbegin;
curState->continue_op = REOP_REPEAT;
curState->continue_pc = pc;
curState->parenSoFar = parenSoFar;
PUSH_STATE_STACK(gData);
if (curState->u.quantifier.min == 0 &&
!PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
0, 0)) {
goto bad;
}
pc = nextpc;
continue;
case REOP_ENDCHILD: /* marks the end of a quantifier child */
pc = curState[-1].continue_pc;
op = (REOp) curState[-1].continue_op;
if (!result)
result = x;
continue;
case REOP_REPEAT:
--curState;
do {
--gData->stateStackTop;
if (!result) {
/* Failed, see if we have enough children. */
if (curState->u.quantifier.min == 0)
goto repeatDone;
goto break_switch;
}
if (curState->u.quantifier.min == 0 &&
x->cp == gData->cpbegin + curState->index) {
/* matched an empty string, that'll get us nowhere */
result = NULL;
goto break_switch;
}
if (curState->u.quantifier.min != 0)
curState->u.quantifier.min--;
if (curState->u.quantifier.max != (UINT) -1)
curState->u.quantifier.max--;
if (curState->u.quantifier.max == 0)
goto repeatDone;
nextpc = pc + ARG_LEN;
nextop = (REOp) *nextpc;
startcp = x->cp;
if (REOP_IS_SIMPLE(nextop)) {
nextpc++;
if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
if (curState->u.quantifier.min == 0)
goto repeatDone;
result = NULL;
goto break_switch;
}
result = x;
}
curState->index = startcp - gData->cpbegin;
PUSH_STATE_STACK(gData);
if (curState->u.quantifier.min == 0 &&
!PushBackTrackState(gData, REOP_REPEAT,
pc, x, startcp,
curState->parenSoFar,
parenSoFar -
curState->parenSoFar)) {
goto bad;
}
} while (*nextpc == REOP_ENDCHILD);
pc = nextpc;
op = (REOp) *pc++;
parenSoFar = curState->parenSoFar;
continue;
repeatDone:
result = x;
pc += GET_OFFSET(pc);
goto break_switch;
case REOP_MINIMALSTAR:
curState->u.quantifier.min = 0;
curState->u.quantifier.max = (UINT)-1;
goto minimalquantcommon;
case REOP_MINIMALPLUS:
curState->u.quantifier.min = 1;
curState->u.quantifier.max = (UINT)-1;
goto minimalquantcommon;
case REOP_MINIMALOPT:
curState->u.quantifier.min = 0;
curState->u.quantifier.max = 1;
goto minimalquantcommon;
case REOP_MINIMALQUANT:
pc = ReadCompactIndex(pc, &k);
curState->u.quantifier.min = k;
pc = ReadCompactIndex(pc, &k);
/* See REOP_QUANT comments about k - 1. */
curState->u.quantifier.max = k - 1;
assert(curState->u.quantifier.min
<= curState->u.quantifier.max);
minimalquantcommon:
curState->index = x->cp - gData->cpbegin;
curState->parenSoFar = parenSoFar;
PUSH_STATE_STACK(gData);
if (curState->u.quantifier.min != 0) {
curState->continue_op = REOP_MINIMALREPEAT;
curState->continue_pc = pc;
/* step over <next> */
pc += OFFSET_LEN;
op = (REOp) *pc++;
} else {
if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
pc, x, x->cp, 0, 0)) {
goto bad;
}
--gData->stateStackTop;
pc = pc + GET_OFFSET(pc);
op = (REOp) *pc++;
}
continue;
case REOP_MINIMALREPEAT:
--gData->stateStackTop;
--curState;
TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
#define PREPARE_REPEAT() \
do { \
curState->index = x->cp - gData->cpbegin; \
curState->continue_op = REOP_MINIMALREPEAT; \
curState->continue_pc = pc; \
pc += ARG_LEN; \
for (k = curState->parenSoFar; k < parenSoFar; k++) \
x->parens[k].index = -1; \
PUSH_STATE_STACK(gData); \
op = (REOp) *pc++; \
assert(op < REOP_LIMIT); \
}while(0)
if (!result) {
TRACE(" -\n");
/*
* Non-greedy failure - try to consume another child.
*/
if (curState->u.quantifier.max == (UINT) -1 ||
curState->u.quantifier.max > 0) {
PREPARE_REPEAT();
continue;
}
/* Don't need to adjust pc since we're going to pop. */
break;
}
if (curState->u.quantifier.min == 0 &&
x->cp == gData->cpbegin + curState->index) {
/* Matched an empty string, that'll get us nowhere. */
result = NULL;
break;
}
if (curState->u.quantifier.min != 0)
curState->u.quantifier.min--;
if (curState->u.quantifier.max != (UINT) -1)
curState->u.quantifier.max--;
if (curState->u.quantifier.min != 0) {
PREPARE_REPEAT();
continue;
}
curState->index = x->cp - gData->cpbegin;
curState->parenSoFar = parenSoFar;
PUSH_STATE_STACK(gData);
if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
pc, x, x->cp,
curState->parenSoFar,
parenSoFar - curState->parenSoFar)) {
goto bad;
}
--gData->stateStackTop;
pc = pc + GET_OFFSET(pc);
op = (REOp) *pc++;
assert(op < REOP_LIMIT);
continue;
default:
assert(FALSE);
result = NULL;
}
break_switch:;
}
/*
* If the match failed and there's a backtrack option, take it.
* Otherwise this is a complete and utter failure.
*/
if (!result) {
if (gData->cursz == 0)
return NULL;
/* Potentially detect explosive regex here. */
gData->backTrackCount++;
if (gData->backTrackLimit &&
gData->backTrackCount >= gData->backTrackLimit) {
JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
JSMSG_REGEXP_TOO_COMPLEX);
gData->ok = FALSE;
return NULL;
}
backTrackData = gData->backTrackSP;
gData->cursz = backTrackData->sz;
gData->backTrackSP =
(REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
x->cp = backTrackData->cp;
pc = backTrackData->backtrack_pc;
op = (REOp) backTrackData->backtrack_op;
assert(op < REOP_LIMIT);
gData->stateStackTop = backTrackData->saveStateStackTop;
assert(gData->stateStackTop);
memcpy(gData->stateStack, backTrackData + 1,
sizeof(REProgState) * backTrackData->saveStateStackTop);
curState = &gData->stateStack[gData->stateStackTop - 1];
if (backTrackData->parenCount) {
memcpy(&x->parens[backTrackData->parenIndex],
(char *)(backTrackData + 1) +
sizeof(REProgState) * backTrackData->saveStateStackTop,
sizeof(RECapture) * backTrackData->parenCount);
parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
} else {
for (k = curState->parenSoFar; k < parenSoFar; k++)
x->parens[k].index = -1;
parenSoFar = curState->parenSoFar;
}
TRACE("\tBT_Pop: %ld,%ld\n",
(unsigned long) backTrackData->parenIndex,
(unsigned long) backTrackData->parenCount);
continue;
}
x = result;
/*
* Continue with the expression.
*/
op = (REOp)*pc++;
assert(op < REOP_LIMIT);
}
bad:
TRACE("\n");
return NULL;
good:
TRACE("\n");
return x;
}
static REMatchState *MatchRegExp(REGlobalData *gData, REMatchState *x)
{
REMatchState *result;
const WCHAR *cp = x->cp;
const WCHAR *cp2;
UINT j;
/*
* Have to include the position beyond the last character
* in order to detect end-of-input/line condition.
*/
for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
gData->skipped = cp2 - cp;
x->cp = cp2;
for (j = 0; j < gData->regexp->parenCount; j++)
x->parens[j].index = -1;
result = ExecuteREBytecode(gData, x);
if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY))
return result;
gData->backTrackSP = gData->backTrackStack;
gData->cursz = 0;
gData->stateStackTop = 0;
cp2 = cp + gData->skipped;
}
return NULL;
}
#define MIN_BACKTRACK_LIMIT 400000
static REMatchState *InitMatch(script_ctx_t *cx, REGlobalData *gData, JSRegExp *re, size_t length)
{
REMatchState *result;
UINT i;
gData->backTrackStackSize = INITIAL_BACKTRACK;
gData->backTrackStack = jsheap_alloc(gData->pool, INITIAL_BACKTRACK);
if (!gData->backTrackStack)
goto bad;
gData->backTrackSP = gData->backTrackStack;
gData->cursz = 0;
gData->backTrackCount = 0;
gData->backTrackLimit = 0;
gData->stateStackLimit = INITIAL_STATESTACK;
gData->stateStack = jsheap_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
if (!gData->stateStack)
goto bad;
gData->stateStackTop = 0;
gData->cx = cx;
gData->regexp = re;
gData->ok = TRUE;
result = jsheap_alloc(gData->pool, offsetof(REMatchState, parens) + re->parenCount * sizeof(RECapture));
if (!result)
goto bad;
for (i = 0; i < re->classCount; i++) {
if (!re->classList[i].converted &&
!ProcessCharSet(gData, &re->classList[i])) {
return NULL;
}
}
return result;
bad:
js_ReportOutOfScriptQuota(cx);
gData->ok = FALSE;
return NULL;
}
static void
js_DestroyRegExp(JSRegExp *re)
{
if (re->classList) {
UINT i;
for (i = 0; i < re->classCount; i++) {
if (re->classList[i].converted)
heap_free(re->classList[i].u.bits);
re->classList[i].u.bits = NULL;
}
heap_free(re->classList);
}
heap_free(re);
}
static JSRegExp *
js_NewRegExp(script_ctx_t *cx, BSTR str, UINT flags, BOOL flat)
{
JSRegExp *re;
jsheap_t *mark;
CompilerState state;
size_t resize;
jsbytecode *endPC;
UINT i;
size_t len;
re = NULL;
mark = jsheap_mark(&cx->tmp_heap);
len = SysStringLen(str);
state.context = cx;
state.cp = str;
if (!state.cp)
goto out;
state.cpbegin = state.cp;
state.cpend = state.cp + len;
state.flags = flags;
state.parenCount = 0;
state.classCount = 0;
state.progLength = 0;
state.treeDepth = 0;
state.classBitmapsMem = 0;
for (i = 0; i < CLASS_CACHE_SIZE; i++)
state.classCache[i].start = NULL;
if (len != 0 && flat) {
state.result = NewRENode(&state, REOP_FLAT);
if (!state.result)
goto out;
state.result->u.flat.chr = *state.cpbegin;
state.result->u.flat.length = len;
state.result->kid = (void *) state.cpbegin;
/* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
state.progLength += 1 + GetCompactIndexWidth(0)
+ GetCompactIndexWidth(len);
} else {
if (!ParseRegExp(&state))
goto out;
}
resize = offsetof(JSRegExp, program) + state.progLength + 1;
re = heap_alloc(resize);
if (!re)
goto out;
assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
re->classCount = state.classCount;
if (re->classCount) {
re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
if (!re->classList) {
js_DestroyRegExp(re);
re = NULL;
goto out;
}
for (i = 0; i < re->classCount; i++)
re->classList[i].converted = FALSE;
} else {
re->classList = NULL;
}
endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
if (!endPC) {
js_DestroyRegExp(re);
re = NULL;
goto out;
}
*endPC++ = REOP_END;
/*
* Check whether size was overestimated and shrink using realloc.
* This is safe since no pointers to newly parsed regexp or its parts
* besides re exist here.
*/
if ((size_t)(endPC - re->program) != state.progLength + 1) {
JSRegExp *tmp;
assert((size_t)(endPC - re->program) < state.progLength + 1);
resize = offsetof(JSRegExp, program) + (endPC - re->program);
tmp = heap_realloc(re, resize);
if (tmp)
re = tmp;
}
re->flags = flags;
re->parenCount = state.parenCount;
re->source = str;
out:
jsheap_clear(mark);
return re;
}
static inline RegExpInstance *regexp_from_vdisp(vdisp_t *vdisp)
{
return (RegExpInstance*)vdisp->u.jsdisp;
}
static void set_last_index(RegExpInstance *This, DWORD last_index)
{
This->last_index = last_index;
VariantClear(&This->last_index_var);
num_set_val(&This->last_index_var, last_index);
}
static HRESULT do_regexp_match_next(script_ctx_t *ctx, RegExpInstance *regexp, DWORD rem_flags,
const WCHAR *str, DWORD len, const WCHAR **cp, match_result_t **parens, DWORD *parens_size,
DWORD *parens_cnt, match_result_t *ret)
{
REMatchState *x, *result;
REGlobalData gData;
DWORD matchlen;
gData.cpbegin = *cp;
gData.cpend = str + len;
gData.start = *cp-str;
gData.skipped = 0;
gData.pool = &ctx->tmp_heap;
x = InitMatch(NULL, &gData, regexp->jsregexp, gData.cpend - gData.cpbegin);
if(!x) {
WARN("InitMatch failed\n");
return E_FAIL;
}
x->cp = *cp;
result = MatchRegExp(&gData, x);
if(!gData.ok) {
WARN("MatchRegExp failed\n");
return E_FAIL;
}
if(!result) {
if(rem_flags & REM_RESET_INDEX)
set_last_index(regexp, 0);
return S_FALSE;
}
if(parens) {
DWORD i;
if(regexp->jsregexp->parenCount > *parens_size) {
match_result_t *new_parens;
if(*parens)
new_parens = heap_realloc(*parens, sizeof(match_result_t)*regexp->jsregexp->parenCount);
else
new_parens = heap_alloc(sizeof(match_result_t)*regexp->jsregexp->parenCount);
if(!new_parens)
return E_OUTOFMEMORY;
*parens = new_parens;
}
*parens_cnt = regexp->jsregexp->parenCount;
for(i=0; i < regexp->jsregexp->parenCount; i++) {
if(result->parens[i].index == -1) {
(*parens)[i].str = NULL;
(*parens)[i].len = 0;
}else {
(*parens)[i].str = *cp + result->parens[i].index;
(*parens)[i].len = result->parens[i].length;
}
}
}
matchlen = (result->cp-*cp) - gData.skipped;
*cp = result->cp;
ret->str = result->cp-matchlen;
ret->len = matchlen;
set_last_index(regexp, result->cp-str);
return S_OK;
}
HRESULT regexp_match_next(script_ctx_t *ctx, DispatchEx *dispex, DWORD rem_flags, const WCHAR *str,
DWORD len, const WCHAR **cp, match_result_t **parens, DWORD *parens_size, DWORD *parens_cnt,
match_result_t *ret)
{
RegExpInstance *regexp = (RegExpInstance*)dispex;
jsheap_t *mark;
HRESULT hres;
if((rem_flags & REM_CHECK_GLOBAL) && !(regexp->jsregexp->flags & JSREG_GLOB))
return S_FALSE;
mark = jsheap_mark(&ctx->tmp_heap);
hres = do_regexp_match_next(ctx, regexp, rem_flags, str, len, cp, parens, parens_size, parens_cnt, ret);
jsheap_clear(mark);
return hres;
}
HRESULT regexp_match(script_ctx_t *ctx, DispatchEx *dispex, const WCHAR *str, DWORD len, BOOL gflag,
match_result_t **match_result, DWORD *result_cnt)
{
RegExpInstance *This = (RegExpInstance*)dispex;
match_result_t *ret = NULL, cres;
const WCHAR *cp = str;
DWORD i=0, ret_size = 0;
jsheap_t *mark;
HRESULT hres;
mark = jsheap_mark(&ctx->tmp_heap);
while(1) {
hres = do_regexp_match_next(ctx, This, 0, str, len, &cp, NULL, NULL, NULL, &cres);
if(hres == S_FALSE) {
hres = S_OK;
break;
}
if(FAILED(hres))
break;
if(ret_size == i) {
if(ret)
ret = heap_realloc(ret, (ret_size <<= 1) * sizeof(match_result_t));
else
ret = heap_alloc((ret_size=4) * sizeof(match_result_t));
if(!ret) {
hres = E_OUTOFMEMORY;
break;
}
}
ret[i++] = cres;
if(!gflag && !(This->jsregexp->flags & JSREG_GLOB)) {
hres = S_OK;
break;
}
}
jsheap_clear(mark);
if(FAILED(hres)) {
heap_free(ret);
return hres;
}
*match_result = ret;
*result_cnt = i;
return S_OK;
}
static HRESULT RegExp_source(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
TRACE("\n");
switch(flags) {
case DISPATCH_PROPERTYGET: {
RegExpInstance *This = regexp_from_vdisp(jsthis);
V_VT(retv) = VT_BSTR;
V_BSTR(retv) = SysAllocString(This->str);
if(!V_BSTR(retv))
return E_OUTOFMEMORY;
break;
}
default:
FIXME("Unimplemnted flags %x\n", flags);
return E_NOTIMPL;
}
return S_OK;
}
static HRESULT RegExp_global(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
FIXME("\n");
return E_NOTIMPL;
}
static HRESULT RegExp_ignoreCase(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
FIXME("\n");
return E_NOTIMPL;
}
static HRESULT RegExp_multiline(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
FIXME("\n");
return E_NOTIMPL;
}
static INT index_from_var(script_ctx_t *ctx, VARIANT *v)
{
jsexcept_t ei;
VARIANT num;
HRESULT hres;
memset(&ei, 0, sizeof(ei));
hres = to_number(ctx, v, &ei, &num);
if(FAILED(hres)) { /* FIXME: Move ignoring exceptions to to_promitive */
VariantClear(&ei.var);
return 0;
}
if(V_VT(&num) == VT_R8) {
DOUBLE d = floor(V_R8(&num));
return (DOUBLE)(INT)d == d ? d : 0;
}
return V_I4(&num);
}
static HRESULT RegExp_lastIndex(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
TRACE("\n");
switch(flags) {
case DISPATCH_PROPERTYGET: {
RegExpInstance *regexp = regexp_from_vdisp(jsthis);
V_VT(retv) = VT_EMPTY;
return VariantCopy(retv, &regexp->last_index_var);
}
case DISPATCH_PROPERTYPUT: {
RegExpInstance *regexp = regexp_from_vdisp(jsthis);
VARIANT *arg;
HRESULT hres;
arg = get_arg(dp,0);
hres = VariantCopy(&regexp->last_index_var, arg);
if(FAILED(hres))
return hres;
regexp->last_index = index_from_var(ctx, arg);
break;
}
default:
FIXME("unimplemented flags: %x\n", flags);
return E_NOTIMPL;
}
return S_OK;
}
static HRESULT RegExp_toString(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
FIXME("\n");
return E_NOTIMPL;
}
static HRESULT create_match_array(script_ctx_t *ctx, BSTR input, const match_result_t *result,
const match_result_t *parens, DWORD parens_cnt, jsexcept_t *ei, IDispatch **ret)
{
DispatchEx *array;
VARIANT var;
int i;
HRESULT hres = S_OK;
static const WCHAR indexW[] = {'i','n','d','e','x',0};
static const WCHAR inputW[] = {'i','n','p','u','t',0};
static const WCHAR zeroW[] = {'0',0};
hres = create_array(ctx, parens_cnt+1, &array);
if(FAILED(hres))
return hres;
for(i=0; i < parens_cnt; i++) {
V_VT(&var) = VT_BSTR;
V_BSTR(&var) = SysAllocStringLen(parens[i].str, parens[i].len);
if(!V_BSTR(&var)) {
hres = E_OUTOFMEMORY;
break;
}
hres = jsdisp_propput_idx(array, i+1, &var, ei, NULL/*FIXME*/);
SysFreeString(V_BSTR(&var));
if(FAILED(hres))
break;
}
while(SUCCEEDED(hres)) {
V_VT(&var) = VT_I4;
V_I4(&var) = result->str-input;
hres = jsdisp_propput_name(array, indexW, &var, ei, NULL/*FIXME*/);
if(FAILED(hres))
break;
V_VT(&var) = VT_BSTR;
V_BSTR(&var) = input;
hres = jsdisp_propput_name(array, inputW, &var, ei, NULL/*FIXME*/);
if(FAILED(hres))
break;
V_BSTR(&var) = SysAllocStringLen(result->str, result->len);
if(!V_BSTR(&var)) {
hres = E_OUTOFMEMORY;
break;
}
hres = jsdisp_propput_name(array, zeroW, &var, ei, NULL/*FIXME*/);
SysFreeString(V_BSTR(&var));
break;
}
if(FAILED(hres)) {
jsdisp_release(array);
return hres;
}
*ret = (IDispatch*)_IDispatchEx_(array);
return S_OK;
}
static HRESULT run_exec(script_ctx_t *ctx, vdisp_t *jsthis, VARIANT *arg, jsexcept_t *ei, BSTR *input,
match_result_t *match, match_result_t **parens, DWORD *parens_cnt, VARIANT_BOOL *ret)
{
RegExpInstance *regexp;
DWORD parens_size = 0, last_index = 0, length;
const WCHAR *cp;
BSTR string;
HRESULT hres;
if(!is_vclass(jsthis, JSCLASS_REGEXP)) {
FIXME("Not a RegExp\n");
return E_NOTIMPL;
}
regexp = regexp_from_vdisp(jsthis);
if(arg) {
hres = to_string(ctx, arg, ei, &string);
if(FAILED(hres))
return hres;
}else {
string = SysAllocStringLen(NULL, 0);
if(!string)
return E_OUTOFMEMORY;
}
if(regexp->last_index < 0) {
SysFreeString(string);
set_last_index(regexp, 0);
*ret = VARIANT_FALSE;
if(input) {
*input = NULL;
}
return S_OK;
}
length = SysStringLen(string);
if(regexp->jsregexp->flags & JSREG_GLOB)
last_index = regexp->last_index;
cp = string + last_index;
hres = regexp_match_next(ctx, &regexp->dispex, REM_RESET_INDEX, string, length, &cp, parens,
parens ? &parens_size : NULL, parens_cnt, match);
if(FAILED(hres)) {
SysFreeString(string);
return hres;
}
*ret = hres == S_OK ? VARIANT_TRUE : VARIANT_FALSE;
if(input) {
*input = string;
}else {
SysFreeString(string);
}
return S_OK;
}
static HRESULT RegExp_exec(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
match_result_t *parens = NULL, match;
DWORD parens_cnt = 0;
VARIANT_BOOL b;
BSTR string;
HRESULT hres;
TRACE("\n");
hres = run_exec(ctx, jsthis, arg_cnt(dp) ? get_arg(dp,0) : NULL, ei, &string, &match, &parens, &parens_cnt, &b);
if(FAILED(hres))
return hres;
if(retv) {
if(b) {
IDispatch *ret;
hres = create_match_array(ctx, string, &match, parens, parens_cnt, ei, &ret);
if(SUCCEEDED(hres)) {
V_VT(retv) = VT_DISPATCH;
V_DISPATCH(retv) = ret;
}
}else {
V_VT(retv) = VT_NULL;
}
}
heap_free(parens);
SysFreeString(string);
return hres;
}
static HRESULT RegExp_test(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
match_result_t match;
VARIANT_BOOL b;
HRESULT hres;
TRACE("\n");
hres = run_exec(ctx, jsthis, arg_cnt(dp) ? get_arg(dp,0) : NULL, ei, NULL, &match, NULL, NULL, &b);
if(FAILED(hres))
return hres;
if(retv) {
V_VT(retv) = VT_BOOL;
V_BOOL(retv) = b;
}
return S_OK;
}
static HRESULT RegExp_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
TRACE("\n");
switch(flags) {
case INVOKE_FUNC:
return throw_type_error(ctx, ei, IDS_NOT_FUNC, NULL);
default:
FIXME("unimplemented flags %x\n", flags);
return E_NOTIMPL;
}
return S_OK;
}
static void RegExp_destructor(DispatchEx *dispex)
{
RegExpInstance *This = (RegExpInstance*)dispex;
if(This->jsregexp)
js_DestroyRegExp(This->jsregexp);
VariantClear(&This->last_index_var);
SysFreeString(This->str);
heap_free(This);
}
static const builtin_prop_t RegExp_props[] = {
{execW, RegExp_exec, PROPF_METHOD|1},
{globalW, RegExp_global, 0},
{ignoreCaseW, RegExp_ignoreCase, 0},
{lastIndexW, RegExp_lastIndex, 0},
{multilineW, RegExp_multiline, 0},
{sourceW, RegExp_source, 0},
{testW, RegExp_test, PROPF_METHOD|1},
{toStringW, RegExp_toString, PROPF_METHOD}
};
static const builtin_info_t RegExp_info = {
JSCLASS_REGEXP,
{NULL, RegExp_value, 0},
sizeof(RegExp_props)/sizeof(*RegExp_props),
RegExp_props,
RegExp_destructor,
NULL
};
static HRESULT alloc_regexp(script_ctx_t *ctx, DispatchEx *object_prototype, RegExpInstance **ret)
{
RegExpInstance *regexp;
HRESULT hres;
regexp = heap_alloc_zero(sizeof(RegExpInstance));
if(!regexp)
return E_OUTOFMEMORY;
if(object_prototype)
hres = init_dispex(&regexp->dispex, ctx, &RegExp_info, object_prototype);
else
hres = init_dispex_from_constr(&regexp->dispex, ctx, &RegExp_info, ctx->regexp_constr);
if(FAILED(hres)) {
heap_free(regexp);
return hres;
}
*ret = regexp;
return S_OK;
}
HRESULT create_regexp(script_ctx_t *ctx, const WCHAR *exp, int len, DWORD flags, DispatchEx **ret)
{
RegExpInstance *regexp;
HRESULT hres;
TRACE("%s %x\n", debugstr_w(exp), flags);
hres = alloc_regexp(ctx, NULL, &regexp);
if(FAILED(hres))
return hres;
if(len == -1)
regexp->str = SysAllocString(exp);
else
regexp->str = SysAllocStringLen(exp, len);
if(!regexp->str) {
jsdisp_release(&regexp->dispex);
return E_OUTOFMEMORY;
}
regexp->jsregexp = js_NewRegExp(ctx, regexp->str, flags, FALSE);
if(!regexp->jsregexp) {
WARN("js_NewRegExp failed\n");
jsdisp_release(&regexp->dispex);
return E_FAIL;
}
V_VT(&regexp->last_index_var) = VT_I4;
V_I4(&regexp->last_index_var) = 0;
*ret = &regexp->dispex;
return S_OK;
}
HRESULT create_regexp_var(script_ctx_t *ctx, VARIANT *src_arg, VARIANT *flags_arg, DispatchEx **ret)
{
const WCHAR *opt = emptyW, *src;
DWORD flags;
HRESULT hres;
if(V_VT(src_arg) == VT_DISPATCH) {
DispatchEx *obj;
obj = iface_to_jsdisp((IUnknown*)V_DISPATCH(src_arg));
if(obj) {
if(is_class(obj, JSCLASS_REGEXP)) {
RegExpInstance *regexp = (RegExpInstance*)obj;
hres = create_regexp(ctx, regexp->str, -1, regexp->jsregexp->flags, ret);
jsdisp_release(obj);
return hres;
}
jsdisp_release(obj);
}
}
if(V_VT(src_arg) != VT_BSTR) {
FIXME("flags_arg = %s\n", debugstr_variant(flags_arg));
return E_NOTIMPL;
}
src = V_BSTR(src_arg);
if(flags_arg) {
if(V_VT(flags_arg) != VT_BSTR) {
FIXME("unimplemented for vt %d\n", V_VT(flags_arg));
return E_NOTIMPL;
}
opt = V_BSTR(flags_arg);
}
hres = parse_regexp_flags(opt, strlenW(opt), &flags);
if(FAILED(hres))
return hres;
return create_regexp(ctx, src, -1, flags, ret);
}
static HRESULT RegExpConstr_value(script_ctx_t *ctx, vdisp_t *jsthis, WORD flags, DISPPARAMS *dp,
VARIANT *retv, jsexcept_t *ei, IServiceProvider *sp)
{
TRACE("\n");
switch(flags) {
case DISPATCH_METHOD:
if(arg_cnt(dp)) {
VARIANT *arg = get_arg(dp,0);
if(V_VT(arg) == VT_DISPATCH) {
DispatchEx *jsdisp = iface_to_jsdisp((IUnknown*)V_DISPATCH(arg));
if(jsdisp) {
if(is_class(jsdisp, JSCLASS_REGEXP)) {
if(arg_cnt(dp) > 1 && V_VT(get_arg(dp,1)) != VT_EMPTY) {
jsdisp_release(jsdisp);
return throw_regexp_error(ctx, ei, IDS_REGEXP_SYNTAX_ERROR, NULL);
}
if(retv) {
V_VT(retv) = VT_DISPATCH;
V_DISPATCH(retv) = (IDispatch*)_IDispatchEx_(jsdisp);
}else {
jsdisp_release(jsdisp);
}
return S_OK;
}
jsdisp_release(jsdisp);
}
}
}
/* fall through */
case DISPATCH_CONSTRUCT: {
DispatchEx *ret;
HRESULT hres;
if(!arg_cnt(dp)) {
FIXME("no args\n");
return E_NOTIMPL;
}
hres = create_regexp_var(ctx, get_arg(dp,0), arg_cnt(dp) > 1 ? get_arg(dp,1) : NULL, &ret);
if(FAILED(hres))
return hres;
if(retv) {
V_VT(retv) = VT_DISPATCH;
V_DISPATCH(retv) = (IDispatch*)_IDispatchEx_(ret);
}else {
jsdisp_release(ret);
}
return S_OK;
}
default:
FIXME("unimplemented flags: %x\n", flags);
return E_NOTIMPL;
}
return S_OK;
}
HRESULT create_regexp_constr(script_ctx_t *ctx, DispatchEx *object_prototype, DispatchEx **ret)
{
RegExpInstance *regexp;
HRESULT hres;
static const WCHAR RegExpW[] = {'R','e','g','E','x','p',0};
hres = alloc_regexp(ctx, object_prototype, &regexp);
if(FAILED(hres))
return hres;
hres = create_builtin_function(ctx, RegExpConstr_value, RegExpW, NULL,
PROPF_CONSTR|2, &regexp->dispex, ret);
jsdisp_release(&regexp->dispex);
return hres;
}
HRESULT parse_regexp_flags(const WCHAR *str, DWORD str_len, DWORD *ret)
{
const WCHAR *p;
DWORD flags = 0;
for (p = str; p < str+str_len; p++) {
switch (*p) {
case 'g':
flags |= JSREG_GLOB;
break;
case 'i':
flags |= JSREG_FOLD;
break;
case 'm':
flags |= JSREG_MULTILINE;
break;
case 'y':
flags |= JSREG_STICKY;
break;
default:
WARN("wrong flag %c\n", *p);
return E_FAIL;
}
}
*ret = flags;
return S_OK;
}