/**********************************************************************
regcomp.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
* Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "regparse.h"
OnigAmbigType OnigDefaultAmbigFlag =
(ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE);
extern OnigAmbigType
onig_get_default_ambig_flag(void)
{
return OnigDefaultAmbigFlag;
}
extern int
onig_set_default_ambig_flag(OnigAmbigType ambig_flag)
{
OnigDefaultAmbigFlag = ambig_flag;
return 0;
}
#ifndef PLATFORM_UNALIGNED_WORD_ACCESS
static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
#endif
static UChar*
k_strdup(UChar* s, UChar* end)
{
int len = end - s;
if (len > 0) {
UChar* r = (UChar* )xmalloc(len + 1);
CHECK_NULL_RETURN(r);
xmemcpy(r, s, len);
r[len] = (UChar )0;
return r;
}
else return NULL;
}
/*
Caution: node should not be a string node.
(s and end member address break)
*/
static void
swap_node(Node* a, Node* b)
{
Node c;
c = *a; *a = *b; *b = c;
}
static OnigDistance
distance_add(OnigDistance d1, OnigDistance d2)
{
if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
return ONIG_INFINITE_DISTANCE;
else {
if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
else return ONIG_INFINITE_DISTANCE;
}
}
static OnigDistance
distance_multiply(OnigDistance d, int m)
{
if (m == 0) return 0;
if (d < ONIG_INFINITE_DISTANCE / m)
return d * m;
else
return ONIG_INFINITE_DISTANCE;
}
static int
bitset_is_empty(BitSetRef bs)
{
int i;
for (i = 0; i < BITSET_SIZE; i++) {
if (bs[i] != 0) return 0;
}
return 1;
}
#ifdef ONIG_DEBUG
static int
bitset_on_num(BitSetRef bs)
{
int i, n;
n = 0;
for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
if (BITSET_AT(bs, i)) n++;
}
return n;
}
#endif
extern int
onig_bbuf_init(BBuf* buf, int size)
{
buf->p = (UChar* )xmalloc(size);
if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
buf->alloc = size;
buf->used = 0;
return 0;
}
#ifdef USE_SUBEXP_CALL
static int
unset_addr_list_init(UnsetAddrList* uslist, int size)
{
UnsetAddr* p;
p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
uslist->num = 0;
uslist->alloc = size;
uslist->us = p;
return 0;
}
static void
unset_addr_list_end(UnsetAddrList* uslist)
{
if (IS_NOT_NULL(uslist->us))
xfree(uslist->us);
}
static int
unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
{
UnsetAddr* p;
int size;
if (uslist->num >= uslist->alloc) {
size = uslist->alloc * 2;
p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
uslist->alloc = size;
uslist->us = p;
}
uslist->us[uslist->num].offset = offset;
uslist->us[uslist->num].target = node;
uslist->num++;
return 0;
}
#endif /* USE_SUBEXP_CALL */
static int
add_opcode(regex_t* reg, int opcode)
{
BBUF_ADD1(reg, opcode);
return 0;
}
static int
add_rel_addr(regex_t* reg, int addr)
{
RelAddrType ra = (RelAddrType )addr;
BBUF_ADD(reg, &ra, SIZE_RELADDR);
return 0;
}
static int
add_abs_addr(regex_t* reg, int addr)
{
AbsAddrType ra = (AbsAddrType )addr;
BBUF_ADD(reg, &ra, SIZE_ABSADDR);
return 0;
}
static int
add_length(regex_t* reg, int len)
{
LengthType l = (LengthType )len;
BBUF_ADD(reg, &l, SIZE_LENGTH);
return 0;
}
static int
add_mem_num(regex_t* reg, int num)
{
MemNumType n = (MemNumType )num;
BBUF_ADD(reg, &n, SIZE_MEMNUM);
return 0;
}
static int
add_pointer(regex_t* reg, void* addr)
{
PointerType ptr = (PointerType )addr;
BBUF_ADD(reg, &ptr, SIZE_POINTER);
return 0;
}
static int
add_option(regex_t* reg, OnigOptionType option)
{
BBUF_ADD(reg, &option, SIZE_OPTION);
return 0;
}
static int
add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
{
int r;
r = add_opcode(reg, opcode);
if (r) return r;
r = add_rel_addr(reg, addr);
return r;
}
static int
add_bytes(regex_t* reg, UChar* bytes, int len)
{
BBUF_ADD(reg, bytes, len);
return 0;
}
static int
add_bitset(regex_t* reg, BitSetRef bs)
{
BBUF_ADD(reg, bs, SIZE_BITSET);
return 0;
}
static int
add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
{
int r;
r = add_opcode(reg, opcode);
if (r) return r;
r = add_option(reg, option);
return r;
}
static int compile_length_tree(Node* node, regex_t* reg);
static int compile_tree(Node* node, regex_t* reg);
#define IS_NEED_STR_LEN_OP_EXACT(op) \
((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
(op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
static int
select_str_opcode(int mb_len, int str_len, int ignore_case)
{
int op;
if (ignore_case) {
switch (str_len) {
case 1: op = OP_EXACT1_IC; break;
default: op = OP_EXACTN_IC; break;
}
}
else {
switch (mb_len) {
case 1:
switch (str_len) {
case 1: op = OP_EXACT1; break;
case 2: op = OP_EXACT2; break;
case 3: op = OP_EXACT3; break;
case 4: op = OP_EXACT4; break;
case 5: op = OP_EXACT5; break;
default: op = OP_EXACTN; break;
}
break;
case 2:
switch (str_len) {
case 1: op = OP_EXACTMB2N1; break;
case 2: op = OP_EXACTMB2N2; break;
case 3: op = OP_EXACTMB2N3; break;
default: op = OP_EXACTMB2N; break;
}
break;
case 3:
op = OP_EXACTMB3N;
break;
default:
op = OP_EXACTMBN;
break;
}
}
return op;
}
static int
compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
{
int r;
int saved_num_null_check = reg->num_null_check;
if (empty_info != 0) {
r = add_opcode(reg, OP_NULL_CHECK_START);
if (r) return r;
r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
if (r) return r;
reg->num_null_check++;
}
r = compile_tree(node, reg);
if (r) return r;
if (empty_info != 0) {
if (empty_info == NQ_TARGET_IS_EMPTY)
r = add_opcode(reg, OP_NULL_CHECK_END);
else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
if (r) return r;
r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
}
return r;
}
#ifdef USE_SUBEXP_CALL
static int
compile_call(CallNode* node, regex_t* reg)
{
int r;
r = add_opcode(reg, OP_CALL);
if (r) return r;
r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
node->target);
if (r) return r;
r = add_abs_addr(reg, 0 /*dummy addr.*/);
return r;
}
#endif
static int
compile_tree_n_times(Node* node, int n, regex_t* reg)
{
int i, r;
for (i = 0; i < n; i++) {
r = compile_tree(node, reg);
if (r) return r;
}
return 0;
}
static int
add_compile_string_length(UChar* s, int mb_len, int str_len,
regex_t* reg, int ignore_case)
{
int len;
int op = select_str_opcode(mb_len, str_len, ignore_case);
len = SIZE_OPCODE;
if (op == OP_EXACTMBN) len += SIZE_LENGTH;
if (IS_NEED_STR_LEN_OP_EXACT(op))
len += SIZE_LENGTH;
len += mb_len * str_len;
return len;
}
static int
add_compile_string(UChar* s, int mb_len, int str_len,
regex_t* reg, int ignore_case)
{
int op = select_str_opcode(mb_len, str_len, ignore_case);
add_opcode(reg, op);
if (op == OP_EXACTMBN)
add_length(reg, mb_len);
if (IS_NEED_STR_LEN_OP_EXACT(op)) {
if (op == OP_EXACTN_IC)
add_length(reg, mb_len * str_len);
else
add_length(reg, str_len);
}
add_bytes(reg, s, mb_len * str_len);
return 0;
}
static int
compile_length_string_node(Node* node, regex_t* reg)
{
int rlen, r, len, prev_len, slen, ambig;
OnigEncoding enc = reg->enc;
UChar *p, *prev;
StrNode* sn;
sn = &(NSTRING(node));
if (sn->end <= sn->s)
return 0;
ambig = NSTRING_IS_AMBIG(node);
p = prev = sn->s;
prev_len = enc_len(enc, p);
p += prev_len;
slen = 1;
rlen = 0;
for (; p < sn->end; ) {
len = enc_len(enc, p);
if (len == prev_len) {
slen++;
}
else {
r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
rlen += r;
prev = p;
slen = 1;
prev_len = len;
}
p += len;
}
r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
rlen += r;
return rlen;
}
static int
compile_length_string_raw_node(StrNode* sn, regex_t* reg)
{
if (sn->end <= sn->s)
return 0;
return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
}
static int
compile_string_node(Node* node, regex_t* reg)
{
int r, len, prev_len, slen, ambig;
OnigEncoding enc = reg->enc;
UChar *p, *prev, *end;
StrNode* sn;
sn = &(NSTRING(node));
if (sn->end <= sn->s)
return 0;
end = sn->end;
ambig = NSTRING_IS_AMBIG(node);
p = prev = sn->s;
prev_len = enc_len(enc, p);
p += prev_len;
slen = 1;
for (; p < end; ) {
len = enc_len(enc, p);
if (len == prev_len) {
slen++;
}
else {
r = add_compile_string(prev, prev_len, slen, reg, ambig);
if (r) return r;
prev = p;
slen = 1;
prev_len = len;
}
p += len;
}
return add_compile_string(prev, prev_len, slen, reg, ambig);
}
static int
compile_string_raw_node(StrNode* sn, regex_t* reg)
{
if (sn->end <= sn->s)
return 0;
return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
}
static int
add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
{
#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
add_length(reg, mbuf->used);
return add_bytes(reg, mbuf->p, mbuf->used);
#else
int r, pad_size;
UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
GET_ALIGNMENT_PAD_SIZE(p, pad_size);
add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
r = add_bytes(reg, mbuf->p, mbuf->used);
/* padding for return value from compile_length_cclass_node() to be fix. */
pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
return r;
#endif
}
static int
compile_length_cclass_node(CClassNode* cc, regex_t* reg)
{
int len;
if (IS_CCLASS_SHARE(cc)) {
len = SIZE_OPCODE + SIZE_POINTER;
return len;
}
if (IS_NULL(cc->mbuf)) {
len = SIZE_OPCODE + SIZE_BITSET;
}
else {
if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
len = SIZE_OPCODE;
}
else {
len = SIZE_OPCODE + SIZE_BITSET;
}
#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
len += SIZE_LENGTH + cc->mbuf->used;
#else
len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
#endif
}
return len;
}
static int
compile_cclass_node(CClassNode* cc, regex_t* reg)
{
int r;
if (IS_CCLASS_SHARE(cc)) {
add_opcode(reg, OP_CCLASS_NODE);
r = add_pointer(reg, cc);
return r;
}
if (IS_NULL(cc->mbuf)) {
if (IS_CCLASS_NOT(cc))
add_opcode(reg, OP_CCLASS_NOT);
else
add_opcode(reg, OP_CCLASS);
r = add_bitset(reg, cc->bs);
}
else {
if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
if (IS_CCLASS_NOT(cc))
add_opcode(reg, OP_CCLASS_MB_NOT);
else
add_opcode(reg, OP_CCLASS_MB);
r = add_multi_byte_cclass(cc->mbuf, reg);
}
else {
if (IS_CCLASS_NOT(cc))
add_opcode(reg, OP_CCLASS_MIX_NOT);
else
add_opcode(reg, OP_CCLASS_MIX);
r = add_bitset(reg, cc->bs);
if (r) return r;
r = add_multi_byte_cclass(cc->mbuf, reg);
}
}
return r;
}
static int
entry_repeat_range(regex_t* reg, int id, int lower, int upper)
{
#define REPEAT_RANGE_ALLOC 4
OnigRepeatRange* p;
if (reg->repeat_range_alloc == 0) {
p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
reg->repeat_range = p;
reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
}
else if (reg->repeat_range_alloc <= id) {
int n;
n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
sizeof(OnigRepeatRange) * n);
CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
reg->repeat_range = p;
reg->repeat_range_alloc = n;
}
else {
p = reg->repeat_range;
}
p[id].lower = lower;
p[id].upper = upper;
return 0;
}
static int
compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info,
regex_t* reg)
{
int r;
int num_repeat = reg->num_repeat;
r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
if (r) return r;
r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
reg->num_repeat++;
if (r) return r;
r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
if (r) return r;
r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
if (r) return r;
r = compile_tree_empty_check(qn->target, reg, empty_info);
if (r) return r;
if (
#ifdef USE_SUBEXP_CALL
reg->num_call > 0 ||
#endif
IS_QUALIFIER_IN_REPEAT(qn)) {
r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
}
else {
r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
}
if (r) return r;
r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
return r;
}
#define QUALIFIER_EXPAND_LIMIT_SIZE 50
static int
compile_length_qualifier_node(QualifierNode* qn, regex_t* reg)
{
int len, mod_tlen;
int infinite = IS_REPEAT_INFINITE(qn->upper);
int empty_info = qn->target_empty_info;
int tlen = compile_length_tree(qn->target, reg);
if (tlen < 0) return tlen;
/* anychar repeat */
if (NTYPE(qn->target) == N_ANYCHAR) {
if (qn->greedy && infinite) {
if (IS_NOT_NULL(qn->next_head_exact))
return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
else
return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
}
}
if (empty_info != 0)
mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
else
mod_tlen = tlen;
if (infinite &&
(qn->lower <= 1 || tlen * qn->lower <= QUALIFIER_EXPAND_LIMIT_SIZE)) {
if (qn->lower == 1 && tlen > QUALIFIER_EXPAND_LIMIT_SIZE) {
len = SIZE_OP_JUMP;
}
else {
len = tlen * qn->lower;
}
if (qn->greedy) {
if (IS_NOT_NULL(qn->head_exact))
len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
else if (IS_NOT_NULL(qn->next_head_exact))
len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
else
len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
}
else
len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
}
else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
len = SIZE_OP_JUMP + tlen;
}
else if (!infinite && qn->greedy &&
(qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
<= QUALIFIER_EXPAND_LIMIT_SIZE)) {
len = tlen * qn->lower;
len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
}
else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
}
else {
len = SIZE_OP_REPEAT_INC
+ mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
}
return len;
}
static int
is_anychar_star_qualifier(QualifierNode* qn)
{
if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
NTYPE(qn->target) == N_ANYCHAR)
return 1;
else
return 0;
}
static int
compile_qualifier_node(QualifierNode* qn, regex_t* reg)
{
int i, r, mod_tlen;
int infinite = IS_REPEAT_INFINITE(qn->upper);
int empty_info = qn->target_empty_info;
int tlen = compile_length_tree(qn->target, reg);
if (tlen < 0) return tlen;
if (is_anychar_star_qualifier(qn)) {
r = compile_tree_n_times(qn->target, qn->lower, reg);
if (r) return r;
if (IS_NOT_NULL(qn->next_head_exact)) {
if (IS_MULTILINE(reg->options))
r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
else
r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
if (r) return r;
return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
}
else {
if (IS_MULTILINE(reg->options))
return add_opcode(reg, OP_ANYCHAR_ML_STAR);
else
return add_opcode(reg, OP_ANYCHAR_STAR);
}
}
if (empty_info != 0)
mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
else
mod_tlen = tlen;
if (infinite &&
(qn->lower <= 1 || tlen * qn->lower <= QUALIFIER_EXPAND_LIMIT_SIZE)) {
if (qn->lower == 1 && tlen > QUALIFIER_EXPAND_LIMIT_SIZE) {
if (qn->greedy) {
if (IS_NOT_NULL(qn->head_exact))
r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
else if (IS_NOT_NULL(qn->next_head_exact))
r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
else
r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
}
else {
r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
}
if (r) return r;
}
else {
r = compile_tree_n_times(qn->target, qn->lower, reg);
if (r) return r;
}
if (qn->greedy) {
if (IS_NOT_NULL(qn->head_exact)) {
r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
mod_tlen + SIZE_OP_JUMP);
if (r) return r;
add_bytes(reg, NSTRING(qn->head_exact).s, 1);
r = compile_tree_empty_check(qn->target, reg, empty_info);
if (r) return r;
r = add_opcode_rel_addr(reg, OP_JUMP,
-(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
}
else if (IS_NOT_NULL(qn->next_head_exact)) {
r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
mod_tlen + SIZE_OP_JUMP);
if (r) return r;
add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
r = compile_tree_empty_check(qn->target, reg, empty_info);
if (r) return r;
r = add_opcode_rel_addr(reg, OP_JUMP,
-(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
}
else {
r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
if (r) return r;
r = compile_tree_empty_check(qn->target, reg, empty_info);
if (r) return r;
r = add_opcode_rel_addr(reg, OP_JUMP,
-(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
}
}
else {
r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
if (r) return r;
r = compile_tree_empty_check(qn->target, reg, empty_info);
if (r) return r;
r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
}
}
else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
if (r) return r;
r = compile_tree(qn->target, reg);
}
else if (!infinite && qn->greedy &&
(qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
<= QUALIFIER_EXPAND_LIMIT_SIZE)) {
int n = qn->upper - qn->lower;
r = compile_tree_n_times(qn->target, qn->lower, reg);
if (r) return r;
for (i = 0; i < n; i++) {
r = add_opcode_rel_addr(reg, OP_PUSH,
(n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
if (r) return r;
r = compile_tree(qn->target, reg);
if (r) return r;
}
}
else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
if (r) return r;
r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
if (r) return r;
r = compile_tree(qn->target, reg);
}
else {
r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
}
return r;
}
static int
compile_length_option_node(EffectNode* node, regex_t* reg)
{
int tlen;
OnigOptionType prev = reg->options;
reg->options = node->option;
tlen = compile_length_tree(node->target, reg);
reg->options = prev;
if (tlen < 0) return tlen;
if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
+ tlen + SIZE_OP_SET_OPTION;
}
else
return tlen;
}
static int
compile_option_node(EffectNode* node, regex_t* reg)
{
int r;
OnigOptionType prev = reg->options;
if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
if (r) return r;
r = add_opcode_option(reg, OP_SET_OPTION, prev);
if (r) return r;
r = add_opcode(reg, OP_FAIL);
if (r) return r;
}
reg->options = node->option;
r = compile_tree(node->target, reg);
reg->options = prev;
if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
if (r) return r;
r = add_opcode_option(reg, OP_SET_OPTION, prev);
}
return r;
}
static int
compile_length_effect_node(EffectNode* node, regex_t* reg)
{
int len;
int tlen;
if (node->type == EFFECT_OPTION)
return compile_length_option_node(node, reg);
if (node->target) {
tlen = compile_length_tree(node->target, reg);
if (tlen < 0) return tlen;
}
else
tlen = 0;
switch (node->type) {
case EFFECT_MEMORY:
#ifdef USE_SUBEXP_CALL
if (IS_EFFECT_CALLED(node)) {
len = SIZE_OP_MEMORY_START_PUSH + tlen
+ SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
len += (IS_EFFECT_RECURSION(node)
? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
else
len += (IS_EFFECT_RECURSION(node)
? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
}
else
#endif
{
if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
len = SIZE_OP_MEMORY_START_PUSH;
else
len = SIZE_OP_MEMORY_START;
len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
}
break;
case EFFECT_STOP_BACKTRACK:
if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) {
QualifierNode* qn = &NQUALIFIER(node->target);
tlen = compile_length_tree(qn->target, reg);
if (tlen < 0) return tlen;
len = tlen * qn->lower
+ SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
}
else {
len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
}
break;
default:
return ONIGERR_TYPE_BUG;
break;
}
return len;
}
static int get_char_length_tree(Node* node, regex_t* reg, int* len);
static int
compile_effect_node(EffectNode* node, regex_t* reg)
{
int r, len;
if (node->type == EFFECT_OPTION)
return compile_option_node(node, reg);
switch (node->type) {
case EFFECT_MEMORY:
#ifdef USE_SUBEXP_CALL
if (IS_EFFECT_CALLED(node)) {
r = add_opcode(reg, OP_CALL);
if (r) return r;
node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
node->state |= NST_ADDR_FIXED;
r = add_abs_addr(reg, (int )node->call_addr);
if (r) return r;
len = compile_length_tree(node->target, reg);
len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
len += (IS_EFFECT_RECURSION(node)
? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
else
len += (IS_EFFECT_RECURSION(node)
? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
r = add_opcode_rel_addr(reg, OP_JUMP, len);
if (r) return r;
}
#endif
if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
r = add_opcode(reg, OP_MEMORY_START_PUSH);
else
r = add_opcode(reg, OP_MEMORY_START);
if (r) return r;
r = add_mem_num(reg, node->regnum);
if (r) return r;
r = compile_tree(node->target, reg);
if (r) return r;
#ifdef USE_SUBEXP_CALL
if (IS_EFFECT_CALLED(node)) {
if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
r = add_opcode(reg, (IS_EFFECT_RECURSION(node)
? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
else
r = add_opcode(reg, (IS_EFFECT_RECURSION(node)
? OP_MEMORY_END_REC : OP_MEMORY_END));
if (r) return r;
r = add_mem_num(reg, node->regnum);
if (r) return r;
r = add_opcode(reg, OP_RETURN);
}
else
#endif
{
if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
r = add_opcode(reg, OP_MEMORY_END_PUSH);
else
r = add_opcode(reg, OP_MEMORY_END);
if (r) return r;
r = add_mem_num(reg, node->regnum);
}
break;
case EFFECT_STOP_BACKTRACK:
if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) {
QualifierNode* qn = &NQUALIFIER(node->target);
r = compile_tree_n_times(qn->target, qn->lower, reg);
if (r) return r;
len = compile_length_tree(qn->target, reg);
if (len < 0) return len;
r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
if (r) return r;
r = compile_tree(qn->target, reg);
if (r) return r;
r = add_opcode(reg, OP_POP);
if (r) return r;
r = add_opcode_rel_addr(reg, OP_JUMP,
-((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
}
else {
r = add_opcode(reg, OP_PUSH_STOP_BT);
if (r) return r;
r = compile_tree(node->target, reg);
if (r) return r;
r = add_opcode(reg, OP_POP_STOP_BT);
}
break;
default:
return ONIGERR_TYPE_BUG;
break;
}
return r;
}
static int
compile_length_anchor_node(AnchorNode* node, regex_t* reg)
{
int len;
int tlen = 0;
if (node->target) {
tlen = compile_length_tree(node->target, reg);
if (tlen < 0) return tlen;
}
switch (node->type) {
case ANCHOR_PREC_READ:
len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
break;
case ANCHOR_PREC_READ_NOT:
len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
break;
case ANCHOR_LOOK_BEHIND:
len = SIZE_OP_LOOK_BEHIND + tlen;
break;
case ANCHOR_LOOK_BEHIND_NOT:
len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
break;
default:
len = SIZE_OPCODE;
break;
}
return len;
}
static int
compile_anchor_node(AnchorNode* node, regex_t* reg)
{
int r, len;
switch (node->type) {
case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
#ifdef USE_WORD_BEGIN_END
case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
#endif
case ANCHOR_PREC_READ:
r = add_opcode(reg, OP_PUSH_POS);
if (r) return r;
r = compile_tree(node->target, reg);
if (r) return r;
r = add_opcode(reg, OP_POP_POS);
break;
case ANCHOR_PREC_READ_NOT:
len = compile_length_tree(node->target, reg);
if (len < 0) return len;
r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
if (r) return r;
r = compile_tree(node->target, reg);
if (r) return r;
r = add_opcode(reg, OP_FAIL_POS);
break;
case ANCHOR_LOOK_BEHIND:
{
int n;
r = add_opcode(reg, OP_LOOK_BEHIND);
if (r) return r;
if (node->char_len < 0) {
r = get_char_length_tree(node->target, reg, &n);
if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
}
else
n = node->char_len;
r = add_length(reg, n);
if (r) return r;
r = compile_tree(node->target, reg);
}
break;
case ANCHOR_LOOK_BEHIND_NOT:
{
int n;
len = compile_length_tree(node->target, reg);
r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
if (r) return r;
if (node->char_len < 0) {
r = get_char_length_tree(node->target, reg, &n);
if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
}
else
n = node->char_len;
r = add_length(reg, n);
if (r) return r;
r = compile_tree(node->target, reg);
if (r) return r;
r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
}
break;
default:
return ONIGERR_TYPE_BUG;
break;
}
return r;
}
static int
compile_length_tree(Node* node, regex_t* reg)
{
int len, type, r;
type = NTYPE(node);
switch (type) {
case N_LIST:
len = 0;
do {
r = compile_length_tree(NCONS(node).left, reg);
if (r < 0) return r;
len += r;
} while (IS_NOT_NULL(node = NCONS(node).right));
r = len;
break;
case N_ALT:
{
int n;
n = r = 0;
do {
r += compile_length_tree(NCONS(node).left, reg);
n++;
} while (IS_NOT_NULL(node = NCONS(node).right));
r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
}
break;
case N_STRING:
if (NSTRING_IS_RAW(node))
r = compile_length_string_raw_node(&(NSTRING(node)), reg);
else
r = compile_length_string_node(node, reg);
break;
case N_CCLASS:
r = compile_length_cclass_node(&(NCCLASS(node)), reg);
break;
case N_CTYPE:
case N_ANYCHAR:
r = SIZE_OPCODE;
break;
case N_BACKREF:
{
BackrefNode* br = &(NBACKREF(node));
if (br->back_num == 1) {
r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 3)
? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
}
else {
r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
}
}
break;
#ifdef USE_SUBEXP_CALL
case N_CALL:
r = SIZE_OP_CALL;
break;
#endif
case N_QUALIFIER:
r = compile_length_qualifier_node(&(NQUALIFIER(node)), reg);
break;
case N_EFFECT:
r = compile_length_effect_node(&NEFFECT(node), reg);
break;
case N_ANCHOR:
r = compile_length_anchor_node(&(NANCHOR(node)), reg);
break;
default:
return ONIGERR_TYPE_BUG;
break;
}
return r;
}
static int
compile_tree(Node* node, regex_t* reg)
{
int n, type, len, pos, r = 0;
type = NTYPE(node);
switch (type) {
case N_LIST:
do {
r = compile_tree(NCONS(node).left, reg);
} while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
break;
case N_ALT:
{
Node* x = node;
len = 0;
do {
len += compile_length_tree(NCONS(x).left, reg);
if (NCONS(x).right != NULL) {
len += SIZE_OP_PUSH + SIZE_OP_JUMP;
}
} while (IS_NOT_NULL(x = NCONS(x).right));
pos = reg->used + len; /* goal position */
do {
len = compile_length_tree(NCONS(node).left, reg);
if (IS_NOT_NULL(NCONS(node).right)) {
r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
if (r) break;
}
r = compile_tree(NCONS(node).left, reg);
if (r) break;
if (IS_NOT_NULL(NCONS(node).right)) {
len = pos - (reg->used + SIZE_OP_JUMP);
r = add_opcode_rel_addr(reg, OP_JUMP, len);
if (r) break;
}
} while (IS_NOT_NULL(node = NCONS(node).right));
}
break;
case N_STRING:
if (NSTRING_IS_RAW(node))
r = compile_string_raw_node(&(NSTRING(node)), reg);
else
r = compile_string_node(node, reg);
break;
case N_CCLASS:
r = compile_cclass_node(&(NCCLASS(node)), reg);
break;
case N_CTYPE:
{
int op;
switch (NCTYPE(node).type) {
case CTYPE_WORD: op = OP_WORD; break;
case CTYPE_NOT_WORD: op = OP_NOT_WORD; break;
default:
return ONIGERR_TYPE_BUG;
break;
}
r = add_opcode(reg, op);
}
break;
case N_ANYCHAR:
if (IS_MULTILINE(reg->options))
r = add_opcode(reg, OP_ANYCHAR_ML);
else
r = add_opcode(reg, OP_ANYCHAR);
break;
case N_BACKREF:
{
int i;
BackrefNode* br = &(NBACKREF(node));
if (br->back_num == 1) {
n = br->back_static[0];
if (IS_IGNORECASE(reg->options)) {
r = add_opcode(reg, OP_BACKREFN_IC);
if (r) return r;
r = add_mem_num(reg, n);
}
else {
switch (n) {
case 1: r = add_opcode(reg, OP_BACKREF1); break;
case 2: r = add_opcode(reg, OP_BACKREF2); break;
case 3: r = add_opcode(reg, OP_BACKREF3); break;
default:
r = add_opcode(reg, OP_BACKREFN);
if (r) return r;
r = add_mem_num(reg, n);
break;
}
}
}
else {
int* p;
if (IS_IGNORECASE(reg->options)) {
add_opcode(reg, OP_BACKREF_MULTI_IC);
}
else {
add_opcode(reg, OP_BACKREF_MULTI);
}
if (r) return r;
add_length(reg, br->back_num);
if (r) return r;
p = BACKREFS_P(br);
for (i = br->back_num - 1; i >= 0; i--) {
r = add_mem_num(reg, p[i]);
if (r) return r;
}
}
}
break;
#ifdef USE_SUBEXP_CALL
case N_CALL:
r = compile_call(&(NCALL(node)), reg);
break;
#endif
case N_QUALIFIER:
r = compile_qualifier_node(&(NQUALIFIER(node)), reg);
break;
case N_EFFECT:
r = compile_effect_node(&NEFFECT(node), reg);
break;
case N_ANCHOR:
r = compile_anchor_node(&(NANCHOR(node)), reg);
break;
default:
#ifdef ONIG_DEBUG
fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
#endif
break;
}
return r;
}
#ifdef USE_NAMED_GROUP
static int
noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
{
int r = 0;
Node* node = *plink;
switch (NTYPE(node)) {
case N_LIST:
case N_ALT:
do {
r = noname_disable_map(&(NCONS(node).left), map, counter);
} while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
break;
case N_QUALIFIER:
{
Node** ptarget = &(NQUALIFIER(node).target);
Node* old = *ptarget;
r = noname_disable_map(ptarget, map, counter);
if (*ptarget != old && NTYPE(*ptarget) == N_QUALIFIER) {
onig_reduce_nested_qualifier(node, *ptarget);
}
}
break;
case N_EFFECT:
{
EffectNode* en = &(NEFFECT(node));
if (en->type == EFFECT_MEMORY) {
if (IS_EFFECT_NAMED_GROUP(en)) {
(*counter)++;
map[en->regnum].new_val = *counter;
en->regnum = *counter;
r = noname_disable_map(&(en->target), map, counter);
}
else {
*plink = en->target;
en->target = NULL_NODE;
onig_node_free(node);
r = noname_disable_map(plink, map, counter);
}
}
else
r = noname_disable_map(&(en->target), map, counter);
}
break;
default:
break;
}
return r;
}
static int
renumber_node_backref(Node* node, GroupNumRemap* map)
{
int i, pos, n, old_num;
int *backs;
BackrefNode* bn = &(NBACKREF(node));
if (! IS_BACKREF_NAME_REF(bn))
return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
old_num = bn->back_num;
if (IS_NULL(bn->back_dynamic))
backs = bn->back_static;
else
backs = bn->back_dynamic;
for (i = 0, pos = 0; i < old_num; i++) {
n = map[backs[i]].new_val;
if (n > 0) {
backs[pos] = n;
pos++;
}
}
bn->back_num = pos;
return 0;
}
static int
renumber_by_map(Node* node, GroupNumRemap* map)
{
int r = 0;
switch (NTYPE(node)) {
case N_LIST:
case N_ALT:
do {
r = renumber_by_map(NCONS(node).left, map);
} while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
break;
case N_QUALIFIER:
r = renumber_by_map(NQUALIFIER(node).target, map);
break;
case N_EFFECT:
r = renumber_by_map(NEFFECT(node).target, map);
break;
case N_BACKREF:
r = renumber_node_backref(node, map);
break;
default:
break;
}
return r;
}
static int
numbered_ref_check(Node* node)
{
int r = 0;
switch (NTYPE(node)) {
case N_LIST:
case N_ALT:
do {
r = numbered_ref_check(NCONS(node).left);
} while (r == 0 && IS_NOT_NULL(node = NCONS(node).right));
break;
case N_QUALIFIER:
r = numbered_ref_check(NQUALIFIER(node).target);
break;
case N_EFFECT:
r = numbered_ref_check(NEFFECT(node).target);
break;
case N_BACKREF:
if (! IS_BACKREF_NAME_REF(&(NBACKREF(node))))
return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
break;
default:
break;
}
return r;
}
static int
disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
{
int r, i, pos, counter;
BitStatusType loc;
GroupNumRemap* map;
map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
CHECK_NULL_RETURN_VAL(map, ONIGERR_MEMORY);
for (i = 1; i <= env->num_mem; i++) {
map[i].new_val = 0;
}
counter = 0;
r = noname_disable_map(root, map, &counter);
if (r != 0) return r;
r = renumber_by_map(*root, map);
if (r != 0) return r;
for (i = 1, pos = 1; i <= env->num_mem; i++) {
if (map[i].new_val > 0) {
SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
pos++;
}
}
loc = env->capture_history;
BIT_STATUS_CLEAR(env->capture_history);
for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
if (BIT_STATUS_AT(loc, i)) {
BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
}
}
env->num_mem = env->num_named;
reg->num_mem = env->num_named;
return onig_renumber_name_table(reg, map);
}
#endif /* USE_NAMED_GROUP */
#ifdef USE_SUBEXP_CALL
static int
unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
{
int i, offset;
EffectNode* en;
AbsAddrType addr;
for (i = 0; i < uslist->num; i++) {
en = &(NEFFECT(uslist->us[i].target));
if (! IS_EFFECT_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
addr = en->call_addr;
offset = uslist->us[i].offset;
BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
}
return 0;
}
#endif
#ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK
static int
qualifiers_memory_node_info(Node* node)
{
int r = 0;
switch (NTYPE(node)) {
|