Update. 2004-12-13 Paolo Bonzini <bonzini@gnu.org> Separate parsing and creation of the NFA. Avoided recursion on the (very unbalanced) parse tree. [BZ #611] * posix/regcomp.c (struct subexp_optimize, analyze_tree, calc_epsdest, re_dfa_add_tree_node, mark_opt_subexp_iter): Removed. (optimize_subexps, duplicate_tree, calc_first, calc_next, mark_opt_subexp): Rewritten. (preorder, postorder, lower_subexps, lower_subexp, link_nfa_nodes, create_token_tree, free_tree, free_token): New. (analyze): Accept a regex_t *. Invoke the passes via the preorder and postorder generic visitors. Do not initialize the fields in the re_dfa_t that represent the transitions. (free_dfa_content): Use free_token. (re_compile_internal): Analyze before UTF-8 optimizations. Do not include optimization of subexpressions. (create_initial_state): Fetch the DFA node index from the first node's bin_tree_t *. (optimize_utf8): Abort on unexpected nodes, including OP_DUP_QUESTION. Return on COMPLEX_BRACKET. (duplicate_node_closure): Fix comment. (duplicate_node): Do not initialize the fields in the re_dfa_t that represent the transitions. (calc_eclosure, calc_inveclosure): Do not handle OP_DELETED_SUBEXP. (create_tree): Remove final argument. All callers adjusted. Rewritten to use create_token_tree. (parse_reg_exp, parse_branch, parse_expression, parse_bracket_exp, build_charclass_op): Use create_tree or create_token_tree instead of re_dfa_add_tree_node. (parse_dup_op): Likewise. Also free the tree using free_tree for "<re>{0}", and lower OP_DUP_QUESTION to OP_ALT: "a?" is equivalent to "a|". Adjust invocation of mark_opt_subexp. (parse_sub_exp): Create a single SUBEXP node. * posix/regex_internal.c (re_dfa_add_node): Remove last parameter, always perform as if it was 1. Do not initialize OPT_SUBEXP and DUPLICATED, and initialize the DFA fields representing the transitions. * posix/regex_internal.h (re_dfa_add_node): Adjust prototype. 
(re_token_type_t): Move OP_DUP_PLUS and OP_DUP_QUESTION to the tokens section. Add a tree-only code SUBEXP. Remove OP_DELETED_SUBEXP. (bin_tree_t): Include a full re_token_t for TOKEN. Turn FIRST and NEXT into pointers to trees. Remove ECLOSURE. 2004-12-28 Paolo Bonzini <bonzini@gnu.org> [BZ #605] * posix/regcomp.c (parse_bracket_exp): Do not modify DFA nodes that were already created. * posix/regex_internal.c (re_dfa_add_node): Set accept_mb field in the token if needed. (create_ci_newstate, create_cd_newstate): Set accept_mb field from the tokens' field. * posix/regex_internal.h (re_token_t): Add accept_mb field. (ACCEPT_MB_NODE): Removed. * posix/regexec.c (proceed_next_node, transit_state_mb, build_sifted_states, check_arrival_add_next_nodes): Use accept_mb instead of ACCEPT_MB_NODE.
This commit is contained in:
parent
629311b74a
commit
02f3550c8b
58
ChangeLog
58
ChangeLog
@ -1,3 +1,61 @@
|
||||
2004-12-13 Paolo Bonzini <bonzini@gnu.org>
|
||||
|
||||
Separate parsing and creation of the NFA. Avoided recursion on
|
||||
the (very unbalanced) parse tree.
|
||||
[BZ #611]
|
||||
* posix/regcomp.c (struct subexp_optimize, analyze_tree, calc_epsdest,
|
||||
re_dfa_add_tree_node, mark_opt_subexp_iter): Removed.
|
||||
(optimize_subexps, duplicate_tree, calc_first, calc_next,
|
||||
mark_opt_subexp): Rewritten.
|
||||
(preorder, postorder, lower_subexps, lower_subexp, link_nfa_nodes,
|
||||
create_token_tree, free_tree, free_token): New.
|
||||
(analyze): Accept a regex_t *. Invoke the passes via the preorder and
|
||||
postorder generic visitors. Do not initialize the fields in the
|
||||
re_dfa_t that represent the transitions.
|
||||
(free_dfa_content): Use free_token.
|
||||
(re_compile_internal): Analyze before UTF-8 optimizations. Do not
|
||||
include optimization of subexpressions.
|
||||
(create_initial_state): Fetch the DFA node index from the first node's
|
||||
bin_tree_t *.
|
||||
(optimize_utf8): Abort on unexpected nodes, including OP_DUP_QUESTION.
|
||||
Return on COMPLEX_BRACKET.
|
||||
(duplicate_node_closure): Fix comment.
|
||||
(duplicate_node): Do not initialize the fields in the
|
||||
re_dfa_t that represent the transitions.
|
||||
(calc_eclosure, calc_inveclosure): Do not handle OP_DELETED_SUBEXP.
|
||||
(create_tree): Remove final argument. All callers adjusted. Rewritten
|
||||
to use create_token_tree.
|
||||
(parse_reg_exp, parse_branch, parse_expression, parse_bracket_exp,
|
||||
build_charclass_op): Use create_tree or create_token_tree instead
|
||||
of re_dfa_add_tree_node.
|
||||
(parse_dup_op): Likewise. Also free the tree using free_tree for
|
||||
"<re>{0}", and lower OP_DUP_QUESTION to OP_ALT: "a?" is equivalent
|
||||
to "a|". Adjust invocation of mark_opt_subexp.
|
||||
(parse_sub_exp): Create a single SUBEXP node.
|
||||
* posix/regex_internal.c (re_dfa_add_node): Remove last parameter,
|
||||
always perform as if it was 1. Do not initialize OPT_SUBEXP and
|
||||
DUPLICATED, and initialize the DFA fields representing the transitions.
|
||||
* posix/regex_internal.h (re_dfa_add_node): Adjust prototype.
|
||||
(re_token_type_t): Move OP_DUP_PLUS and OP_DUP_QUESTION to the tokens
|
||||
section. Add a tree-only code SUBEXP. Remove OP_DELETED_SUBEXP.
|
||||
(bin_tree_t): Include a full re_token_t for TOKEN. Turn FIRST and
|
||||
NEXT into pointers to trees. Remove ECLOSURE.
|
||||
|
||||
2004-12-28 Paolo Bonzini <bonzini@gnu.org>
|
||||
|
||||
[BZ #605]
|
||||
* posix/regcomp.c (parse_bracket_exp): Do not modify DFA nodes
|
||||
that were already created.
|
||||
* posix/regex_internal.c (re_dfa_add_node): Set accept_mb field
|
||||
in the token if needed.
|
||||
(create_ci_newstate, create_cd_newstate): Set accept_mb field
|
||||
from the tokens' field.
|
||||
* posix/regex_internal.h (re_token_t): Add accept_mb field.
|
||||
(ACCEPT_MB_NODE): Removed.
|
||||
* posix/regexec.c (proceed_next_node, transit_state_mb,
|
||||
build_sifted_states, check_arrival_add_next_nodes): Use
|
||||
accept_mb instead of ACCEPT_MB_NODE.
|
||||
|
||||
2005-01-26 Ulrich Drepper <drepper@redhat.com>
|
||||
|
||||
* debug/chk_fail.c (__chk_fail): Print program name in final message.
|
||||
|
974
posix/regcomp.c
974
posix/regcomp.c
File diff suppressed because it is too large
Load Diff
@ -1330,47 +1330,49 @@ re_node_set_remove_at (set, idx)
|
||||
Or return -1 if an error occurred. */
|
||||
|
||||
static int
|
||||
re_dfa_add_node (dfa, token, mode)
|
||||
re_dfa_add_node (dfa, token)
|
||||
re_dfa_t *dfa;
|
||||
re_token_t token;
|
||||
int mode;
|
||||
{
|
||||
int type = token.type;
|
||||
if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
|
||||
{
|
||||
int new_nodes_alloc = dfa->nodes_alloc * 2;
|
||||
int *new_nexts, *new_indices;
|
||||
re_node_set *new_edests, *new_eclosures, *new_inveclosures;
|
||||
|
||||
re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
|
||||
new_nodes_alloc);
|
||||
if (BE (new_array == NULL, 0))
|
||||
return -1;
|
||||
dfa->nodes = new_array;
|
||||
if (mode)
|
||||
{
|
||||
int *new_nexts, *new_indices;
|
||||
re_node_set *new_edests, *new_eclosures, *new_inveclosures;
|
||||
|
||||
new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
|
||||
new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
|
||||
new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
|
||||
new_eclosures = re_realloc (dfa->eclosures, re_node_set,
|
||||
new_nodes_alloc);
|
||||
new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
|
||||
new_nodes_alloc);
|
||||
if (BE (new_nexts == NULL || new_indices == NULL
|
||||
|| new_edests == NULL || new_eclosures == NULL
|
||||
|| new_inveclosures == NULL, 0))
|
||||
return -1;
|
||||
dfa->nexts = new_nexts;
|
||||
dfa->org_indices = new_indices;
|
||||
dfa->edests = new_edests;
|
||||
dfa->eclosures = new_eclosures;
|
||||
dfa->inveclosures = new_inveclosures;
|
||||
}
|
||||
new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
|
||||
new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
|
||||
new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
|
||||
new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
|
||||
new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
|
||||
new_nodes_alloc);
|
||||
if (BE (new_nexts == NULL || new_indices == NULL
|
||||
|| new_edests == NULL || new_eclosures == NULL
|
||||
|| new_inveclosures == NULL, 0))
|
||||
return -1;
|
||||
dfa->nexts = new_nexts;
|
||||
dfa->org_indices = new_indices;
|
||||
dfa->edests = new_edests;
|
||||
dfa->eclosures = new_eclosures;
|
||||
dfa->inveclosures = new_inveclosures;
|
||||
dfa->nodes_alloc = new_nodes_alloc;
|
||||
}
|
||||
dfa->nodes[dfa->nodes_len] = token;
|
||||
dfa->nodes[dfa->nodes_len].opt_subexp = 0;
|
||||
dfa->nodes[dfa->nodes_len].duplicated = 0;
|
||||
dfa->nodes[dfa->nodes_len].constraint = 0;
|
||||
#ifdef RE_ENABLE_I18N
|
||||
dfa->nodes[dfa->nodes_len].accept_mb =
|
||||
(type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
|
||||
#endif
|
||||
dfa->nexts[dfa->nodes_len] = -1;
|
||||
re_node_set_init_empty (dfa->edests + dfa->nodes_len);
|
||||
re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
|
||||
re_node_set_init_empty (dfa->inveclosures + dfa->nodes_len);
|
||||
return dfa->nodes_len++;
|
||||
}
|
||||
|
||||
@ -1551,16 +1553,13 @@ create_ci_newstate (dfa, nodes, hash)
|
||||
re_token_type_t type = node->type;
|
||||
if (type == CHARACTER && !node->constraint)
|
||||
continue;
|
||||
#ifdef RE_ENABLE_I18N
|
||||
newstate->accept_mb |= node->accept_mb;
|
||||
#endif /* RE_ENABLE_I18N */
|
||||
|
||||
/* If the state has the halt node, the state is a halt state. */
|
||||
else if (type == END_OF_RE)
|
||||
if (type == END_OF_RE)
|
||||
newstate->halt = 1;
|
||||
#ifdef RE_ENABLE_I18N
|
||||
else if (type == COMPLEX_BRACKET
|
||||
|| type == OP_UTF8_PERIOD
|
||||
|| (type == OP_PERIOD && dfa->mb_cur_max > 1))
|
||||
newstate->accept_mb = 1;
|
||||
#endif /* RE_ENABLE_I18N */
|
||||
else if (type == OP_BACK_REF)
|
||||
newstate->has_backref = 1;
|
||||
else if (type == ANCHOR || node->constraint)
|
||||
@ -1611,15 +1610,13 @@ create_cd_newstate (dfa, nodes, context, hash)
|
||||
|
||||
if (type == CHARACTER && !constraint)
|
||||
continue;
|
||||
/* If the state has the halt node, the state is a halt state. */
|
||||
else if (type == END_OF_RE)
|
||||
newstate->halt = 1;
|
||||
#ifdef RE_ENABLE_I18N
|
||||
else if (type == COMPLEX_BRACKET
|
||||
|| type == OP_UTF8_PERIOD
|
||||
|| (type == OP_PERIOD && dfa->mb_cur_max > 1))
|
||||
newstate->accept_mb = 1;
|
||||
newstate->accept_mb |= node->accept_mb;
|
||||
#endif /* RE_ENABLE_I18N */
|
||||
|
||||
/* If the state has the halt node, the state is a halt state. */
|
||||
if (type == END_OF_RE)
|
||||
newstate->halt = 1;
|
||||
else if (type == OP_BACK_REF)
|
||||
newstate->has_backref = 1;
|
||||
else if (type == ANCHOR)
|
||||
|
@ -189,16 +189,16 @@ typedef enum
|
||||
OP_CLOSE_SUBEXP = EPSILON_BIT | 1,
|
||||
OP_ALT = EPSILON_BIT | 2,
|
||||
OP_DUP_ASTERISK = EPSILON_BIT | 3,
|
||||
OP_DUP_PLUS = EPSILON_BIT | 4,
|
||||
OP_DUP_QUESTION = EPSILON_BIT | 5,
|
||||
ANCHOR = EPSILON_BIT | 6,
|
||||
OP_DELETED_SUBEXP = EPSILON_BIT | 7,
|
||||
ANCHOR = EPSILON_BIT | 4,
|
||||
|
||||
/* Tree type, these are used only by tree. */
|
||||
CONCAT = 16,
|
||||
SUBEXP = 17,
|
||||
|
||||
/* Token type, these are used only by token. */
|
||||
OP_OPEN_BRACKET = 17,
|
||||
OP_DUP_PLUS = 18,
|
||||
OP_DUP_QUESTION,
|
||||
OP_OPEN_BRACKET,
|
||||
OP_CLOSE_BRACKET,
|
||||
OP_CHARSET_RANGE,
|
||||
OP_OPEN_DUP_NUM,
|
||||
@ -287,6 +287,7 @@ typedef struct
|
||||
unsigned int duplicated : 1;
|
||||
unsigned int opt_subexp : 1;
|
||||
#ifdef RE_ENABLE_I18N
|
||||
unsigned int accept_mb : 1;
|
||||
/* These 2 bits can be moved into the union if needed (e.g. if running out
|
||||
of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */
|
||||
unsigned int mb_partial : 1;
|
||||
@ -295,8 +296,6 @@ typedef struct
|
||||
} re_token_t;
|
||||
|
||||
#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
|
||||
#define ACCEPT_MB_NODE(type) \
|
||||
((type) >= OP_PERIOD && (type) <= OP_UTF8_PERIOD)
|
||||
|
||||
struct re_string_t
|
||||
{
|
||||
@ -432,15 +431,14 @@ struct bin_tree_t
|
||||
struct bin_tree_t *parent;
|
||||
struct bin_tree_t *left;
|
||||
struct bin_tree_t *right;
|
||||
struct bin_tree_t *first;
|
||||
struct bin_tree_t *next;
|
||||
|
||||
re_token_t token;
|
||||
|
||||
/* `node_idx' is the index in dfa->nodes, if `type' == 0.
|
||||
Otherwise `type' indicate the type of this node. */
|
||||
re_token_type_t type;
|
||||
int node_idx;
|
||||
|
||||
int first;
|
||||
int next;
|
||||
re_node_set eclosure;
|
||||
};
|
||||
typedef struct bin_tree_t bin_tree_t;
|
||||
|
||||
@ -680,7 +678,7 @@ static void re_node_set_remove_at (re_node_set *set, int idx) internal_function;
|
||||
(re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
|
||||
#define re_node_set_empty(p) ((p)->nelem = 0)
|
||||
#define re_node_set_free(set) re_free ((set)->elems)
|
||||
static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token, int mode) internal_function;
|
||||
static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token) internal_function;
|
||||
static re_dfastate_t *re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa,
|
||||
const re_node_set *nodes) internal_function;
|
||||
static re_dfastate_t *re_acquire_state_context (reg_errcode_t *err,
|
||||
|
@ -1262,7 +1262,7 @@ proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
|
||||
re_token_type_t type = dfa->nodes[node].type;
|
||||
|
||||
#ifdef RE_ENABLE_I18N
|
||||
if (ACCEPT_MB_NODE (type))
|
||||
if (dfa->nodes[node].accept_mb)
|
||||
naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
|
||||
else
|
||||
#endif /* RE_ENABLE_I18N */
|
||||
@ -1624,15 +1624,13 @@ build_sifted_states (mctx, sctx, str_idx, cur_dest)
|
||||
int naccepted = 0;
|
||||
int ret;
|
||||
|
||||
#if defined DEBUG || defined RE_ENABLE_I18N
|
||||
re_token_type_t type = dfa->nodes[prev_node].type;
|
||||
#endif
|
||||
#ifdef DEBUG
|
||||
re_token_type_t type = dfa->nodes[prev_node].type;
|
||||
assert (!IS_EPSILON_NODE (type));
|
||||
#endif
|
||||
#ifdef RE_ENABLE_I18N
|
||||
/* If the node may accept `multi byte'. */
|
||||
if (ACCEPT_MB_NODE (type))
|
||||
if (dfa->nodes[prev_node].accept_mb)
|
||||
naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
|
||||
str_idx, sctx->last_str_idx);
|
||||
#endif /* RE_ENABLE_I18N */
|
||||
@ -2486,7 +2484,7 @@ transit_state_mb (mctx, pstate)
|
||||
}
|
||||
|
||||
/* How many bytes the node can accept? */
|
||||
if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
|
||||
if (dfa->nodes[cur_node_idx].accept_mb)
|
||||
naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
|
||||
re_string_cur_idx (&mctx->input));
|
||||
if (naccepted == 0)
|
||||
@ -3020,15 +3018,13 @@ check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
|
||||
{
|
||||
int naccepted = 0;
|
||||
int cur_node = cur_nodes->elems[cur_idx];
|
||||
#if defined DEBUG || defined RE_ENABLE_I18N
|
||||
re_token_type_t type = dfa->nodes[cur_node].type;
|
||||
#endif
|
||||
#ifdef DEBUG
|
||||
re_token_type_t type = dfa->nodes[cur_node].type;
|
||||
assert (!IS_EPSILON_NODE (type));
|
||||
#endif
|
||||
#ifdef RE_ENABLE_I18N
|
||||
/* If the node may accept `multi byte'. */
|
||||
if (ACCEPT_MB_NODE (type))
|
||||
if (dfa->nodes[cur_node].accept_mb)
|
||||
{
|
||||
naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
|
||||
str_idx);
|
||||
|
Loading…
x
Reference in New Issue
Block a user