Discussion:
[PATCH 2/4] dfa: minor performance tweak
Paul Eggert
2016-12-30 09:03:20 UTC
Permalink
* lib/dfa.c (setbit_wc): Test < 0, not == EOF.
---
ChangeLog | 3 +++
lib/dfa.c | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/ChangeLog b/ChangeLog
index 27ee040..6d37212 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2016-12-30 Paul Eggert <***@cs.ucla.edu>

+ dfa: minor performance tweak
+ * lib/dfa.c (setbit_wc): Test < 0, not == EOF.
+
dfa: wrap charclass inside a struct
On my platform (gcc Ubuntu 5.4.0-6ubuntu1~16.04.4 x86-64,
en_US.utf8 locale) this makes 'grep -Fi -f list.txt list.txt >out'
diff --git a/lib/dfa.c b/lib/dfa.c
index a4ccf8c..cdc4787 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -851,7 +851,7 @@ static bool
setbit_wc (wint_t wc, charclass *c)
{
int b = wctob (wc);
- if (b == EOF)
+ if (b < 0)
return false;

setbit (b, c);
--
2.7.4
Paul Eggert
2016-12-30 09:03:22 UTC
Permalink
* lib/dfa.c (struct regex_syntax.sbit):
(struct dfa.success): Use char, not int, for array elements, since
they are all in the range 0..7.
---
ChangeLog | 5 +++++
lib/dfa.c | 4 ++--
2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index db2817f..6dfa3c7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2016-12-30 Paul Eggert <***@cs.ucla.edu>

+ dfa: shorten sbit, success
+ * lib/dfa.c (struct regex_syntax.sbit):
+ (struct dfa.success): Use char, not int, for array elements, since
+ they are all in the range 0..7.
+
dfa: simplify multibyte_prop etc.
This follows up on a change made when dfa.c was in grep, namely grep
commit c797046c7c13c2647182b919a79a4c5b4ecf82b1
diff --git a/lib/dfa.c b/lib/dfa.c
index c5c1d7f..41be5a9 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -353,7 +353,7 @@ struct regex_syntax
unsigned char eolbyte;

/* Cache of char-context values. */
- int sbit[NOTCHAR];
+ char sbit[NOTCHAR];

/* If never_trail[B], the byte B cannot be a non-initial byte in a
multibyte character. */
@@ -501,7 +501,7 @@ struct dfa
on a state that potentially could do so.
If trans[i] is non-null, fails[i] must
be null. */
- int *success; /* Table of acceptance conditions used in
+ char *success; /* Table of acceptance conditions used in
dfaexec and computed in build_state. */
state_num *newlines; /* Transitions on newlines. The entry for a
newline in any transition table is always
--
2.7.4
Paul Eggert
2016-12-30 09:03:21 UTC
Permalink
This follows up on a change made when dfa.c was in grep, namely grep
commit c797046c7c13c2647182b919a79a4c5b4ecf82b1
dated 2015-08-12 07:35:03 -0700, which removed unused multibyte support.
That earlier simplification allows for some more simplification
and trimming down here.
* lib/dfa.c (struct mb_char_classes): New member nchars_alloc.
(struct lexer_state): New mamber brack.
(struct dfa, addtok_mb): multibyte_prop elements are now char, not int,
since they must be in the range 0..3 now.
Remove members mbcsets, nmbcsets, mbcsets_alloc, since
the brack member now supersedes them.
(parse_bracket_exp): Update dfa->lex.brack instead of dfa->mbcsets.
(addtok): Use dfa->lex.brack instead of dfa->mbcsets.
(dfaparse): Remove unnecessary initializations of already-0 storage.
(free_mbdata): Free d->lex.brack.chars instead of d->mbcsets.
(dfassbuild): No need to clear sup->mbcsets.
---
ChangeLog | 18 +++++++++++++
lib/dfa.c | 86 ++++++++++++++++++---------------------------------------------
2 files changed, 42 insertions(+), 62 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 6d37212..db2817f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,23 @@
2016-12-30 Paul Eggert <***@cs.ucla.edu>

+ dfa: simplify multibyte_prop etc.
+ This follows up on a change made when dfa.c was in grep, namely grep
+ commit c797046c7c13c2647182b919a79a4c5b4ecf82b1
+ dated 2015-08-12 07:35:03 -0700, which removed unused multibyte support.
+ That earlier simplification allows for some more simplification
+ and trimming down here.
+ * lib/dfa.c (struct mb_char_classes): New member nchars_alloc.
+ (struct lexer_state): New mamber brack.
+ (struct dfa, addtok_mb): multibyte_prop elements are now char, not int,
+ since they must be in the range 0..3 now.
+ Remove members mbcsets, nmbcsets, mbcsets_alloc, since
+ the brack member now supersedes them.
+ (parse_bracket_exp): Update dfa->lex.brack instead of dfa->mbcsets.
+ (addtok): Use dfa->lex.brack instead of dfa->mbcsets.
+ (dfaparse): Remove unnecessary initializations of already-0 storage.
+ (free_mbdata): Free d->lex.brack.chars instead of d->mbcsets.
+ (dfassbuild): No need to clear sup->mbcsets.
+
dfa: minor performance tweak
* lib/dfa.c (setbit_wc): Test < 0, not == EOF.

diff --git a/lib/dfa.c b/lib/dfa.c
index cdc4787..c5c1d7f 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -333,6 +333,7 @@ struct mb_char_classes
bool invert;
wchar_t *chars; /* Normal characters. */
ptrdiff_t nchars;
+ ptrdiff_t nchars_alloc;
};

struct regex_syntax
@@ -385,6 +386,9 @@ struct lexer_state
/* Length of the multibyte representation of wctok. */
int cur_mb_len;

+ /* The most recently analyzed multibyte bracket expression. */
+ struct mb_char_classes brack;
+
/* We're separated from beginning or (, | only by zero-width characters. */
bool laststart;
};
@@ -442,9 +446,6 @@ struct dfa
bit 1 : tokens[i] is the last byte of a character, including
single-byte characters.

- if tokens[i] = MBCSET
- ("the index of mbcsets corresponding to this operator" << 2) + 3
-
e.g.
tokens
= 'single_byte_a', 'multi_byte_A', single_byte_b'
@@ -452,12 +453,7 @@ struct dfa
multibyte_prop
= 3 , 1 , 0 , 2 , 3
*/
- int *multibyte_prop;
-
- /* Array of the bracket expression in the DFA. */
- struct mb_char_classes *mbcsets;
- ptrdiff_t nmbcsets;
- ptrdiff_t mbcsets_alloc;
+ char *multibyte_prop;

/* Fields filled by the superset. */
struct dfa *superset; /* Hint of the dfa. */
@@ -970,8 +966,8 @@ find_pred (const char *str)
return NULL;
}

-/* Multibyte character handling sub-routine for lex.
- Parse a bracket expression and build a struct mb_char_classes. */
+/* Parse a bracket expression, which possibly includes multibyte
+ characters. */
static token
parse_bracket_exp (struct dfa *dfa)
{
@@ -994,28 +990,7 @@ parse_bracket_exp (struct dfa *dfa)
wint_t wc2;
wint_t wc1 = 0;

- /* Work area to build a mb_char_classes. */
- struct mb_char_classes *work_mbc;
- ptrdiff_t chars_al;
-
- chars_al = 0;
- if (dfa->localeinfo.multibyte)
- {
- dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
- &dfa->mbcsets_alloc, -1,
- sizeof *dfa->mbcsets);
-
- /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
- We will update dfa->multibyte_prop[] in addtok, because we can't
- decide the index in dfa->tokens[]. */
-
- /* Initialize work area. */
- work_mbc = &dfa->mbcsets[dfa->nmbcsets++];
- memset (work_mbc, 0, sizeof *work_mbc);
- }
- else
- work_mbc = NULL;
-
+ dfa->lex.brack.nchars = 0;
zeroset (&ccl);
FETCH_WC (dfa, c, wc, _("unbalanced ["));
if (c == '^')
@@ -1188,10 +1163,11 @@ parse_bracket_exp (struct dfa *dfa)
for (i = 0; i < n; i++)
if (!setbit_wc (folded[i], &ccl))
{
- work_mbc->chars
- = maybe_realloc (work_mbc->chars, work_mbc->nchars,
- &chars_al, -1, sizeof *work_mbc->chars);
- work_mbc->chars[work_mbc->nchars++] = folded[i];
+ dfa->lex.brack.chars
+ = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
+ &dfa->lex.brack.nchars_alloc, -1,
+ sizeof *dfa->lex.brack.chars);
+ dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
}
}
}
@@ -1205,8 +1181,8 @@ parse_bracket_exp (struct dfa *dfa)

if (dfa->localeinfo.multibyte)
{
- work_mbc->invert = invert;
- work_mbc->cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
+ dfa->lex.brack.invert = invert;
+ dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
return MBCSET;
}

@@ -1588,7 +1564,7 @@ lex (struct dfa *dfa)
}

static void
-addtok_mb (struct dfa *dfa, token t, int mbprop)
+addtok_mb (struct dfa *dfa, token t, char mbprop)
{
if (dfa->talloc == dfa->tindex)
{
@@ -1638,25 +1614,24 @@ addtok (struct dfa *dfa, token t)
if (dfa->localeinfo.multibyte && t == MBCSET)
{
bool need_or = false;
- struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
ptrdiff_t i;

/* Extract wide characters into alternations for better performance.
This does not require UTF-8. */
- for (i = 0; i < work_mbc->nchars; i++)
+ for (i = 0; i < dfa->lex.brack.nchars; i++)
{
- addtok_wc (dfa, work_mbc->chars[i]);
+ addtok_wc (dfa, dfa->lex.brack.chars[i]);
if (need_or)
addtok (dfa, OR);
need_or = true;
}
- work_mbc->nchars = 0;
+ dfa->lex.brack.nchars = 0;

- /* Characters have been handled above, so it is possible
- that the mbcset is empty now. Do nothing in that case. */
- if (work_mbc->cset != -1)
+ /* Wide characters have been handled above, so it is possible
+ that the set is empty now. Do nothing in that case. */
+ if (dfa->lex.brack.cset != -1)
{
- addtok (dfa, CSET + work_mbc->cset);
+ addtok (dfa, CSET + dfa->lex.brack.cset);
if (need_or)
addtok (dfa, OR);
}
@@ -1963,12 +1938,6 @@ dfaparse (char const *s, size_t len, struct dfa *d)
d->lex.left = len;
d->lex.lasttok = END;
d->lex.laststart = true;
- d->lex.parens = 0;
- if (d->localeinfo.multibyte)
- {
- d->lex.cur_mb_len = 0;
- memset (&d->mbs, 0, sizeof d->mbs);
- }

if (!d->syntax.syntax_bits_set)
dfaerror (_("no syntax specified"));
@@ -3333,14 +3302,8 @@ dfaisfast (struct dfa const *d)
static void
free_mbdata (struct dfa *d)
{
- ptrdiff_t i;
-
free (d->multibyte_prop);
-
- for (i = 0; i < d->nmbcsets; ++i)
- free (d->mbcsets[i].chars);
-
- free (d->mbcsets);
+ free (d->lex.brack.chars);
free (d->mb_follows.elems);

if (d->mb_trans)
@@ -3430,7 +3393,6 @@ dfassbuild (struct dfa *d)
sup->localeinfo.multibyte = false;
sup->dfaexec = dfaexec_sb;
sup->multibyte_prop = NULL;
- sup->mbcsets = NULL;
sup->superset = NULL;
sup->states = NULL;
sup->sindex = 0;
--
2.7.4
Loading...