234 lines
6.5 KiB
C++
234 lines
6.5 KiB
C++
|
#include "regexp.h"
|
||
|
// TODO: make a little more multi-byte safe
|
||
|
|
||
|
|
||
|
|
||
|
// regexp match functions
|
||
|
|
||
|
// A match means the entire string TEXT is used up in matching.
|
||
|
// In the pattern string:
|
||
|
// `*' matches any sequence of characters (zero or more)
|
||
|
// `?' matches any character
|
||
|
// [SET] matches any character in the specified set,
|
||
|
// [!SET] or [^SET] matches any character not in the specified set.
|
||
|
|
||
|
// A set is composed of characters or ranges; a range looks like
|
||
|
// character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
|
||
|
// minimal set of characters allowed in the [..] pattern construct.
|
||
|
// Other characters are allowed (i.e. 8-bit characters) if your system
|
||
|
// will support them.
|
||
|
|
||
|
// To suppress the special syntactic significance of any of `[]*?!^-\',
|
||
|
// and match the character exactly, precede it with a `\'.
|
||
|
|
||
|
/* Result codes produced by Matche() and matche_after_star(). */
enum {
    MATCH_VALID   = 1,  /* the pattern matched the text */
    MATCH_END     = 2,  /* the pattern ended before the text did */
    MATCH_ABORT   = 3,  /* the text ended before the pattern did */
    MATCH_RANGE   = 4,  /* a text character was rejected by a [..] construct */
    MATCH_LITERAL = 5,  /* a text character differed from a literal pattern character */
    MATCH_PATTERN = 6   /* the pattern itself is malformed */
};
|
||
|
|
||
|
/*
 * Pattern validation codes.
 * NOTE(review): none of these are referenced in the visible part of this
 * file; presumably they are used by a pattern checker elsewhere - confirm.
 */
enum {
    PATTERN_EMPTY = -4, /* [..] construct is empty */
    PATTERN_CLOSE = -3, /* no closing bracket in a [..] construct */
    PATTERN_RANGE = -2, /* malformed range in a [..] construct */
    PATTERN_ESC   = -1, /* escape character is the last character of the pattern */
    PATTERN_VALID = 0   /* pattern is well formed */
};
|
||
|
|
||
|
/* Forward declaration: matche_after_star() calls Matche(), which is defined after it. */
int Matche(const regchar_t *p, const regchar_t *t);
|
||
|
|
||
|
// TODO: make this multi-byte aware
|
||
|
int matche_after_star(const regchar_t *p, const regchar_t *t)
|
||
|
{
|
||
|
register int match = 0;
|
||
|
register regchar_t nextp;
|
||
|
/* pass over existing ? and * in pattern */
|
||
|
while ( *p == '?' || *p == '*' )
|
||
|
{
|
||
|
/* take one char for each ? and + */
|
||
|
if (*p == '?')
|
||
|
{
|
||
|
/* if end of text then no match */
|
||
|
if (!*t++) return MATCH_ABORT;
|
||
|
}
|
||
|
/* move to next char in pattern */
|
||
|
p++;
|
||
|
}
|
||
|
/* if end of pattern we have matched regardless of text left */
|
||
|
if (!*p) return MATCH_VALID;
|
||
|
/* get the next character to match which must be a literal or '[' */
|
||
|
nextp = *p;
|
||
|
if (nextp == '\\')
|
||
|
{
|
||
|
nextp = p[1];
|
||
|
/* if end of text then we have a bad pattern */
|
||
|
if (!nextp) return MATCH_PATTERN;
|
||
|
}
|
||
|
/* Continue until we run out of text or definite result seen */
|
||
|
do
|
||
|
{
|
||
|
/* a precondition for matching is that the next character
|
||
|
in the pattern match the next character in the text or that
|
||
|
the next pattern char is the beginning of a range. Increment
|
||
|
text pointer as we go here */
|
||
|
if (nextp == *t || nextp == '[') match = Matche(p, t);
|
||
|
/* if the end of text is reached then no match */
|
||
|
if (!*t++) match = MATCH_ABORT;
|
||
|
}
|
||
|
while ( match != MATCH_VALID && match != MATCH_ABORT && match != MATCH_PATTERN);
|
||
|
/* return result */
|
||
|
return match;
|
||
|
}
|
||
|
|
||
|
|
||
|
int Matche(const regchar_t *p, const regchar_t *t)
|
||
|
{
|
||
|
regchar_t range_start, range_end; /* start and end in range */
|
||
|
|
||
|
bool invert; /* is this [..] or [!..] */
|
||
|
bool member_match; /* have I matched the [..] construct? */
|
||
|
bool loop; /* should I terminate? */
|
||
|
|
||
|
for ( ; *p; p++, t++)
|
||
|
{
|
||
|
/* if this is the end of the text then this is the end of the match */
|
||
|
if (!*t)
|
||
|
{
|
||
|
return (*p == '*' && *++p == '\0') ? MATCH_VALID : MATCH_ABORT;
|
||
|
}
|
||
|
/* determine and react to pattern type */
|
||
|
switch (*p)
|
||
|
{
|
||
|
case '?': /* single any character match */
|
||
|
break;
|
||
|
case '*': /* multiple any character match */
|
||
|
return matche_after_star (p, t);
|
||
|
|
||
|
/* [..] construct, single member/exclusion character match */
|
||
|
case '[':
|
||
|
{
|
||
|
/* move to beginning of range */
|
||
|
p++;
|
||
|
/* check if this is a member match or exclusion match */
|
||
|
invert = false;
|
||
|
if (*p == '!' || *p == '^')
|
||
|
{
|
||
|
invert = true;
|
||
|
p++;
|
||
|
}
|
||
|
/* if closing bracket here or at range start then we have a malformed pattern */
|
||
|
if (*p == ']')
|
||
|
return MATCH_PATTERN;
|
||
|
|
||
|
member_match = false;
|
||
|
loop = true;
|
||
|
while (loop)
|
||
|
{
|
||
|
/* if end of construct then loop is done */
|
||
|
if (*p == ']')
|
||
|
{
|
||
|
loop = false;
|
||
|
continue;
|
||
|
}
|
||
|
/* matching a '!', '^', '-', '\' or a ']' */
|
||
|
if (*p == '\\')
|
||
|
range_start = range_end = *++p;
|
||
|
else
|
||
|
range_start = range_end = *p;
|
||
|
/* if end of pattern then bad pattern (Missing ']') */
|
||
|
if (!*p)
|
||
|
return MATCH_PATTERN;
|
||
|
/* check for range bar */
|
||
|
if (*++p == '-')
|
||
|
{
|
||
|
/* get the range end */
|
||
|
range_end = *++p;
|
||
|
/* if end of pattern or construct then bad pattern */
|
||
|
if (range_end == '\0' || range_end == ']') return MATCH_PATTERN;
|
||
|
/* special character range end */
|
||
|
if (range_end == '\\')
|
||
|
{
|
||
|
range_end = *++p;
|
||
|
/* if end of text then we have a bad pattern */
|
||
|
if (!range_end) return MATCH_PATTERN;
|
||
|
}
|
||
|
/* move just beyond this range */
|
||
|
p++;
|
||
|
}
|
||
|
/* if the text character is in range then match found.
|
||
|
make sure the range letters have the proper
|
||
|
relationship to one another before comparison */
|
||
|
if (range_start < range_end)
|
||
|
{
|
||
|
if (*t >= range_start && *t <= range_end)
|
||
|
{
|
||
|
member_match = true;
|
||
|
loop = false;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if (*t >= range_end && *t <= range_start)
|
||
|
{
|
||
|
member_match = true;
|
||
|
loop = false;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
/* if there was a match in an exclusion set then no match */
|
||
|
/* if there was no match in a member set then no match */
|
||
|
if ((invert && member_match) || !(invert || member_match))
|
||
|
return MATCH_RANGE;
|
||
|
/* if this is not an exclusion then skip the rest of the [...] construct that already matched. */
|
||
|
if (member_match)
|
||
|
{
|
||
|
while (p && *p != ']')
|
||
|
{
|
||
|
/* bad pattern (Missing ']') */
|
||
|
if (!*p)
|
||
|
return MATCH_PATTERN;
|
||
|
/* skip exact match */
|
||
|
if (*p == '\\')
|
||
|
{
|
||
|
p++;
|
||
|
/* if end of text then we have a bad pattern */
|
||
|
if (!*p)
|
||
|
return MATCH_PATTERN;
|
||
|
}
|
||
|
/* move to next pattern char */
|
||
|
p++;
|
||
|
}
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
case '\\': /* next character is quoted and must match exactly */
|
||
|
/* move pattern pointer to quoted char and fall through */
|
||
|
p++;
|
||
|
/* if end of text then we have a bad pattern */
|
||
|
if (!*p)
|
||
|
return MATCH_PATTERN;
|
||
|
/* must match this character exactly */
|
||
|
default:
|
||
|
if (*p != *t)
|
||
|
return MATCH_LITERAL;
|
||
|
}
|
||
|
}
|
||
|
/* if end of text not reached then the pattern fails */
|
||
|
if (*t)
|
||
|
return MATCH_END;
|
||
|
else return MATCH_VALID;
|
||
|
}
|
||
|
|
||
|
bool Match(const regchar_t *match, const regchar_t *string)
|
||
|
{
|
||
|
if (!match)
|
||
|
return true;
|
||
|
int error_type;
|
||
|
|
||
|
error_type = Matche(match, string);
|
||
|
return (error_type == MATCH_VALID);
|
||
|
}
|