|
Message-ID: <007f7c7408aa227ee7b0fb3d82e44b8142db6ff4.1682681245.git.nabijaczleweli@nabijaczleweli.xyz>
Date: Fri, 28 Apr 2023 13:39:06 +0200
From: наб <nabijaczleweli@...ijaczleweli.xyz>
Cc: musl@...ts.openwall.com
Subject: [PATCH v2 1/2] regex: add BSD-style REG_STARTEND
This extension originates from the BSDs, and is also available in the
illumos gate and in glibc (where it is buggy).
REG_STARTEND affects regexec() in the following way:
the string to be matched is
[string + pmatch->rm_so, string + pmatch->rm_eo)
rather than
[string, string + strlen(string))
This allows matching data with embedded NULs
(and on other implementations avoids a strlen() over the input string),
and limiting the far side of the matched string
(thus potentially matching unterminated strings).
The matches written to pmatch are still referenced to string
(not string + pmatch->rm_so).
As an example, the following program:
#include <locale.h>
#include <regex.h>
#include <stdio.h>
/* Demo driver for REG_STARTEND: v[1] is the pattern; v[2], when given, is
   the subject string, otherwise a built-in one with an embedded NUL is used. */
int main(int c, char ** v) {
/* Take the locale from the environment, so multibyte subjects are decoded accordingly. */
setlocale(LC_ALL, "");
regex_t r;
/* Compiled with cflags of 0; the return value of regcomp() is not checked here. */
regcomp(&r, v[1], 0);
/* On input, dt[0] carries the REG_STARTEND range: bytes [1, 4) of the subject.
   dt[1] is zero-initialized, which is why a failed match prints "0, 0" for it. */
regmatch_t dt[2] = {{1, 4}};
/* `a ?: b` (omitted middle operand) is a GNU C extension meaning `a ? a : b`.
   Prints the regexec() return value. */
printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt, REG_STARTEND));
/* Prints dt[0] and dt[1] as "rm_so, rm_eo" pairs. */
printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1].rm_so, (int)dt[1].rm_eo);
}
produces
$ ./a.out '^a' # matching in "a\0c"
0
1, 2; -1, -1
$ ./a.out 'c$'
0
3, 4; -1, -1
$ ./a.out 'c$' '_ac' # matching in "ac\0"
1
1, 4; 0, 0
$ ./a.out '^\(a\).\1$' _abad # matching in "aba"
0
1, 4; 1, 2
$ ./a.out 'ać' '_aaćdef' # ć is two bytes in UTF-8
1 # matching in "aa\xC4"
1, 4; 0, 0
$ ./a.out 'ać' '_aćdef' # matching in "ać"
0
1, 4; -1, -1
$ ./a.out '^a.c$'
0
1, 4; -1, -1
$ ./a.out 'a[^-]c$'
0
1, 4; -1, -1
The last two results do not hold in musl with this patch alone, though.
The bulk of the implementation is concentrated in GET_NEXT_WCHAR():
if REG_STARTEND was requested, we smooth over embedded NULs by replacing
them with (wchar_t)-1, and cap how many bytes mbtowc() may consume so that
it never reads past rm_eo; if no bytes remain, we return L'\0'.
To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(),
in that yielding an L'\0' means end-of-string; this is heavily baked
into the matchers, and embedded NULs are unnameable within the regex
anyway.
---
v2: fixed style and made the message probably a bit saner; NFC
Series tested with the v2 tst-regex-startend.c, which should port to more
compilers and is available at
https://sourceware.org/pipermail/libc-alpha/2023-April/147564.html
Keep me in CC: please.
include/regex.h | 1 +
src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------
2 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/include/regex.h b/include/regex.h
index dce21771..01ab326e 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
#define REG_NOTBOL 1
#define REG_NOTEOL 2
+#define REG_STARTEND 4
#define REG_OK 0
#define REG_NOMATCH 1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 253b0e14..763dde58 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -44,17 +44,23 @@
static void
tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
+ const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+ const regmatch_t *startend);
/***********************************************************************
from tre-match-utils.h
***********************************************************************/
+
#define GET_NEXT_WCHAR() do { \
+ size_t max_len = startend ? \
+ MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX) : \
+ MB_LEN_MAX; \
prev_c = next_c; pos += pos_add_next; \
- if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
+ if (!max_len) { next_c = L'\0'; pos_add_next = 1; } \
+ else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) { \
if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
- else pos_add_next++; \
+ else { pos_add_next++; if (startend) next_c = -1; }; \
} \
str_byte += pos_add_next; \
} while (0)
@@ -169,11 +175,11 @@ typedef struct {
static reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
regoff_t *match_tags, int eflags,
- regoff_t *match_end_ofs)
+ regoff_t *match_end_ofs, const regmatch_t *startend)
{
/* State variables required by GET_NEXT_WCHAR. */
tre_char_t prev_c = 0, next_c = 0;
- const char *str_byte = string;
+ const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
regoff_t pos = -1;
regoff_t pos_add_next = 1;
#ifdef TRE_MBSTATE
@@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct {
static reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
- regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
+ regoff_t *match_tags, int eflags,
+ regoff_t *match_end_ofs, const regmatch_t *startend)
{
/* State variables required by GET_NEXT_WCHAR. */
tre_char_t prev_c = 0, next_c = 0;
- const char *str_byte = string;
+ const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
regoff_t pos = 0;
regoff_t pos_add_next = 1;
#ifdef TRE_MBSTATE
@@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
/* Get the substring we need to match against. Remember to
turn off REG_NOSUB temporarily. */
tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
- tnfa, tags, pos);
+ tnfa, tags, pos, startend);
so = pmatch[bt].rm_so;
eo = pmatch[bt].rm_eo;
bt_len = eo - so;
@@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
endpoint values. */
static void
tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
- const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
+ const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+ const regmatch_t *startend)
{
tre_submatch_data_t *submatch_data;
+ regoff_t offset = startend ? startend->rm_so : 0;
unsigned int i, j;
int *parents;
@@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
was not part of the match. */
if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+ else
+ { pmatch[i].rm_so += offset; pmatch[i].rm_eo += offset; }
i++;
}
@@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
reg_errcode_t status;
regoff_t *tags = NULL, eo;
+ const regmatch_t *startend = (eflags & REG_STARTEND) ? pmatch : NULL;
if (tnfa->cflags & REG_NOSUB) nmatch = 0;
if (tnfa->num_tags > 0 && nmatch > 0)
{
@@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *restrict string,
if (tnfa->have_backrefs)
{
/* The regex has back references, use the backtracking matcher. */
- status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+ status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo,
+ startend);
}
else
{
/* Exact matching, no back references, use the parallel matcher. */
- status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+ status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo,
+ startend);
}
if (status == REG_OK)
/* A match was found, so fill the submatch registers. */
- tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+ tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend);
if (tags)
xfree(tags);
return status;
--
2.30.2
Download attachment "signature.asc" of type "application/pgp-signature" (834 bytes)
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.