Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <007f7c7408aa227ee7b0fb3d82e44b8142db6ff4.1682681245.git.nabijaczleweli@nabijaczleweli.xyz>
Date: Fri, 28 Apr 2023 13:39:06 +0200
From: наб <nabijaczleweli@...ijaczleweli.xyz>
Cc: musl@...ts.openwall.com
Subject: [PATCH v2 1/2] regex: add BSD-style REG_STARTEND

This extension originates from the BSDs, and is also available in the
illumos gate and in glibc (where it is buggy).

REG_STARTEND affects regexec() in the following way:
the string to be matched is
  [string + pmatch->rm_so, string + pmatch->rm_eo)
rather than
  [string, string + strlen(string))

This allows matching data with embedded NULs
(and on other implementations avoids a strlen() over the input string),
and limiting the far side of the matched string
(thus potentially matching unterminated strings).

The matches written to pmatch are still referenced to string
(not string + pmatch->rm_so).

As an example, the following program:
	#include <locale.h>
	#include <regex.h>
	#include <stdio.h>
	/* Demo of REG_STARTEND: compile v[1] with no cflags and match it against
	   bytes [1, 4) of v[2], or of "_a\0cdef" when no second argument is given. */
	int main(int c, char ** v) {
		/* Adopt the environment's locale (the UTF-8 examples depend on it). */
		setlocale(LC_ALL, "");
		regex_t r;
		regcomp(&r, v[1], 0);
		/* dt[0] = {1, 4} is the input range read by REG_STARTEND;
		   dt[1] is zero-initialized. */
		regmatch_t dt[2] = {{1, 4}};
		/* Print regexec()'s return value.  "a ?: b" is the GNU extension
		   meaning "a ? a : b", so v[2] is used unless it is NULL. */
		printf("%d\n", regexec(&r, v[2] ?: "_a\0cdef", sizeof(dt)/sizeof(*dt), dt, REG_STARTEND));
		/* Print the offsets left in dt[0] and dt[1] after the call. */
		printf("%d, %d; %d, %d\n", (int)dt[0].rm_so, (int)dt[0].rm_eo, (int)dt[1].rm_so, (int)dt[1].rm_eo);
	}
produces
	$ ./a.out '^a'  # matching in "a\0c"
	0
	1, 2; -1, -1
	$ ./a.out 'c$'
	0
	3, 4; -1, -1
	$ ./a.out 'c$' '_ac'  # matching in "ac\0"
	1
	1, 4; 0, 0
	$ ./a.out '^\(a\).\1$' _abad  # matching in "aba"
	0
	1, 4; 1, 2
	$ ./a.out 'ać' '_aaćdef'  # ć is two bytes in UTF-8
	1                         # matching in "aa\xC4"
	1, 4; 0, 0
	$ ./a.out 'ać' '_aćdef'   # matching in "ać"
	0
	1, 4; -1, -1
	$ ./a.out '^a.c$'
	0
	1, 4; -1, -1
	$ ./a.out 'a[^-]c$'
	0
	1, 4; -1, -1
The last two results don't hold in musl with just this patch applied, though.

The bulk of the implementation is concentrated in GET_NEXT_WCHAR():
if REG_STARTEND was requested, we smooth over embedded NULs by replacing
them with (wchar_t)-1, limit how many bytes mbtowc() may consume as we
approach the end of the range, and, once no bytes remain, return L'\0'.

To that end, GET_NEXT_WCHAR() continues to behave like mbtowc(),
in that yielding an L'\0' means end-of-string; this is heavily baked
into the matchers, and embedded NULs are unnameable within the regex
anyway.
---
v2: fixed style and made the message probably a bit saner; NFC

Series tested with the v2 tst-regex-startend.c, available at
  https://sourceware.org/pipermail/libc-alpha/2023-April/147564.html
which should port to more compilers.

Please keep me in CC.

 include/regex.h     |  1 +
 src/regex/regexec.c | 38 ++++++++++++++++++++++++++------------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/include/regex.h b/include/regex.h
index dce21771..01ab326e 100644
--- a/include/regex.h
+++ b/include/regex.h
@@ -31,6 +31,7 @@ typedef struct {
 
 #define REG_NOTBOL      1
 #define REG_NOTEOL      2
+#define REG_STARTEND    4
 
 #define REG_OK          0
 #define REG_NOMATCH     1
diff --git a/src/regex/regexec.c b/src/regex/regexec.c
index 253b0e14..763dde58 100644
--- a/src/regex/regexec.c
+++ b/src/regex/regexec.c
@@ -44,17 +44,23 @@
 
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend);
 
 /***********************************************************************
  from tre-match-utils.h
 ***********************************************************************/
 
+
 #define GET_NEXT_WCHAR() do {                                                 \
+    size_t max_len = startend ?                                               \
+        MIN((const char *)string + startend->rm_eo - str_byte, MB_LEN_MAX) :  \
+        MB_LEN_MAX;                                                           \
     prev_c = next_c; pos += pos_add_next;                                     \
-    if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
+    if (!max_len) { next_c = L'\0'; pos_add_next = 1; }                       \
+    else if ((pos_add_next = mbtowc(&next_c, str_byte, max_len)) <= 0) {      \
         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
-        else pos_add_next++;                                                  \
+        else { pos_add_next++; if (startend) next_c = -1; };                  \
     }                                                                         \
     str_byte += pos_add_next;                                                 \
   } while (0)
@@ -169,11 +175,11 @@ typedef struct {
 static reg_errcode_t
 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
 		      regoff_t *match_tags, int eflags,
-		      regoff_t *match_end_ofs)
+		      regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = -1;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -591,11 +597,12 @@ typedef struct tre_backtrack_struct {
 
 static reg_errcode_t
 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
-		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
+		       regoff_t *match_tags, int eflags,
+		       regoff_t *match_end_ofs, const regmatch_t *startend)
 {
   /* State variables required by GET_NEXT_WCHAR. */
   tre_char_t prev_c = 0, next_c = 0;
-  const char *str_byte = string;
+  const char *str_byte = (const char *)string + (startend ? startend->rm_so : 0);
   regoff_t pos = 0;
   regoff_t pos_add_next = 1;
 #ifdef TRE_MBSTATE
@@ -777,7 +784,7 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
 	  /* Get the substring we need to match against.  Remember to
 	     turn off REG_NOSUB temporarily. */
 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
-			  tnfa, tags, pos);
+			  tnfa, tags, pos, startend);
 	  so = pmatch[bt].rm_so;
 	  eo = pmatch[bt].rm_eo;
 	  bt_len = eo - so;
@@ -928,9 +935,11 @@ tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
    endpoint values. */
 static void
 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
-		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
+		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo,
+		const regmatch_t *startend)
 {
   tre_submatch_data_t *submatch_data;
+  regoff_t offset = startend ? startend->rm_so : 0;
   unsigned int i, j;
   int *parents;
 
@@ -955,6 +964,8 @@ tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
 	     was not part of the match. */
 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
+	  else
+	    { pmatch[i].rm_so += offset; pmatch[i].rm_eo += offset; }
 
 	  i++;
 	}
@@ -999,6 +1010,7 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
   reg_errcode_t status;
   regoff_t *tags = NULL, eo;
+  const regmatch_t *startend = (eflags & REG_STARTEND) ? pmatch : NULL;
   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
   if (tnfa->num_tags > 0 && nmatch > 0)
     {
@@ -1011,17 +1023,19 @@ regexec(const regex_t *restrict preg, const char *restrict string,
   if (tnfa->have_backrefs)
     {
       /* The regex has back references, use the backtracking matcher. */
-      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo,
+                                      startend);
     }
   else
     {
       /* Exact matching, no back references, use the parallel matcher. */
-      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
+      status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo,
+                                     startend);
     }
 
   if (status == REG_OK)
     /* A match was found, so fill the submatch registers. */
-    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
+    tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo, startend);
   if (tags)
     xfree(tags);
   return status;
-- 
2.30.2


Download attachment "signature.asc" of type "application/pgp-signature" (834 bytes)

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.