Skip to content

Commit 6008a75

Browse files
committed
Update to PCRE2 10.39
We also apply the respective upstream fix[1]. [1] <PCRE2Project/pcre2@d144199> Closes phpGH-7678.
1 parent df5e95b commit 6008a75

27 files changed

+4633
-4146
lines changed

NEWS

+3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ PHP NEWS
66
. Fixed bug #81649 (imap_(un)delete accept sequences, not single numbers).
77
(cmb)
88

9+
- PCRE:
10+
. Update bundled PCRE2 to 10.39 (cmb)
11+
912
25 Nov 2021, PHP 8.1.0
1013

1114
- Core:

ext/pcre/pcre2lib/pcre2.h

+7-5
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
/* This is the public header file for the PCRE library, second API, to be
66
#included by applications that call PCRE2 functions.
77
8-
Copyright (c) 2016-2020 University of Cambridge
8+
Copyright (c) 2016-2021 University of Cambridge
99
1010
-----------------------------------------------------------------------------
1111
Redistribution and use in source and binary forms, with or without
@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
4242
/* The current PCRE version information. */
4343

4444
#define PCRE2_MAJOR 10
45-
#define PCRE2_MINOR 37
45+
#define PCRE2_MINOR 39
4646
#define PCRE2_PRERELEASE
47-
#define PCRE2_DATE 2021-05-26
47+
#define PCRE2_DATE 2021-10-29
4848

4949
/* When an application links to a PCRE DLL in Windows, the symbols that are
5050
imported have to be identified as such. When building PCRE2, the appropriate
@@ -84,8 +84,8 @@ set, we ensure here that it has no effect. */
8484
/* Have to include limits.h, stdlib.h, and inttypes.h to ensure that size_t and
8585
uint8_t, UCHAR_MAX, etc are defined. Some systems that do have inttypes.h do
8686
not have stdint.h, which is why we use inttypes.h, which according to the C
87-
standard is a superset of stdint.h. If none of these headers are available,
88-
the relevant values must be provided by some other means. */
87+
standard is a superset of stdint.h. If inttypes.h is not available the build
88+
will break and the relevant values must be provided by some other means. */
8989

9090
#include <limits.h>
9191
#include <stdlib.h>
@@ -152,6 +152,7 @@ D is inspected during pcre2_dfa_match() execution
152152
#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */
153153
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
154154
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
155+
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
155156

156157
/* These are for pcre2_jit_compile(). */
157158

@@ -311,6 +312,7 @@ pcre2_pattern_convert(). */
311312
#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE 196
312313
#define PCRE2_ERROR_TOO_MANY_CAPTURES 197
313314
#define PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED 198
315+
#define PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND 199
314316

315317

316318
/* "Expected" matching error codes: no match and partial match. */

ext/pcre/pcre2lib/pcre2_compile.c

+24-10
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2016-2020 University of Cambridge
10+
New API code Copyright (c) 2016-2021 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -137,7 +137,7 @@ static BOOL
137137

138138
static int
139139
check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140-
compile_block *);
140+
compile_block *, int *);
141141

142142

143143
/*************************************************
@@ -782,12 +782,15 @@ are allowed. */
782782
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
783783
(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
784784
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
785-
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX)
785+
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
786+
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
786787

787788
/* Compile time error code numbers. They are given names so that they can more
788789
easily be tracked. When a new number is added, the tables called eint1 and
789790
eint2 in pcre2posix.c may need to be updated, and a new error text must be
790-
added to compile_error_texts in pcre2_error.c. */
791+
added to compile_error_texts in pcre2_error.c. Also, the error codes in
792+
pcre2.h.in must be updated - their values are exactly 100 greater than these
793+
values. */
791794

792795
enum { ERR0 = COMPILE_ERROR_BASE,
793796
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
@@ -799,7 +802,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
799802
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
800803
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
801804
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
802-
ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98 };
805+
ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };
803806

804807
/* This is a table of start-of-pattern options such as (*UTF) and settings such
805808
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@@ -7799,6 +7802,16 @@ for (;; pptr++)
77997802
}
78007803
#endif
78017804

7805+
/* \K is forbidden in lookarounds since 10.38 because that's what Perl has
7806+
done. However, there's an option, in case anyone was relying on it. */
7807+
7808+
if (cb->assert_depth > 0 && meta_arg == ESC_K &&
7809+
(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
7810+
{
7811+
*errorcodeptr = ERR99;
7812+
return 0;
7813+
}
7814+
78027815
/* For the rest (including \X when Unicode is supported - if not it's
78037816
faulted at parse time), the OP value is the escape value when PCRE2_UCP is
78047817
not set; if it is set, these escapes do not show up here because they are
@@ -9148,7 +9161,7 @@ for (;; pptr++)
91489161
case META_LOOKAHEAD:
91499162
case META_LOOKAHEADNOT:
91509163
case META_LOOKAHEAD_NA:
9151-
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb);
9164+
*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
91529165
if (*errcodeptr != 0) return -1;
91539166

91549167
/* Ignore any qualifiers that follow a lookahead assertion. */
@@ -9488,16 +9501,16 @@ Arguments
94889501
retptr if not NULL, return the ket pointer here
94899502
recurses chain of recurse_check to catch mutual recursion
94909503
cb points to the compile block
9504+
lcptr points to loop counter
94919505
94929506
Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
94939507
*/
94949508

94959509
static int
94969510
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9497-
parsed_recurse_check *recurses, compile_block *cb)
9511+
parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
94989512
{
94999513
int errorcode = 0;
9500-
int loopcount = 0;
95019514
int nestlevel = 0;
95029515

95039516
cb->erroroffset = PCRE2_UNSET;
@@ -9623,7 +9636,7 @@ for (; *pptr != META_END; pptr++)
96239636
case META_LOOKBEHIND:
96249637
case META_LOOKBEHINDNOT:
96259638
case META_LOOKBEHIND_NA:
9626-
if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, recurses, cb))
9639+
if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
96279640
return errorcode;
96289641
break;
96299642
}
@@ -10078,7 +10091,8 @@ lengths. */
1007810091

1007910092
if (has_lookbehind)
1008010093
{
10081-
errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb);
10094+
int loopcount = 0;
10095+
errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
1008210096
if (errorcode != 0) goto HAD_CB_ERROR;
1008310097
}
1008410098

ext/pcre/pcre2lib/pcre2_dfa_match.c

+44-25
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2016-2020 University of Cambridge
10+
New API code Copyright (c) 2016-2021 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -3256,8 +3256,8 @@ BOOL has_first_cu = FALSE;
32563256
BOOL has_req_cu = FALSE;
32573257

32583258
#if PCRE2_CODE_UNIT_WIDTH == 8
3259-
BOOL memchr_not_found_first_cu = FALSE;
3260-
BOOL memchr_not_found_first_cu2 = FALSE;
3259+
PCRE2_SPTR memchr_found_first_cu = NULL;
3260+
PCRE2_SPTR memchr_found_first_cu2 = NULL;
32613261
#endif
32623262

32633263
PCRE2_UCHAR first_cu = 0;
@@ -3648,57 +3648,76 @@ for (;;)
36483648
}
36493649
}
36503650

3651-
/* Not anchored. Advance to a unique first code unit if there is one. In
3652-
8-bit mode, the use of memchr() gives a big speed up, even though we have
3653-
to call it twice in caseless mode, in order to find the earliest occurrence
3654-
of the character in either of its cases. If a call to memchr() that
3655-
searches the rest of the subject fails to find one case, remember that in
3656-
order not to keep on repeating the search. This can make a huge difference
3657-
when the strings are very long and only one case is present. */
3651+
/* Not anchored. Advance to a unique first code unit if there is one. */
36583652

36593653
else
36603654
{
36613655
if (has_first_cu)
36623656
{
36633657
if (first_cu != first_cu2) /* Caseless */
36643658
{
3659+
/* In 16-bit and 32-bit modes we have to do our own search, so can
3660+
look for both cases at once. */
3661+
36653662
#if PCRE2_CODE_UNIT_WIDTH != 8
36663663
PCRE2_UCHAR smc;
36673664
while (start_match < end_subject &&
36683665
(smc = UCHAR21TEST(start_match)) != first_cu &&
3669-
smc != first_cu2)
3666+
smc != first_cu2)
36703667
start_match++;
3668+
#else
3669+
/* In 8-bit mode, the use of memchr() gives a big speed up, even
3670+
though we have to call it twice in order to find the earliest
3671+
occurrence of the code unit in either of its cases. Caching is used
3672+
to remember the positions of previously found code units. This can
3673+
make a huge difference when the strings are very long and only one
3674+
case is actually present. */
36713675

3672-
#else /* 8-bit code units */
36733676
PCRE2_SPTR pp1 = NULL;
36743677
PCRE2_SPTR pp2 = NULL;
3675-
PCRE2_SIZE cu2size = end_subject - start_match;
3678+
PCRE2_SIZE searchlength = end_subject - start_match;
36763679

3677-
if (!memchr_not_found_first_cu)
3680+
/* If we haven't got a previously found position for first_cu, or if
3681+
the current starting position is later, we need to do a search. If
3682+
the code unit is not found, set it to the end. */
3683+
3684+
if (memchr_found_first_cu == NULL ||
3685+
start_match > memchr_found_first_cu)
36783686
{
3679-
pp1 = memchr(start_match, first_cu, end_subject - start_match);
3680-
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
3681-
else cu2size = pp1 - start_match;
3687+
pp1 = memchr(start_match, first_cu, searchlength);
3688+
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
36823689
}
36833690

3684-
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
3685-
to see if the other case is earlier, so we can set "not found" only
3686-
when both searches have returned NULL. */
3691+
/* If the start is before a previously found position, use the
3692+
previous position, or NULL if a previous search failed. */
3693+
3694+
else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3695+
memchr_found_first_cu;
36873696

3688-
if (!memchr_not_found_first_cu2)
3697+
/* Do the same thing for the other case. */
3698+
3699+
if (memchr_found_first_cu2 == NULL ||
3700+
start_match > memchr_found_first_cu2)
36893701
{
3690-
pp2 = memchr(start_match, first_cu2, cu2size);
3691-
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
3702+
pp2 = memchr(start_match, first_cu2, searchlength);
3703+
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
36923704
}
36933705

3706+
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3707+
memchr_found_first_cu2;
3708+
3709+
/* Set the start to the end of the subject if neither case was found.
3710+
Otherwise, use the earlier found point. */
3711+
36943712
if (pp1 == NULL)
36953713
start_match = (pp2 == NULL)? end_subject : pp2;
36963714
else
36973715
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3698-
#endif
3716+
3717+
#endif /* 8-bit handling */
36993718
}
37003719

3701-
/* The caseful case */
3720+
/* The caseful case is much simpler. */
37023721

37033722
else
37043723
{

ext/pcre/pcre2lib/pcre2_error.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2016-2019 University of Cambridge
10+
New API code Copyright (c) 2016-2021 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -186,6 +186,7 @@ static const unsigned char compile_error_texts[] =
186186
"script runs require Unicode support, which this version of PCRE2 does not have\0"
187187
"too many capturing groups (maximum 65535)\0"
188188
"atomic assertion expected after (?( or (?(?C)\0"
189+
"\\K is not allowed in lookarounds (but see PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)\0"
189190
;
190191

191192
/* Match-time and UTF error texts are in the same format. */

0 commit comments

Comments
 (0)