From 4391e6a408f8fa969dde364c35b745a4bc5fa33d Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Tue, 14 Feb 2023 10:37:15 +0100 Subject: [PATCH 1/6] core: convert Pattern<'a> into Pattern<H: Haystack> (RFC 2295) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As per RFC 2295, add a Haystack trait describing something that can be searched in and make core::str::Pattern (and related types) generic on that trait. This will allow Pattern to be used for types other than str (most notably OsStr). The change mostly follows the RFC though there are some differences in the Haystack trait. Most notably, instead of separate StartCursor and EndCursor types, there’s a single Cursor type. This eliminates the need for methods converting between the two types of cursors. Conversion from cursor to offset isn’t included either since as far as I can tell it’s not needed. Generic code operating on Haystack doesn’t need a concept of offset. It can operate on cursors instead. Lastly, rather than range_to_self as suggested in the RFC, this commit implements split_at_cursor_unchecked, which simplifies the default implementations of strip_prefix_of and strip_suffix_of. For now leave Pattern, Haystack et al. in core::str::pattern. Since they are no longer str-specific, I’ll move them to core::pattern in a future commit. This one leaves them in place to make the diff smaller. 
Issue: https://github.com/rust-lang/rust/issues/49802 --- library/alloc/src/str.rs | 4 +- library/alloc/src/string.rs | 8 +- library/alloc/tests/str.rs | 4 +- library/core/src/str/iter.rs | 92 +++---- library/core/src/str/mod.rs | 52 ++-- library/core/src/str/pattern.rs | 229 +++++++++++------- .../save-analysis-fail/foo.rs | 6 +- tests/run-make-fulldeps/save-analysis/foo.rs | 6 +- tests/rustdoc/async-fn.rs | 6 +- .../bound/assoc-fn-bound-root-obligation.rs | 2 +- .../assoc-fn-bound-root-obligation.stderr | 2 +- 11 files changed, 229 insertions(+), 182 deletions(-) diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index afbe5cfaf8ef9..c1447fdc967f7 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -268,7 +268,7 @@ impl str { without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn replace<'a, P: Pattern<'a>>(&'a self, from: P, to: &str) -> String { + pub fn replace<'a, P: Pattern<&'a str>>(&'a self, from: P, to: &str) -> String { let mut result = String::new(); let mut last_end = 0; for (start, part) in self.match_indices(from) { @@ -308,7 +308,7 @@ impl str { #[must_use = "this returns the replaced string as a new allocation, \ without modifying the original"] #[stable(feature = "str_replacen", since = "1.16.0")] - pub fn replacen<'a, P: Pattern<'a>>(&'a self, pat: P, to: &str, count: usize) -> String { + pub fn replacen<'a, P: Pattern<&'a str>>(&'a self, pat: P, to: &str, count: usize) -> String { // Hope to reduce the times of re-allocation let mut result = String::with_capacity(32); let mut last_end = 0; diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 2b843647dd510..9d41049d1a81e 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -1371,7 +1371,7 @@ impl String { #[unstable(feature = "string_remove_matches", reason = "new API", issue = "72826")] pub fn remove_matches<'a, P>(&'a mut self, pat: P) where - P: 
for<'x> Pattern<'x>, + P: for<'x> Pattern<&'x str>, { use core::str::pattern::Searcher; @@ -2174,10 +2174,10 @@ impl<'a> Extend> for String { reason = "API not fully fleshed out and ready to be stabilized", issue = "27721" )] -impl<'a, 'b> Pattern<'a> for &'b String { - type Searcher = <&'b str as Pattern<'a>>::Searcher; +impl<'a, 'b> Pattern<&'a str> for &'b String { + type Searcher = <&'b str as Pattern<&'a str>>::Searcher; - fn into_searcher(self, haystack: &'a str) -> <&'b str as Pattern<'a>>::Searcher { + fn into_searcher(self, haystack: &'a str) -> <&'b str as Pattern<&'a str>>::Searcher { self[..].into_searcher(haystack) } diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 4d182be02c9e9..cc2110f5673a0 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1879,7 +1879,7 @@ mod pattern { fn cmp_search_to_vec<'a>( rev: bool, - pat: impl Pattern<'a, Searcher: ReverseSearcher<'a>>, + pat: impl Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, haystack: &'a str, right: Vec, ) { @@ -2143,7 +2143,7 @@ fn different_str_pattern_forwarding_lifetimes() { fn foo<'a, P>(p: P) where - for<'b> &'b P: Pattern<'a>, + for<'b> &'b P: Pattern<&'a str>, { for _ in 0..3 { "asdf".find(&p); diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 95c682f42d0c9..bdb1f1d297af6 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -361,7 +361,7 @@ macro_rules! derive_pattern_clone { (clone $t:ident with |$s:ident| $e:expr) => { impl<'a, P> Clone for $t<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { let $s = self; @@ -374,7 +374,7 @@ macro_rules! derive_pattern_clone { /// This macro generates two public iterator structs /// wrapping a private internal one that makes use of the `Pattern` API. 
/// -/// For all patterns `P: Pattern<'a>` the following items will be +/// For all patterns `P: Pattern<&'a str>` the following items will be /// generated (generics omitted): /// /// struct $forward_iterator($internal_iterator); @@ -434,12 +434,14 @@ macro_rules! generate_pattern_iterators { } => { $(#[$forward_iterator_attribute])* $(#[$common_stability_attribute])* - pub struct $forward_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + pub struct $forward_iterator<'a, P: Pattern<&'a str>>( + pub(super) $internal_iterator<'a, P> + ); $(#[$common_stability_attribute])* impl<'a, P> fmt::Debug for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple(stringify!($forward_iterator)) @@ -449,7 +451,7 @@ macro_rules! generate_pattern_iterators { } $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> Iterator for $forward_iterator<'a, P> { + impl<'a, P: Pattern<&'a str>> Iterator for $forward_iterator<'a, P> { type Item = $iterty; #[inline] @@ -461,7 +463,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Clone for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { $forward_iterator(self.0.clone()) @@ -470,12 +472,14 @@ macro_rules! 
generate_pattern_iterators { $(#[$reverse_iterator_attribute])* $(#[$common_stability_attribute])* - pub struct $reverse_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + pub struct $reverse_iterator<'a, P: Pattern<&'a str>>( + pub(super) $internal_iterator<'a, P> + ); $(#[$common_stability_attribute])* impl<'a, P> fmt::Debug for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple(stringify!($reverse_iterator)) @@ -487,7 +491,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Iterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { type Item = $iterty; @@ -500,7 +504,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Clone for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { $reverse_iterator(self.0.clone()) @@ -508,12 +512,12 @@ macro_rules! generate_pattern_iterators { } #[stable(feature = "fused", since = "1.26.0")] - impl<'a, P: Pattern<'a>> FusedIterator for $forward_iterator<'a, P> {} + impl<'a, P: Pattern<&'a str>> FusedIterator for $forward_iterator<'a, P> {} #[stable(feature = "fused", since = "1.26.0")] impl<'a, P> FusedIterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, {} generate_pattern_iterators!($($t)* with $(#[$common_stability_attribute])*, @@ -528,7 +532,7 @@ macro_rules! 
generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> DoubleEndedIterator for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { #[inline] fn next_back(&mut self) -> Option<$iterty> { @@ -539,7 +543,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> DoubleEndedIterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { #[inline] fn next_back(&mut self) -> Option<$iterty> { @@ -559,7 +563,7 @@ derive_pattern_clone! { with |s| SplitInternal { matcher: s.matcher.clone(), ..*s } } -pub(super) struct SplitInternal<'a, P: Pattern<'a>> { +pub(super) struct SplitInternal<'a, P: Pattern<&'a str>> { pub(super) start: usize, pub(super) end: usize, pub(super) matcher: P::Searcher, @@ -569,7 +573,7 @@ pub(super) struct SplitInternal<'a, P: Pattern<'a>> { impl<'a, P> fmt::Debug for SplitInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SplitInternal") @@ -582,7 +586,7 @@ where } } -impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitInternal<'a, P> { #[inline] fn get_end(&mut self) -> Option<&'a str> { if !self.finished { @@ -639,7 +643,7 @@ impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { if self.finished { return None; @@ -676,7 +680,7 @@ impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { #[inline] fn next_back_inclusive(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { if self.finished { return None; @@ -746,7 +750,7 @@ generate_pattern_iterators! 
{ delegate double ended; } -impl<'a, P: Pattern<'a>> Split<'a, P> { +impl<'a, P: Pattern<&'a str>> Split<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -769,7 +773,7 @@ impl<'a, P: Pattern<'a>> Split<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplit<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplit<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -810,7 +814,7 @@ generate_pattern_iterators! { delegate double ended; } -impl<'a, P: Pattern<'a>> SplitTerminator<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitTerminator<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -833,7 +837,7 @@ impl<'a, P: Pattern<'a>> SplitTerminator<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplitTerminator<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplitTerminator<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -861,7 +865,7 @@ derive_pattern_clone! 
{ with |s| SplitNInternal { iter: s.iter.clone(), ..*s } } -pub(super) struct SplitNInternal<'a, P: Pattern<'a>> { +pub(super) struct SplitNInternal<'a, P: Pattern<&'a str>> { pub(super) iter: SplitInternal<'a, P>, /// The number of splits remaining pub(super) count: usize, @@ -869,7 +873,7 @@ pub(super) struct SplitNInternal<'a, P: Pattern<'a>> { impl<'a, P> fmt::Debug for SplitNInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SplitNInternal") @@ -879,7 +883,7 @@ where } } -impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitNInternal<'a, P> { #[inline] fn next(&mut self) -> Option<&'a str> { match self.count { @@ -898,7 +902,7 @@ impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { match self.count { 0 => None, @@ -937,7 +941,7 @@ generate_pattern_iterators! { delegate single ended; } -impl<'a, P: Pattern<'a>> SplitN<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitN<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -960,7 +964,7 @@ impl<'a, P: Pattern<'a>> SplitN<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplitN<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplitN<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -988,18 +992,18 @@ derive_pattern_clone! 
{ with |s| MatchIndicesInternal(s.0.clone()) } -pub(super) struct MatchIndicesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); +pub(super) struct MatchIndicesInternal<'a, P: Pattern<&'a str>>(pub(super) P::Searcher); impl<'a, P> fmt::Debug for MatchIndicesInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("MatchIndicesInternal").field(&self.0).finish() } } -impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> MatchIndicesInternal<'a, P> { #[inline] fn next(&mut self) -> Option<(usize, &'a str)> { self.0 @@ -1011,7 +1015,7 @@ impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<(usize, &'a str)> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { self.0 .next_match_back() @@ -1043,18 +1047,18 @@ derive_pattern_clone! { with |s| MatchesInternal(s.0.clone()) } -pub(super) struct MatchesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); +pub(super) struct MatchesInternal<'a, P: Pattern<&'a str>>(pub(super) P::Searcher); impl<'a, P> fmt::Debug for MatchesInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("MatchesInternal").field(&self.0).finish() } } -impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> MatchesInternal<'a, P> { #[inline] fn next(&mut self) -> Option<&'a str> { // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. @@ -1067,7 +1071,7 @@ impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. 
self.0.next_match_back().map(|(a, b)| unsafe { @@ -1213,7 +1217,7 @@ pub struct SplitAsciiWhitespace<'a> { /// /// [`split_inclusive`]: str::split_inclusive #[stable(feature = "split_inclusive", since = "1.51.0")] -pub struct SplitInclusive<'a, P: Pattern<'a>>(pub(super) SplitInternal<'a, P>); +pub struct SplitInclusive<'a, P: Pattern<&'a str>>(pub(super) SplitInternal<'a, P>); #[stable(feature = "split_whitespace", since = "1.1.0")] impl<'a> Iterator for SplitWhitespace<'a> { @@ -1335,7 +1339,7 @@ impl<'a> SplitAsciiWhitespace<'a> { } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str>> Iterator for SplitInclusive<'a, P> { type Item = &'a str; #[inline] @@ -1345,7 +1349,7 @@ impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SplitInclusive").field("0", &self.0).finish() } @@ -1353,14 +1357,14 @@ impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, // FIXME(#26925) Remove in favor of `#[derive(Clone)]` #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: Clone>> Clone for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str, Searcher: Clone>> Clone for SplitInclusive<'a, P> { fn clone(&self) -> Self { SplitInclusive(self.0.clone()) } } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator +impl<'a, P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>> DoubleEndedIterator for SplitInclusive<'a, P> { #[inline] @@ -1370,9 +1374,9 @@ impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator 
} #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a>> FusedIterator for SplitInclusive<'a, P> {} +impl<'a, P: Pattern<&'a str>> FusedIterator for SplitInclusive<'a, P> {} -impl<'a, P: Pattern<'a>> SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitInclusive<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index ab2f8520ecb33..d2ad655b42af8 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -1067,7 +1067,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pub fn contains<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> bool { pat.is_contained_in(self) } @@ -1093,7 +1093,7 @@ impl str { /// assert!(!bananas.starts_with("nana")); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - pub fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pub fn starts_with<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> bool { pat.is_prefix_of(self) } @@ -1121,7 +1121,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] pub fn ends_with<'a, P>(&'a self, pat: P) -> bool where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { pat.is_suffix_of(self) } @@ -1170,7 +1170,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { + pub fn find<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Option { pat.into_searcher(self).next_match().map(|(i, _)| i) } @@ -1218,7 +1218,7 @@ impl str { #[inline] pub fn rfind<'a, P>(&'a self, pat: P) -> Option where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { pat.into_searcher(self).next_match_back().map(|(i, _)| i) } @@ -1338,7 +1338,7 @@ impl str { /// 
[`split_whitespace`]: str::split_whitespace #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> { + pub fn split<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Split<'a, P> { Split(SplitInternal { start: 0, end: self.len(), @@ -1378,7 +1378,7 @@ impl str { /// ``` #[stable(feature = "split_inclusive", since = "1.51.0")] #[inline] - pub fn split_inclusive<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitInclusive<'a, P> { + pub fn split_inclusive<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> SplitInclusive<'a, P> { SplitInclusive(SplitInternal { start: 0, end: self.len(), @@ -1435,7 +1435,7 @@ impl str { #[inline] pub fn rsplit<'a, P>(&'a self, pat: P) -> RSplit<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplit(self.split(pat).0) } @@ -1484,7 +1484,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> { + pub fn split_terminator<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> SplitTerminator<'a, P> { SplitTerminator(SplitInternal { allow_trailing_empty: false, ..self.split(pat).0 }) } @@ -1532,7 +1532,7 @@ impl str { #[inline] pub fn rsplit_terminator<'a, P>(&'a self, pat: P) -> RSplitTerminator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplitTerminator(self.split_terminator(pat).0) } @@ -1585,7 +1585,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn splitn<'a, P: Pattern<'a>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { + pub fn splitn<'a, P: Pattern<&'a str>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { SplitN(SplitNInternal { iter: self.split(pat).0, count: n }) } @@ -1636,7 +1636,7 @@ impl str { #[inline] pub fn rsplitn<'a, P>(&'a self, n: usize, pat: P) -> RSplitN<'a, P> where - P: 
Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplitN(self.splitn(n, pat).0) } @@ -1654,7 +1654,7 @@ impl str { /// ``` #[stable(feature = "str_split_once", since = "1.52.0")] #[inline] - pub fn split_once<'a, P: Pattern<'a>>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> { + pub fn split_once<'a, P: Pattern<&'a str>>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> { let (start, end) = delimiter.into_searcher(self).next_match()?; // SAFETY: `Searcher` is known to return valid indices. unsafe { Some((self.get_unchecked(..start), self.get_unchecked(end..))) } @@ -1674,7 +1674,7 @@ impl str { #[inline] pub fn rsplit_once<'a, P>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { let (start, end) = delimiter.into_searcher(self).next_match_back()?; // SAFETY: `Searcher` is known to return valid indices. @@ -1714,7 +1714,7 @@ impl str { /// ``` #[stable(feature = "str_matches", since = "1.2.0")] #[inline] - pub fn matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> Matches<'a, P> { + pub fn matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Matches<'a, P> { Matches(MatchesInternal(pat.into_searcher(self))) } @@ -1752,7 +1752,7 @@ impl str { #[inline] pub fn rmatches<'a, P>(&'a self, pat: P) -> RMatches<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RMatches(self.matches(pat).0) } @@ -1796,7 +1796,7 @@ impl str { /// ``` #[stable(feature = "str_match_indices", since = "1.5.0")] #[inline] - pub fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { + pub fn match_indices<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> MatchIndices<'a, P> { MatchIndices(MatchIndicesInternal(pat.into_searcher(self))) } @@ -1840,7 +1840,7 @@ impl str { #[inline] pub fn rmatch_indices<'a, P>(&'a self, pat: P) 
-> RMatchIndices<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RMatchIndices(self.match_indices(pat).0) } @@ -2057,7 +2057,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] pub fn trim_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { let mut i = 0; let mut j = 0; @@ -2104,7 +2104,7 @@ impl str { #[must_use = "this returns the trimmed string as a new slice, \ without modifying the original"] #[stable(feature = "trim_direction", since = "1.30.0")] - pub fn trim_start_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + pub fn trim_start_matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> &'a str { let mut i = self.len(); let mut matcher = pat.into_searcher(self); if let Some((a, _)) = matcher.next_reject() { @@ -2137,7 +2137,7 @@ impl str { #[must_use = "this returns the remaining substring as a new slice, \ without modifying the original"] #[stable(feature = "str_strip", since = "1.45.0")] - pub fn strip_prefix<'a, P: Pattern<'a>>(&'a self, prefix: P) -> Option<&'a str> { + pub fn strip_prefix<'a, P: Pattern<&'a str>>(&'a self, prefix: P) -> Option<&'a str> { prefix.strip_prefix_of(self) } @@ -2166,8 +2166,8 @@ impl str { #[stable(feature = "str_strip", since = "1.45.0")] pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a str> where - P: Pattern<'a>, -

<P as Pattern<'a>>::Searcher: ReverseSearcher<'a>, + P: Pattern<&'a str>, +

<P as Pattern<&'a str>>::Searcher: ReverseSearcher<&'a str>, { suffix.strip_suffix_of(self) } @@ -2210,7 +2210,7 @@ impl str { #[stable(feature = "trim_direction", since = "1.30.0")] pub fn trim_end_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { let mut j = 0; let mut matcher = pat.into_searcher(self); @@ -2254,7 +2254,7 @@ impl str { note = "superseded by `trim_start_matches`", suggestion = "trim_start_matches" )] - pub fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + pub fn trim_left_matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> &'a str { self.trim_start_matches(pat) } @@ -2299,7 +2299,7 @@ impl str { )] pub fn trim_right_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { self.trim_end_matches(pat) } diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index 19da6d2fbecbc..b891fd9a4d337 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -45,13 +45,13 @@ use crate::slice::memchr; // Pattern -/// A string pattern. +/// A pattern which can be matched against a [`Haystack`]. /// -/// A `Pattern<'a>` expresses that the implementing type -/// can be used as a string pattern for searching in a [`&'a str`][str]. +/// A `Pattern<H>` expresses that the implementing type can be used as a pattern +/// for searching in a `H`. /// -/// For example, both `'a'` and `"aa"` are patterns that -/// would match at index `1` in the string `"baaaab"`. +/// For example, character `'a'` and string `"aa"` are patterns that would match +/// at index `1` in the string `"baaaab"`. 
/// /// The trait itself acts as a builder for an associated /// [`Searcher`] type, which does the actual work of finding @@ -96,46 +96,49 @@ use crate::slice::memchr; /// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); /// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); /// ``` -pub trait Pattern<'a>: Sized { +pub trait Pattern: Sized { /// Associated searcher for this pattern - type Searcher: Searcher<'a>; + type Searcher: Searcher; /// Constructs the associated searcher from /// `self` and the `haystack` to search in. - fn into_searcher(self, haystack: &'a str) -> Self::Searcher; + fn into_searcher(self, haystack: H) -> Self::Searcher; /// Checks whether the pattern matches anywhere in the haystack #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { + fn is_contained_in(self, haystack: H) -> bool { self.into_searcher(haystack).next_match().is_some() } /// Checks whether the pattern matches at the front of the haystack #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _)) + fn is_prefix_of(self, haystack: H) -> bool { + matches!( + self.into_searcher(haystack).next(), + SearchStep::Match(start, _) if start == haystack.cursor_at_front() + ) } /// Checks whether the pattern matches at the back of the haystack #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool - where - Self::Searcher: ReverseSearcher<'a>, - { - matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j) + fn is_suffix_of(self, haystack: H) -> bool + where Self::Searcher: ReverseSearcher { + matches!( + self.into_searcher(haystack).next_back(), + SearchStep::Match(_, end) if end == haystack.cursor_at_back() + ) } /// Removes the pattern from the front of haystack, if it matches. 
#[inline] - fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { - if let SearchStep::Match(start, len) = self.into_searcher(haystack).next() { - debug_assert_eq!( - start, 0, - "The first search step from Searcher \ - must include the first character" - ); + fn strip_prefix_of(self, haystack: H) -> Option { + if let SearchStep::Match(start, end) = self.into_searcher(haystack).next() { + // This cannot be debug_assert_eq because StartCursor isn’t Debug. + debug_assert!(start == haystack.cursor_at_front(), + "The first search step from Searcher \ + must include the first character"); // SAFETY: `Searcher` is known to return valid indices. - unsafe { Some(haystack.get_unchecked(len..)) } + Some(unsafe { haystack.split_at_cursor_unchecked(end) }.1) } else { None } @@ -143,39 +146,60 @@ pub trait Pattern<'a>: Sized { /// Removes the pattern from the back of haystack, if it matches. #[inline] - fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> - where - Self::Searcher: ReverseSearcher<'a>, - { + fn strip_suffix_of(self, haystack: H) -> Option + where Self::Searcher: ReverseSearcher { if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() { - debug_assert_eq!( - end, - haystack.len(), - "The first search step from ReverseSearcher \ - must include the last character" - ); + // This cannot be debug_assert_eq because StartCursor isn’t Debug. + debug_assert!(end == haystack.cursor_at_back(), + "The first search step from ReverseSearcher \ + must include the last character"); // SAFETY: `Searcher` is known to return valid indices. - unsafe { Some(haystack.get_unchecked(..start)) } + Some(unsafe { haystack.split_at_cursor_unchecked(start) }.0) } else { None } } } +// Haystack + +/// A type which can be searched in using a [`Pattern`]. +/// +/// The trait is used in combination with [`Pattern`] trait to express a pattern +/// that can be used to search for elements in given haystack. 
+pub trait Haystack: Sized + Copy { + /// A cursor representing position in the haystack or its end. + type Cursor: Copy + PartialOrd; + + /// Returns cursor pointing at the beginning of the haystack. + fn cursor_at_front(&self) -> Self::Cursor; + + /// Returns cursor pointing at the end of the haystack. + fn cursor_at_back(&self) -> Self::Cursor; + + /// Splits haystack into two at given cursor position. + /// + /// Note that splitting a haystack isn’t guaranteed to preserve total + /// length. That is, each separate part’s length may be longer than length + /// of the original haystack. This property is preserved for `&str` and + /// `&[T]` haystacks but not for `&OsStr`. + unsafe fn split_at_cursor_unchecked(self, cursor: Self::Cursor) -> (Self, Self); +} + // Searcher /// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. #[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub enum SearchStep { +pub enum SearchStep { /// Expresses that a match of the pattern has been found at /// `haystack[a..b]`. - Match(usize, usize), + Match(T, T), /// Expresses that `haystack[a..b]` has been rejected as a possible match /// of the pattern. /// /// Note that there might be more than one `Reject` between two `Match`es, /// there is no requirement for them to be combined into one. - Reject(usize, usize), + Reject(T, T), /// Expresses that every byte of the haystack has been visited, ending /// the iteration. Done, @@ -193,11 +217,11 @@ pub enum SearchStep { /// [`next()`][Searcher::next] methods are required to lie on valid utf8 /// boundaries in the haystack. This enables consumers of this trait to /// slice the haystack without additional runtime checks. -pub unsafe trait Searcher<'a> { +pub unsafe trait Searcher { /// Getter for the underlying string to be searched in /// /// Will always return the same [`&str`][str]. - fn haystack(&self) -> &'a str; + fn haystack(&self) -> H; /// Performs the next search step starting from the front. 
/// @@ -220,7 +244,7 @@ pub unsafe trait Searcher<'a> { /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` /// might produce the stream /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` - fn next(&mut self) -> SearchStep; + fn next(&mut self) -> SearchStep; /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next]. /// @@ -229,7 +253,7 @@ pub unsafe trait Searcher<'a> { /// `(start_match, end_match)`, where start_match is the index of where /// the match begins, and end_match is the index after the end of the match. #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { + fn next_match(&mut self) -> Option<(H::Cursor, H::Cursor)> { loop { match self.next() { SearchStep::Match(a, b) => return Some((a, b)), @@ -245,7 +269,7 @@ pub unsafe trait Searcher<'a> { /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges /// of this and [`next_match`][Searcher::next_match] will overlap. #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { + fn next_reject(&mut self) -> Option<(H::Cursor, H::Cursor)> { loop { match self.next() { SearchStep::Reject(a, b) => return Some((a, b)), @@ -270,7 +294,7 @@ pub unsafe trait Searcher<'a> { /// /// For the reason why this trait is marked unsafe, see the /// parent trait [`Searcher`]. -pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { +pub unsafe trait ReverseSearcher: Searcher { /// Performs the next search step starting from the back. /// /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` @@ -292,12 +316,12 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` /// might produce the stream /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`. - fn next_back(&mut self) -> SearchStep; + fn next_back(&mut self) -> SearchStep; /// Finds the next [`Match`][SearchStep::Match] result. /// See [`next_back()`][ReverseSearcher::next_back]. 
#[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { + fn next_match_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { loop { match self.next_back() { SearchStep::Match(a, b) => return Some((a, b)), @@ -310,7 +334,7 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// Finds the next [`Reject`][SearchStep::Reject] result. /// See [`next_back()`][ReverseSearcher::next_back]. #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)> { + fn next_reject_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { loop { match self.next_back() { SearchStep::Reject(a, b) => return Some((a, b)), @@ -342,16 +366,35 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// `(&str)::Searcher` is not a `DoubleEndedSearcher` because /// the pattern `"aa"` in the haystack `"aaa"` matches as either /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. -pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} +pub trait DoubleEndedSearcher: ReverseSearcher {} + +///////////////////////////////////////////////////////////////////////////// +// Impl for Haystack +///////////////////////////////////////////////////////////////////////////// + +impl<'a> Haystack for &'a str { + type Cursor = usize; + + #[inline(always)] + fn cursor_at_front(&self) -> usize { 0 } + #[inline(always)] + fn cursor_at_back(&self) -> usize { self.len() } + + #[inline(always)] + unsafe fn split_at_cursor_unchecked(self, cursor: usize) -> (Self, Self) { + // SAFETY: Caller promises position is a character boundary. + unsafe { (self.get_unchecked(..cursor), self.get_unchecked(cursor..)) } + } +} ///////////////////////////////////////////////////////////////////////////// // Impl for char ///////////////////////////////////////////////////////////////////////////// -/// Associated type for `>::Searcher`. +/// Associated type for `>::Searcher`. 
#[derive(Clone, Debug)] -pub struct CharSearcher<'a> { - haystack: &'a str, +pub struct CharSearcher { + haystack: H, // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` // This invariant can be broken *within* next_match and next_match_back, however // they must exit with fingers on valid code point boundaries. @@ -359,12 +402,12 @@ pub struct CharSearcher<'a> { /// Imagine that it exists before the byte at its index, i.e. /// `haystack[finger]` is the first byte of the slice we must inspect during /// forward searching - finger: usize, + finger: H::Cursor, /// `finger_back` is the current byte index of the reverse search. /// Imagine that it exists after the byte at its index, i.e. /// haystack[finger_back - 1] is the last byte of the slice we must inspect during /// forward searching (and thus the first byte to be inspected when calling next_back()). - finger_back: usize, + finger_back: H::Cursor, /// The character being searched for needle: char, @@ -375,7 +418,7 @@ pub struct CharSearcher<'a> { utf8_encoded: [u8; 4], } -unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { +unsafe impl<'a> Searcher<&'a str> for CharSearcher<&'a str> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -453,7 +496,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { // let next_reject use the default implementation from the Searcher trait } -unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { +unsafe impl<'a> ReverseSearcher<&'a str> for CharSearcher<&'a str> { #[inline] fn next_back(&mut self) -> SearchStep { let old_finger = self.finger_back; @@ -527,7 +570,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { // let next_reject_back use the default implementation from the Searcher trait } -impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} +impl<'a> DoubleEndedSearcher<&'a str> for CharSearcher<&'a str> {} /// Searches for chars that are equal to a given [`char`]. 
/// @@ -536,8 +579,8 @@ impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} /// ``` /// assert_eq!("Hello world".find('o'), Some(4)); /// ``` -impl<'a> Pattern<'a> for char { - type Searcher = CharSearcher<'a>; +impl<'a> Pattern<&'a str> for char { + type Searcher = CharSearcher<&'a str>; #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { @@ -576,7 +619,7 @@ impl<'a> Pattern<'a> for char { #[inline] fn is_suffix_of(self, haystack: &'a str) -> bool where - Self::Searcher: ReverseSearcher<'a>, + Self::Searcher: ReverseSearcher<&'a str>, { self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack) } @@ -584,7 +627,7 @@ impl<'a> Pattern<'a> for char { #[inline] fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> where - Self::Searcher: ReverseSearcher<'a>, + Self::Searcher: ReverseSearcher<&'a str>, { self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack) } @@ -639,7 +682,7 @@ struct MultiCharEqSearcher<'a, C: MultiCharEq> { char_indices: super::CharIndices<'a>, } -impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { +impl<'a, C: MultiCharEq> Pattern<&'a str> for MultiCharEqPattern { type Searcher = MultiCharEqSearcher<'a, C>; #[inline] @@ -648,7 +691,7 @@ impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { } } -unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> Searcher<&'a str> for MultiCharEqSearcher<'a, C> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -673,7 +716,7 @@ unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { } } -unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> ReverseSearcher<&'a str> for MultiCharEqSearcher<'a, C> { #[inline] fn next_back(&mut self) -> SearchStep { let s = &mut self.char_indices; @@ -693,7 +736,7 @@ unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, } } -impl<'a, C: MultiCharEq> 
DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} +impl<'a, C: MultiCharEq> DoubleEndedSearcher<&'a str> for MultiCharEqSearcher<'a, C> {} ///////////////////////////////////////////////////////////////////////////// @@ -724,7 +767,7 @@ macro_rules! pattern_methods { #[inline] fn is_suffix_of(self, haystack: &'a str) -> bool where - $t: ReverseSearcher<'a>, + $t: ReverseSearcher<&'a str>, { ($pmap)(self).is_suffix_of(haystack) } @@ -732,7 +775,7 @@ macro_rules! pattern_methods { #[inline] fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> where - $t: ReverseSearcher<'a>, + $t: ReverseSearcher<&'a str>, { ($pmap)(self).strip_suffix_of(haystack) } @@ -774,16 +817,16 @@ macro_rules! searcher_methods { }; } -/// Associated type for `<[char; N] as Pattern<'a>>::Searcher`. +/// Associated type for `<[char; N] as Pattern<&'a str>>::Searcher`. #[derive(Clone, Debug)] pub struct CharArraySearcher<'a, const N: usize>( - as Pattern<'a>>::Searcher, + as Pattern<&'a str>>::Searcher, ); -/// Associated type for `<&[char; N] as Pattern<'a>>::Searcher`. +/// Associated type for `<&[char; N] as Pattern<&'a str>>::Searcher`. #[derive(Clone, Debug)] pub struct CharArrayRefSearcher<'a, 'b, const N: usize>( - as Pattern<'a>>::Searcher, + as Pattern<&'a str>>::Searcher, ); /// Searches for chars that are equal to any of the [`char`]s in the array. 
@@ -794,15 +837,15 @@ pub struct CharArrayRefSearcher<'a, 'b, const N: usize>( /// assert_eq!("Hello world".find(['l', 'l']), Some(2)); /// assert_eq!("Hello world".find(['l', 'l']), Some(2)); /// ``` -impl<'a, const N: usize> Pattern<'a> for [char; N] { +impl<'a, const N: usize> Pattern<&'a str> for [char; N] { pattern_methods!(CharArraySearcher<'a, N>, MultiCharEqPattern, CharArraySearcher); } -unsafe impl<'a, const N: usize> Searcher<'a> for CharArraySearcher<'a, N> { +unsafe impl<'a, const N: usize> Searcher<&'a str> for CharArraySearcher<'a, N> { searcher_methods!(forward); } -unsafe impl<'a, const N: usize> ReverseSearcher<'a> for CharArraySearcher<'a, N> { +unsafe impl<'a, const N: usize> ReverseSearcher<&'a str> for CharArraySearcher<'a, N> { searcher_methods!(reverse); } @@ -814,15 +857,15 @@ unsafe impl<'a, const N: usize> ReverseSearcher<'a> for CharArraySearcher<'a, N> /// assert_eq!("Hello world".find(&['l', 'l']), Some(2)); /// assert_eq!("Hello world".find(&['l', 'l']), Some(2)); /// ``` -impl<'a, 'b, const N: usize> Pattern<'a> for &'b [char; N] { +impl<'a, 'b, const N: usize> Pattern<&'a str> for &'b [char; N] { pattern_methods!(CharArrayRefSearcher<'a, 'b, N>, MultiCharEqPattern, CharArrayRefSearcher); } -unsafe impl<'a, 'b, const N: usize> Searcher<'a> for CharArrayRefSearcher<'a, 'b, N> { +unsafe impl<'a, 'b, const N: usize> Searcher<&'a str> for CharArrayRefSearcher<'a, 'b, N> { searcher_methods!(forward); } -unsafe impl<'a, 'b, const N: usize> ReverseSearcher<'a> for CharArrayRefSearcher<'a, 'b, N> { +unsafe impl<'a, 'b, const N: usize> ReverseSearcher<&'a str> for CharArrayRefSearcher<'a, 'b, N> { searcher_methods!(reverse); } @@ -832,19 +875,19 @@ unsafe impl<'a, 'b, const N: usize> ReverseSearcher<'a> for CharArrayRefSearcher // Todo: Change / Remove due to ambiguity in meaning. -/// Associated type for `<&[char] as Pattern<'a>>::Searcher`. +/// Associated type for `<&[char] as Pattern<&'a str>>::Searcher`. 
#[derive(Clone, Debug)] -pub struct CharSliceSearcher<'a, 'b>( as Pattern<'a>>::Searcher); +pub struct CharSliceSearcher<'a, 'b>( as Pattern<&'a str>>::Searcher); -unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { +unsafe impl<'a, 'b> Searcher<&'a str> for CharSliceSearcher<'a, 'b> { searcher_methods!(forward); } -unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> { +unsafe impl<'a, 'b> ReverseSearcher<&'a str> for CharSliceSearcher<'a, 'b> { searcher_methods!(reverse); } -impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} +impl<'a, 'b> DoubleEndedSearcher<&'a str> for CharSliceSearcher<'a, 'b> {} /// Searches for chars that are equal to any of the [`char`]s in the slice. /// @@ -854,7 +897,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} /// assert_eq!("Hello world".find(&['l', 'l'] as &[_]), Some(2)); /// assert_eq!("Hello world".find(&['l', 'l'][..]), Some(2)); /// ``` -impl<'a, 'b> Pattern<'a> for &'b [char] { +impl<'a, 'b> Pattern<&'a str> for &'b [char] { pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); } @@ -862,9 +905,9 @@ impl<'a, 'b> Pattern<'a> for &'b [char] { // Impl for F: FnMut(char) -> bool ///////////////////////////////////////////////////////////////////////////// -/// Associated type for `>::Searcher`. +/// Associated type for `>::Searcher`. 
#[derive(Clone)] -pub struct CharPredicateSearcher<'a, F>( as Pattern<'a>>::Searcher) +pub struct CharPredicateSearcher<'a, F>( as Pattern<&'a str>>::Searcher) where F: FnMut(char) -> bool; @@ -879,21 +922,21 @@ where .finish() } } -unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F> +unsafe impl<'a, F> Searcher<&'a str> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool, { searcher_methods!(forward); } -unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F> +unsafe impl<'a, F> ReverseSearcher<&'a str> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool, { searcher_methods!(reverse); } -impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {} +impl<'a, F> DoubleEndedSearcher<&'a str> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {} /// Searches for [`char`]s that match the given predicate. /// @@ -903,7 +946,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: Fn /// assert_eq!("Hello world".find(char::is_uppercase), Some(0)); /// assert_eq!("Hello world".find(|c| "aeiou".contains(c)), Some(1)); /// ``` -impl<'a, F> Pattern<'a> for F +impl<'a, F> Pattern<&'a str> for F where F: FnMut(char) -> bool, { @@ -915,7 +958,7 @@ where ///////////////////////////////////////////////////////////////////////////// /// Delegates to the `&str` impl. 
-impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { +impl<'a, 'b, 'c> Pattern<&'a str> for &'c &'b str { pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); } @@ -933,7 +976,7 @@ impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { /// ``` /// assert_eq!("Hello world".find("world"), Some(6)); /// ``` -impl<'a, 'b> Pattern<'a> for &'b str { +impl<'a, 'b> Pattern<&'a str> for &'b str { type Searcher = StrSearcher<'a, 'b>; #[inline] @@ -1008,7 +1051,7 @@ impl<'a, 'b> Pattern<'a> for &'b str { ///////////////////////////////////////////////////////////////////////////// #[derive(Clone, Debug)] -/// Associated type for `<&str as Pattern<'a>>::Searcher`. +/// Associated type for `<&str as Pattern<&'a str>>::Searcher`. pub struct StrSearcher<'a, 'b> { haystack: &'a str, needle: &'b str, @@ -1059,7 +1102,7 @@ impl<'a, 'b> StrSearcher<'a, 'b> { } } -unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { +unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -1149,7 +1192,7 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { } } -unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { +unsafe impl<'a, 'b> ReverseSearcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn next_back(&mut self) -> SearchStep { match self.searcher { diff --git a/tests/run-make-fulldeps/save-analysis-fail/foo.rs b/tests/run-make-fulldeps/save-analysis-fail/foo.rs index c5a70605e04cb..dccf83f6ec3a1 100644 --- a/tests/run-make-fulldeps/save-analysis-fail/foo.rs +++ b/tests/run-make-fulldeps/save-analysis-fail/foo.rs @@ -403,17 +403,17 @@ impl Iterator for nofields { } } -trait Pattern<'a> { +trait Pattern<&'a str> { type Searcher; } struct CharEqPattern; -impl<'a> Pattern<'a> for CharEqPattern { +impl<'a> Pattern<&'a str> for CharEqPattern { type Searcher = CharEqPattern; } -struct CharSearcher<'a>(>::Searcher); +struct CharSearcher<'a>(>::Searcher); pub trait Error {} diff --git 
a/tests/run-make-fulldeps/save-analysis/foo.rs b/tests/run-make-fulldeps/save-analysis/foo.rs index 384589de3b480..5c6fe02cbdc12 100644 --- a/tests/run-make-fulldeps/save-analysis/foo.rs +++ b/tests/run-make-fulldeps/save-analysis/foo.rs @@ -402,17 +402,17 @@ impl Iterator for nofields { } } -trait Pattern<'a> { +trait Pattern<&'a str> { type Searcher; } struct CharEqPattern; -impl<'a> Pattern<'a> for CharEqPattern { +impl<'a> Pattern<&'a str> for CharEqPattern { type Searcher = CharEqPattern; } -struct CharSearcher<'a>(<CharEqPattern as Pattern<'a>>::Searcher); +struct CharSearcher<'a>(<CharEqPattern as Pattern<&'a str>>::Searcher); pub trait Error {} diff --git a/tests/rustdoc/async-fn.rs b/tests/rustdoc/async-fn.rs index 70bcbcb6ff44a..3f641473d308a 100644 --- a/tests/rustdoc/async-fn.rs +++ b/tests/rustdoc/async-fn.rs @@ -46,7 +46,7 @@ impl Foo { pub async fn mut_self(mut self, mut first: usize) {} } -pub trait Pattern<'a> {} +pub trait Pattern<&'a str> {} pub trait Trait {} // @has async_fn/fn.const_generics.html @@ -91,5 +91,5 @@ impl Foo { // @has - '//pre[@class="rust item-decl"]' "pub async fn named<'a, 'b>(foo: &'a str) -> &'b str" pub async fn named<'a, 'b>(foo: &'a str) -> &'b str {} // @has async_fn/fn.named_trait.html -// @has - '//pre[@class="rust item-decl"]' "pub async fn named_trait<'a, 'b>(foo: impl Pattern<'a>) -> impl Pattern<'b>" -pub async fn named_trait<'a, 'b>(foo: impl Pattern<'a>) -> impl Pattern<'b> {} +// @has - '//pre[@class="rust item-decl"]' "pub async fn named_trait<'a, 'b>(foo: impl Pattern<&'a str>) -> impl Pattern<'b>" +pub async fn named_trait<'a, 'b>(foo: impl Pattern<&'a str>) -> impl Pattern<'b> {} diff --git a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs index f9a9347641143..8a047a082c4a4 100644 --- a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs +++ b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs @@ -3,7 +3,7 @@ fn strip_lf(s: &str)
-> &str { //~^ ERROR expected a `FnMut<(char,)>` closure, found `u8` //~| NOTE expected an `FnMut<(char,)>` closure, found `u8` //~| HELP the trait `FnMut<(char,)>` is not implemented for `u8` - //~| HELP the following other types implement trait `Pattern<'a>`: + //~| HELP the following other types implement trait `Pattern<&'a str>`: //~| NOTE required for `u8` to implement `Pattern<'_>` } diff --git a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr index ce9ab2d811ae1..e97aaa6834309 100644 --- a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr +++ b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr @@ -5,7 +5,7 @@ LL | s.strip_suffix(b'\n').unwrap_or(s) | ^^^^^^^^^^^^ expected an `FnMut<(char,)>` closure, found `u8` | = help: the trait `FnMut<(char,)>` is not implemented for `u8` - = help: the following other types implement trait `Pattern<'a>`: + = help: the following other types implement trait `Pattern<&'a str>`: &'b String &'b [char; N] &'b [char] From 13fbed44b3a2d04c206d2d7a182477fcc3107446 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Tue, 14 Feb 2023 22:34:37 +0100 Subject: [PATCH 2/6] core: move Pattern et al to core::pattern module (RFC 2295) Pattern is no longer str-specific, so move it from core::str::pattern module to a new core::pattern module. This introduces no changes in behaviour or implementation. Just moves stuff around and adjusts documentation. 
Issue: https://github.com/rust-lang/rust/issues/49802 --- library/alloc/src/str.rs | 4 +- library/alloc/src/string.rs | 4 +- library/alloc/tests/str.rs | 8 +- library/core/src/lib.rs | 1 + library/core/src/pattern.rs | 349 ++++++++++++++++++++++++++++++ library/core/src/str/iter.rs | 3 +- library/core/src/str/mod.rs | 4 +- library/core/src/str/pattern.rs | 367 +++----------------------------- library/core/tests/pattern.rs | 2 +- library/std/src/lib.rs | 2 + 10 files changed, 389 insertions(+), 355 deletions(-) create mode 100644 library/core/src/pattern.rs diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index c1447fdc967f7..27d14d4c63b6e 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -11,7 +11,7 @@ use core::borrow::{Borrow, BorrowMut}; use core::iter::FusedIterator; use core::mem; use core::ptr; -use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; +use core::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use core::unicode::conversions; use crate::borrow::ToOwned; @@ -20,8 +20,6 @@ use crate::slice::{Concat, Join, SliceIndex}; use crate::string::String; use crate::vec::Vec; -#[stable(feature = "rust1", since = "1.0.0")] -pub use core::str::pattern; #[stable(feature = "encode_utf16", since = "1.8.0")] pub use core::str::EncodeUtf16; #[stable(feature = "split_ascii_whitespace", since = "1.34.0")] diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 9d41049d1a81e..ea4da19bce11a 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -57,7 +57,7 @@ use core::ops::Bound::{Excluded, Included, Unbounded}; use core::ops::{self, Index, IndexMut, Range, RangeBounds}; use core::ptr; use core::slice; -use core::str::pattern::Pattern; +use core::pattern::Pattern; #[cfg(not(no_global_oom_handling))] use core::str::Utf8Chunks; @@ -1373,7 +1373,7 @@ impl String { where P: for<'x> Pattern<&'x str>, { - use
core::str::pattern::Searcher; + use core::pattern::Searcher; let rejections = { let mut searcher = pat.into_searcher(self); diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index cc2110f5673a0..7ba183edc77f7 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1856,14 +1856,14 @@ fn test_repeat() { } mod pattern { - use std::str::pattern::SearchStep::{self, Done, Match, Reject}; - use std::str::pattern::{Pattern, ReverseSearcher, Searcher}; + use core::pattern::SearchStep::{self, Done, Match, Reject}; + use core::pattern::{Pattern, ReverseSearcher, Searcher}; macro_rules! make_test { ($name:ident, $p:expr, $h:expr, [$($e:expr,)*]) => { #[allow(unused_imports)] mod $name { - use std::str::pattern::SearchStep::{Match, Reject}; + use core::pattern::SearchStep::{Match, Reject}; use super::{cmp_search_to_vec}; #[test] fn fwd() { @@ -2139,7 +2139,7 @@ generate_iterator_test! { #[test] fn different_str_pattern_forwarding_lifetimes() { - use std::str::pattern::Pattern; + use core::pattern::Pattern; fn foo<'a, P>(p: P) where diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index dc0702c467a4e..14cfd8a119497 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -357,6 +357,7 @@ pub mod sync; pub mod fmt; pub mod hash; +pub mod pattern; pub mod slice; pub mod str; pub mod time; diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs new file mode 100644 index 0000000000000..225c6d2dba3ef --- /dev/null +++ b/library/core/src/pattern.rs @@ -0,0 +1,349 @@ +//! The Pattern API. +//! +//! The Pattern API provides a generic mechanism for using different pattern +//! types when searching through different objects. +//! +//! For more details, see the traits [`Pattern`], [`Haystack`], [`Searcher`], +//! [`ReverseSearcher`] and [`DoubleEndedSearcher`]. Although this API is +//! unstable, it is exposed via stable APIs on the [`str`] type. +//! +//! # Examples +//! +//! 
[`Pattern`] is [implemented][pattern-impls] in the stable API for +//! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures +//! implementing `FnMut(char) -> bool`. +//! +//! ``` +//! let s = "Can you find a needle in a haystack?"; +//! +//! // &str pattern +//! assert_eq!(s.find("you"), Some(4)); +//! // char pattern +//! assert_eq!(s.find('n'), Some(2)); +//! // array of chars pattern +//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u']), Some(1)); +//! // slice of chars pattern +//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1)); +//! // closure pattern +//! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35)); +//! ``` +//! +//! [pattern-impls]: Pattern#implementors + +#![unstable( + feature = "pattern", + reason = "API not fully fleshed out and ready to be stabilized", + issue = "27721" +)] + +/// A pattern which can be matched against a [`Haystack`]. +/// +/// A `Pattern` expresses that the implementing type can be used as a pattern +/// for searching in a `H`. +/// +/// For example, character `'a'` and string `"aa"` are patterns that would match +/// at index `1` in the string `"baaaab"`. +/// +/// The trait itself acts as a builder for an associated +/// [`Searcher`] type, which does the actual work of finding +/// occurrences of the pattern in a string. +/// +/// Depending on the type of the pattern, the behaviour of methods like +/// [`str::find`] and [`str::contains`] can change. The table below describes +/// some of those behaviours. 
+/// +/// | Pattern type | Match condition | +/// |--------------------------|-------------------------------------------| +/// | `&str` | is substring | +/// | `char` | is contained in string | +/// | `&[char]` | any char in slice is contained in string | +/// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | +/// | `&&str` | is substring | +/// | `&String` | is substring | +/// +/// # Examples +/// +/// ``` +/// // &str +/// assert_eq!("abaaa".find("ba"), Some(1)); +/// assert_eq!("abaaa".find("bac"), None); +/// +/// // char +/// assert_eq!("abaaa".find('a'), Some(0)); +/// assert_eq!("abaaa".find('b'), Some(1)); +/// assert_eq!("abaaa".find('c'), None); +/// +/// // &[char; N] +/// assert_eq!("ab".find(&['b', 'a']), Some(0)); +/// assert_eq!("abaaa".find(&['a', 'z']), Some(0)); +/// assert_eq!("abaaa".find(&['c', 'd']), None); +/// +/// // &[char] +/// assert_eq!("ab".find(&['b', 'a'][..]), Some(0)); +/// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0)); +/// assert_eq!("abaaa".find(&['c', 'd'][..]), None); +/// +/// // FnMut(char) -> bool +/// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); +/// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); +/// ``` +pub trait Pattern<H: Haystack>: Sized { + /// Associated searcher for this pattern + type Searcher: Searcher<H>; + + /// Constructs the associated searcher from + /// `self` and the `haystack` to search in.
+ fn into_searcher(self, haystack: H) -> Self::Searcher; + + /// Checks whether the pattern matches anywhere in the haystack + fn is_contained_in(self, haystack: H) -> bool { + self.into_searcher(haystack).next_match().is_some() + } + + /// Checks whether the pattern matches at the front of the haystack + fn is_prefix_of(self, haystack: H) -> bool { + matches!( + self.into_searcher(haystack).next(), + SearchStep::Match(start, _) if start == haystack.cursor_at_front() + ) + } + + /// Checks whether the pattern matches at the back of the haystack + fn is_suffix_of(self, haystack: H) -> bool + where Self::Searcher: ReverseSearcher<H> { + matches!( + self.into_searcher(haystack).next_back(), + SearchStep::Match(_, end) if end == haystack.cursor_at_back() + ) + } + + /// Removes the pattern from the front of haystack, if it matches. + fn strip_prefix_of(self, haystack: H) -> Option<H> { + if let SearchStep::Match(start, end) = self.into_searcher(haystack).next() { + // This cannot be debug_assert_eq because `H::Cursor` isn’t Debug. + debug_assert!(start == haystack.cursor_at_front(), + "The first search step from Searcher \ + must include the first character"); + // SAFETY: `Searcher` is known to return valid cursors. + Some(unsafe { haystack.split_at_cursor_unchecked(end) }.1) + } else { + None + } + } + + /// Removes the pattern from the back of haystack, if it matches. + fn strip_suffix_of(self, haystack: H) -> Option<H> + where Self::Searcher: ReverseSearcher<H> { + if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() { + // This cannot be debug_assert_eq because `H::Cursor` isn’t Debug. + debug_assert!(end == haystack.cursor_at_back(), + "The first search step from ReverseSearcher \ + must include the last character"); + // SAFETY: `Searcher` is known to return valid cursors. + Some(unsafe { haystack.split_at_cursor_unchecked(start) }.0) + } else { + None + } + } +} + + +/// A type which can be searched in using a [`Pattern`].
+/// +/// The trait is used in combination with the [`Pattern`] trait to express a pattern +/// that can be used to search for elements in a given haystack. +pub trait Haystack: Sized + Copy { + /// A cursor representing position in the haystack or its end. + type Cursor: Copy + PartialOrd; + + /// Returns cursor pointing at the beginning of the haystack. + fn cursor_at_front(&self) -> Self::Cursor; + + /// Returns cursor pointing at the end of the haystack. + fn cursor_at_back(&self) -> Self::Cursor; + + /// Splits haystack into two at given cursor position. + /// + /// Note that splitting a haystack isn’t guaranteed to preserve total + /// length. That is, the combined length of the two parts may be greater + /// than the length of the original haystack. Total length is preserved for + /// `&str` and `&[T]` haystacks but not for `&OsStr`. + unsafe fn split_at_cursor_unchecked(self, cursor: Self::Cursor) -> (Self, Self); +} + + +/// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum SearchStep<T = usize> { + /// Expresses that a match of the pattern has been found at + /// `haystack[a..b]`. + Match(T, T), + /// Expresses that `haystack[a..b]` has been rejected as a possible match + /// of the pattern. + /// + /// Note that there might be more than one `Reject` between two `Match`es; + /// there is no requirement for them to be combined into one. + Reject(T, T), + /// Expresses that every byte of the haystack has been visited, ending + /// the iteration. + Done, +} + +/// A searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping +/// matches of a pattern starting from the front (left) of a string. +/// +/// It will be implemented by associated `Searcher` +/// types of the [`Pattern`] trait. +/// +/// The trait is marked unsafe because the indices returned by the +/// [`next()`][Searcher::next] methods are required to lie on valid utf8 +/// boundaries in the haystack.
This enables consumers of this trait to +/// slice the haystack without additional runtime checks. +pub unsafe trait Searcher<H: Haystack> { + /// Getter for the underlying haystack to be searched in + /// + /// Will always return the same haystack. + fn haystack(&self) -> H; + + /// Performs the next search step starting from the front. + /// + /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches + /// the pattern. + /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can + /// not match the pattern, even partially. + /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack has + /// been visited. + /// + /// The stream of [`Match`][SearchStep::Match] and + /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] + /// will contain index ranges that are adjacent, non-overlapping, + /// covering the whole haystack, and lying on utf8 boundaries. + /// + /// A [`Match`][SearchStep::Match] result needs to contain the whole matched + /// pattern, however [`Reject`][SearchStep::Reject] results may be split up + /// into arbitrarily many adjacent fragments. Both ranges may have zero length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` + /// might produce the stream + /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` + fn next(&mut self) -> SearchStep<H::Cursor>; + + /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next]. + /// + /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges + /// of this and [`next_reject`][Searcher::next_reject] will overlap. This will return + /// `(start_match, end_match)`, where start_match is the index of where + /// the match begins, and end_match is the index after the end of the match.
+ fn next_match(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + + /// Finds the next [`Reject`][SearchStep::Reject] result. See [`next()`][Searcher::next] + /// and [`next_match()`][Searcher::next_match]. + /// + /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges + /// of this and [`next_match`][Searcher::next_match] will overlap. + fn next_reject(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } +} + +/// A reverse searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping +/// matches of a pattern starting from the back (right) of a string. +/// +/// It will be implemented by associated [`Searcher`] +/// types of the [`Pattern`] trait if the pattern supports searching +/// for it from the back. +/// +/// The index ranges returned by this trait are not required +/// to exactly match those of the forward search in reverse. +/// +/// For the reason why this trait is marked unsafe, see the +/// parent trait [`Searcher`]. +pub unsafe trait ReverseSearcher: Searcher { + /// Performs the next search step starting from the back. + /// + /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` + /// matches the pattern. + /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` + /// can not match the pattern, even partially. 
+ /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack + /// has been visited + /// + /// The stream of [`Match`][SearchStep::Match] and + /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] + /// will contain index ranges that are adjacent, non-overlapping, + /// covering the whole haystack, and laying on utf8 boundaries. + /// + /// A [`Match`][SearchStep::Match] result needs to contain the whole matched + /// pattern, however [`Reject`][SearchStep::Reject] results may be split up + /// into arbitrary many adjacent fragments. Both ranges may have zero length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` + /// might produce the stream + /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`. + fn next_back(&mut self) -> SearchStep; + + /// Finds the next [`Match`][SearchStep::Match] result. + /// See [`next_back()`][ReverseSearcher::next_back]. + fn next_match_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next_back() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + + /// Finds the next [`Reject`][SearchStep::Reject] result. + /// See [`next_back()`][ReverseSearcher::next_back]. + fn next_reject_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next_back() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } +} + +/// A marker trait to express that a [`ReverseSearcher`] +/// can be used for a [`DoubleEndedIterator`] implementation. +/// +/// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need +/// to follow these conditions: +/// +/// - All results of `next()` need to be identical +/// to the results of `next_back()` in reverse order. +/// - `next()` and `next_back()` need to behave as +/// the two ends of a range of values, that is they +/// can not "walk past each other". 
+/// +/// # Examples +/// +/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a +/// [`char`] only requires looking at one at a time, which behaves the same +/// from both ends. +/// +/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because +/// the pattern `"aa"` in the haystack `"aaa"` matches as either +/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. +pub trait DoubleEndedSearcher: ReverseSearcher {} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index bdb1f1d297af6..b323a0709530b 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -7,11 +7,10 @@ use crate::iter::{Copied, Filter, FusedIterator, Map, TrustedLen}; use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce}; use crate::ops::Try; use crate::option; +use crate::pattern::{DoubleEndedSearcher, ReverseSearcher, Pattern, Searcher}; use crate::slice::{self, Split as SliceSplit}; use super::from_utf8_unchecked; -use super::pattern::Pattern; -use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; use super::validations::{next_code_point, next_code_point_reverse}; use super::LinesAnyMap; use super::{BytesIsNotEmpty, UnsafeBytesToStr}; diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index d2ad655b42af8..02bb1de1c522d 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -13,11 +13,9 @@ mod iter; mod traits; mod validations; -use self::pattern::Pattern; -use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; - use crate::char::{self, EscapeDebugExtArgs}; use crate::mem; +use crate::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use crate::slice::{self, SliceIndex}; pub mod pattern; diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index b891fd9a4d337..f12f0c77f2207 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -1,36 +1,47 @@ -//! 
The string Pattern API. +//! [The Pattern API] implementation for searching in `&str`. //! -//! The Pattern API provides a generic mechanism for using different pattern -//! types when searching through a string. +//! The implementation provides generic mechanism for using different pattern +//! types when searching through a string. Although this API is unstable, it is +//! exposed via stable APIs on the [`str`] type. //! -//! For more details, see the traits [`Pattern`], [`Searcher`], -//! [`ReverseSearcher`], and [`DoubleEndedSearcher`]. +//! Depending on the type of the pattern, the behaviour of methods like +//! [`str::find`] and [`str::contains`] can change. The table below describes +//! some of those behaviours. //! -//! Although this API is unstable, it is exposed via stable APIs on the -//! [`str`] type. +//! | Pattern type | Match condition | +//! |--------------------------|-------------------------------------------| +//! | `&str` | is substring | +//! | `char` | is contained in string | +//! | `&[char]` | any char in slice is contained in string | +//! | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | +//! | `&&str` | is substring | +//! | `&String` | is substring | //! //! # Examples //! -//! [`Pattern`] is [implemented][pattern-impls] in the stable API for -//! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures -//! implementing `FnMut(char) -> bool`. -//! //! ``` //! let s = "Can you find a needle in a haystack?"; //! //! // &str pattern //! assert_eq!(s.find("you"), Some(4)); +//! assert_eq!(s.find("thou"), None); +//! //! // char pattern //! assert_eq!(s.find('n'), Some(2)); -//! // array of chars pattern +//! assert_eq!(s.find('N'), None); +//! +//! // Array of chars pattern and slices thereof //! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u']), Some(1)); -//! // slice of chars pattern //! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1)); -//! // closure pattern +//! 
assert_eq!(s.find(&['q', 'v', 'x']), None); +//! +//! // Predicate closure //! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35)); +//! assert_eq!(s.find(|c: char| c.is_lowercase()), Some(1)); +//! assert_eq!(s.find(|c: char| !c.is_ascii()), None); //! ``` //! -//! [pattern-impls]: Pattern#implementors +//! [The Pattern API]: crate::pattern #![unstable( feature = "pattern", @@ -41,333 +52,9 @@ use crate::cmp; use crate::cmp::Ordering; use crate::fmt; +use crate::pattern::{DoubleEndedSearcher, Haystack, Pattern, ReverseSearcher, Searcher, SearchStep}; use crate::slice::memchr; -// Pattern - -/// A pattern which can be matched against a [`Haystack`]. -/// -/// A `Pattern` expresses that the implementing type can be used as a pattern -/// for searching in a `H`. -/// -/// For example, character `'a'` and string `"aa"` are patterns that would match -/// at index `1` in the string `"baaaab"`. -/// -/// The trait itself acts as a builder for an associated -/// [`Searcher`] type, which does the actual work of finding -/// occurrences of the pattern in a string. -/// -/// Depending on the type of the pattern, the behaviour of methods like -/// [`str::find`] and [`str::contains`] can change. The table below describes -/// some of those behaviours. 
-/// -/// | Pattern type | Match condition | -/// |--------------------------|-------------------------------------------| -/// | `&str` | is substring | -/// | `char` | is contained in string | -/// | `&[char]` | any char in slice is contained in string | -/// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | -/// | `&&str` | is substring | -/// | `&String` | is substring | -/// -/// # Examples -/// -/// ``` -/// // &str -/// assert_eq!("abaaa".find("ba"), Some(1)); -/// assert_eq!("abaaa".find("bac"), None); -/// -/// // char -/// assert_eq!("abaaa".find('a'), Some(0)); -/// assert_eq!("abaaa".find('b'), Some(1)); -/// assert_eq!("abaaa".find('c'), None); -/// -/// // &[char; N] -/// assert_eq!("ab".find(&['b', 'a']), Some(0)); -/// assert_eq!("abaaa".find(&['a', 'z']), Some(0)); -/// assert_eq!("abaaa".find(&['c', 'd']), None); -/// -/// // &[char] -/// assert_eq!("ab".find(&['b', 'a'][..]), Some(0)); -/// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0)); -/// assert_eq!("abaaa".find(&['c', 'd'][..]), None); -/// -/// // FnMut(char) -> bool -/// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); -/// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); -/// ``` -pub trait Pattern: Sized { - /// Associated searcher for this pattern - type Searcher: Searcher; - - /// Constructs the associated searcher from - /// `self` and the `haystack` to search in. 
- fn into_searcher(self, haystack: H) -> Self::Searcher; - - /// Checks whether the pattern matches anywhere in the haystack - #[inline] - fn is_contained_in(self, haystack: H) -> bool { - self.into_searcher(haystack).next_match().is_some() - } - - /// Checks whether the pattern matches at the front of the haystack - #[inline] - fn is_prefix_of(self, haystack: H) -> bool { - matches!( - self.into_searcher(haystack).next(), - SearchStep::Match(start, _) if start == haystack.cursor_at_front() - ) - } - - /// Checks whether the pattern matches at the back of the haystack - #[inline] - fn is_suffix_of(self, haystack: H) -> bool - where Self::Searcher: ReverseSearcher { - matches!( - self.into_searcher(haystack).next_back(), - SearchStep::Match(_, end) if end == haystack.cursor_at_back() - ) - } - - /// Removes the pattern from the front of haystack, if it matches. - #[inline] - fn strip_prefix_of(self, haystack: H) -> Option { - if let SearchStep::Match(start, end) = self.into_searcher(haystack).next() { - // This cannot be debug_assert_eq because StartCursor isn’t Debug. - debug_assert!(start == haystack.cursor_at_front(), - "The first search step from Searcher \ - must include the first character"); - // SAFETY: `Searcher` is known to return valid indices. - Some(unsafe { haystack.split_at_cursor_unchecked(end) }.1) - } else { - None - } - } - - /// Removes the pattern from the back of haystack, if it matches. - #[inline] - fn strip_suffix_of(self, haystack: H) -> Option - where Self::Searcher: ReverseSearcher { - if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() { - // This cannot be debug_assert_eq because StartCursor isn’t Debug. - debug_assert!(end == haystack.cursor_at_back(), - "The first search step from ReverseSearcher \ - must include the last character"); - // SAFETY: `Searcher` is known to return valid indices. 
- Some(unsafe { haystack.split_at_cursor_unchecked(start) }.0) - } else { - None - } - } -} - -// Haystack - -/// A type which can be searched in using a [`Pattern`]. -/// -/// The trait is used in combination with [`Pattern`] trait to express a pattern -/// that can be used to search for elements in given haystack. -pub trait Haystack: Sized + Copy { - /// A cursor representing position in the haystack or its end. - type Cursor: Copy + PartialOrd; - - /// Returns cursor pointing at the beginning of the haystack. - fn cursor_at_front(&self) -> Self::Cursor; - - /// Returns cursor pointing at the end of the haystack. - fn cursor_at_back(&self) -> Self::Cursor; - - /// Splits haystack into two at given cursor position. - /// - /// Note that splitting a haystack isn’t guaranteed to preserve total - /// length. That is, each separate part’s length may be longer than length - /// of the original haystack. This property is preserved for `&str` and - /// `&[T]` haystacks but not for `&OsStr`. - unsafe fn split_at_cursor_unchecked(self, cursor: Self::Cursor) -> (Self, Self); -} - -// Searcher - -/// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. -#[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub enum SearchStep { - /// Expresses that a match of the pattern has been found at - /// `haystack[a..b]`. - Match(T, T), - /// Expresses that `haystack[a..b]` has been rejected as a possible match - /// of the pattern. - /// - /// Note that there might be more than one `Reject` between two `Match`es, - /// there is no requirement for them to be combined into one. - Reject(T, T), - /// Expresses that every byte of the haystack has been visited, ending - /// the iteration. - Done, -} - -/// A searcher for a string pattern. -/// -/// This trait provides methods for searching for non-overlapping -/// matches of a pattern starting from the front (left) of a string. -/// -/// It will be implemented by associated `Searcher` -/// types of the [`Pattern`] trait. 
-/// -/// The trait is marked unsafe because the indices returned by the -/// [`next()`][Searcher::next] methods are required to lie on valid utf8 -/// boundaries in the haystack. This enables consumers of this trait to -/// slice the haystack without additional runtime checks. -pub unsafe trait Searcher { - /// Getter for the underlying string to be searched in - /// - /// Will always return the same [`&str`][str]. - fn haystack(&self) -> H; - - /// Performs the next search step starting from the front. - /// - /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches - /// the pattern. - /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can - /// not match the pattern, even partially. - /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack has - /// been visited. - /// - /// The stream of [`Match`][SearchStep::Match] and - /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] - /// will contain index ranges that are adjacent, non-overlapping, - /// covering the whole haystack, and laying on utf8 boundaries. - /// - /// A [`Match`][SearchStep::Match] result needs to contain the whole matched - /// pattern, however [`Reject`][SearchStep::Reject] results may be split up - /// into arbitrary many adjacent fragments. Both ranges may have zero length. - /// - /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` - /// might produce the stream - /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` - fn next(&mut self) -> SearchStep; - - /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next]. - /// - /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges - /// of this and [`next_reject`][Searcher::next_reject] will overlap. This will return - /// `(start_match, end_match)`, where start_match is the index of where - /// the match begins, and end_match is the index after the end of the match. 
- #[inline] - fn next_match(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } - - /// Finds the next [`Reject`][SearchStep::Reject] result. See [`next()`][Searcher::next] - /// and [`next_match()`][Searcher::next_match]. - /// - /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges - /// of this and [`next_match`][Searcher::next_match] will overlap. - #[inline] - fn next_reject(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } -} - -/// A reverse searcher for a string pattern. -/// -/// This trait provides methods for searching for non-overlapping -/// matches of a pattern starting from the back (right) of a string. -/// -/// It will be implemented by associated [`Searcher`] -/// types of the [`Pattern`] trait if the pattern supports searching -/// for it from the back. -/// -/// The index ranges returned by this trait are not required -/// to exactly match those of the forward search in reverse. -/// -/// For the reason why this trait is marked unsafe, see the -/// parent trait [`Searcher`]. -pub unsafe trait ReverseSearcher: Searcher { - /// Performs the next search step starting from the back. - /// - /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` - /// matches the pattern. - /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` - /// can not match the pattern, even partially. 
- /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack - /// has been visited - /// - /// The stream of [`Match`][SearchStep::Match] and - /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] - /// will contain index ranges that are adjacent, non-overlapping, - /// covering the whole haystack, and laying on utf8 boundaries. - /// - /// A [`Match`][SearchStep::Match] result needs to contain the whole matched - /// pattern, however [`Reject`][SearchStep::Reject] results may be split up - /// into arbitrary many adjacent fragments. Both ranges may have zero length. - /// - /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` - /// might produce the stream - /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`. - fn next_back(&mut self) -> SearchStep; - - /// Finds the next [`Match`][SearchStep::Match] result. - /// See [`next_back()`][ReverseSearcher::next_back]. - #[inline] - fn next_match_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next_back() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } - - /// Finds the next [`Reject`][SearchStep::Reject] result. - /// See [`next_back()`][ReverseSearcher::next_back]. - #[inline] - fn next_reject_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next_back() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } -} - -/// A marker trait to express that a [`ReverseSearcher`] -/// can be used for a [`DoubleEndedIterator`] implementation. -/// -/// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need -/// to follow these conditions: -/// -/// - All results of `next()` need to be identical -/// to the results of `next_back()` in reverse order. 
-/// - `next()` and `next_back()` need to behave as -/// the two ends of a range of values, that is they -/// can not "walk past each other". -/// -/// # Examples -/// -/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a -/// [`char`] only requires looking at one at a time, which behaves the same -/// from both ends. -/// -/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because -/// the pattern `"aa"` in the haystack `"aaa"` matches as either -/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. -pub trait DoubleEndedSearcher: ReverseSearcher {} - ///////////////////////////////////////////////////////////////////////////// // Impl for Haystack ///////////////////////////////////////////////////////////////////////////// diff --git a/library/core/tests/pattern.rs b/library/core/tests/pattern.rs index d4bec996d89a1..0e943bd80ec7f 100644 --- a/library/core/tests/pattern.rs +++ b/library/core/tests/pattern.rs @@ -1,4 +1,4 @@ -use std::str::pattern::*; +use std::pattern::*; // This macro makes it easier to write // tests that do a series of iterations diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 363a266717467..e247185e1bf65 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -478,6 +478,8 @@ pub use core::mem; pub use core::ops; #[stable(feature = "rust1", since = "1.0.0")] pub use core::option; +#[unstable(feature = "pattern", issue = "27721")] +pub use core::pattern; #[stable(feature = "pin", since = "1.33.0")] pub use core::pin; #[stable(feature = "rust1", since = "1.0.0")] From 0ff2270bd81d54f6b4a90aed9d57db745e00cc8e Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Thu, 16 Feb 2023 01:15:44 +0100 Subject: [PATCH 3/6] core: add core::pattern::loop_next helper function Negative delta FTW. 
Issue: https://github.com/rust-lang/rust/issues/49802 --- library/core/src/pattern.rs | 48 ++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs index 225c6d2dba3ef..955ddef854dec 100644 --- a/library/core/src/pattern.rs +++ b/library/core/src/pattern.rs @@ -237,13 +237,7 @@ pub unsafe trait Searcher { /// `(start_match, end_match)`, where start_match is the index of where /// the match begins, and end_match is the index after the end of the match. fn next_match(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } + loop_next::(|| self.next()) } /// Finds the next [`Reject`][SearchStep::Reject] result. See [`next()`][Searcher::next] @@ -252,13 +246,7 @@ pub unsafe trait Searcher { /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges /// of this and [`next_match`][Searcher::next_match] will overlap. fn next_reject(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } + loop_next::(|| self.next()) } } @@ -303,25 +291,13 @@ pub unsafe trait ReverseSearcher: Searcher { /// Finds the next [`Match`][SearchStep::Match] result. /// See [`next_back()`][ReverseSearcher::next_back]. fn next_match_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next_back() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } + loop_next::(|| self.next_back()) } /// Finds the next [`Reject`][SearchStep::Reject] result. /// See [`next_back()`][ReverseSearcher::next_back].
fn next_reject_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { - loop { - match self.next_back() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } + loop_next::(|| self.next_back()) } } @@ -347,3 +323,19 @@ pub unsafe trait ReverseSearcher: Searcher { /// the pattern `"aa"` in the haystack `"aaa"` matches as either /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. pub trait DoubleEndedSearcher: ReverseSearcher {} + + +/// Calls callback until it returns `SearchStep::Done` or either `Match` or +/// `Reject` depending on the `MATCH` generic argument. +pub(super) fn loop_next( + mut next: impl FnMut() -> SearchStep, +) -> Option<(T, T)> { + loop { + match next() { + SearchStep::Done => break None, + SearchStep::Match(start, end) if MATCH => break Some((start, end)), + SearchStep::Reject(start, end) if !MATCH => break Some((start, end)), + _ => (), + } + } +} From 7af07335271bcc574c37b35ac3f862c67a59aab6 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 15 Feb 2023 03:19:43 +0100 Subject: [PATCH 4/6] core: implement Pattern<&[T]> for &[T]; get rid of SlicePattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement Haystack<&[T]> and corresponding Pattern<&[T]> for &[T]. That is, provide implementation for searching for subslices in slices. This replaces SlicePattern type. To make use of these new implementations, provide a few new methods on [T] type modelling them after methods on str. Specifically, introduce {starts,ends}_with_pattern, find, rfind, [T]::{split,rsplit}_once and trim{,_start,_end}_matches. Note that due to existing starts_with and ends_with methods, the _pattern suffix had to be used. This is unfortunate but the type of starts_with’s argument cannot be changed without affecting type inference and thus breaking the API.
This change doesn’t implement functions returning iterators such as split_pattern or matches which in str type are built on top of the Pattern API. Issue: https://github.com/rust-lang/rust/issues/49802 Issue: https://github.com/rust-lang/rust/issues/56345 --- library/core/src/slice/cmp.rs | 278 ++++++++++- library/core/src/slice/mod.rs | 331 +++++++++---- library/core/src/slice/pattern.rs | 777 ++++++++++++++++++++++++++++++ library/core/src/str/pattern.rs | 259 +--------- 4 files changed, 1294 insertions(+), 351 deletions(-) create mode 100644 library/core/src/slice/pattern.rs diff --git a/library/core/src/slice/cmp.rs b/library/core/src/slice/cmp.rs index 5e1b218e507bd..a9b0abae21fa5 100644 --- a/library/core/src/slice/cmp.rs +++ b/library/core/src/slice/cmp.rs @@ -227,34 +227,286 @@ impl_marker_for!(BytewiseEquality, u8 i8 u16 i16 u32 i32 u64 i64 u128 i128 usize isize char bool); pub(super) trait SliceContains: Sized { - fn slice_contains(&self, x: &[Self]) -> bool; + fn slice_contains_element(hs: &[Self], needle: &Self) -> bool; + fn slice_contains_slice(hs: &[Self], needle: &[Self]) -> bool; } impl SliceContains for T where T: PartialEq, { - default fn slice_contains(&self, x: &[Self]) -> bool { - x.iter().any(|y| *y == *self) + default fn slice_contains_element(hs: &[Self], needle: &Self) -> bool { + hs.iter().any(|element| *element == *needle) + } + + default fn slice_contains_slice(hs: &[Self], needle: &[Self]) -> bool { + default_slice_contains_slice(hs, needle) } } impl SliceContains for u8 { #[inline] - fn slice_contains(&self, x: &[Self]) -> bool { - memchr::memchr(*self, x).is_some() + fn slice_contains_element(hs: &[Self], needle: &Self) -> bool { + memchr::memchr(*needle, hs).is_some() + } + + #[inline] + fn slice_contains_slice(hs: &[Self], needle: &[Self]) -> bool { + if needle.len() <= 32 { + if let Some(result) = simd_contains(hs, needle) { + return result; + } + } + default_slice_contains_slice(hs, 
needle) } } +unsafe fn bytes_of(slice: &[T]) -> &[u8] { + // SAFETY: caller promises that `T` and `u8` have the same memory layout, + // thus casting `x.as_ptr()` as `*const u8` is safe. The `x.as_ptr()` comes + // from a reference and is thus guaranteed to be valid for reads for the + // length of the slice `x.len()`, which cannot be larger than + // `isize::MAX`. The returned slice is never mutated. + unsafe { from_raw_parts(slice.as_ptr() as *const u8, slice.len()) } +} + impl SliceContains for i8 { #[inline] - fn slice_contains(&self, x: &[Self]) -> bool { - let byte = *self as u8; - // SAFETY: `i8` and `u8` have the same memory layout, thus casting `x.as_ptr()` - // as `*const u8` is safe. The `x.as_ptr()` comes from a reference and is thus guaranteed - // to be valid for reads for the length of the slice `x.len()`, which cannot be larger - // than `isize::MAX`. The returned slice is never mutated. - let bytes: &[u8] = unsafe { from_raw_parts(x.as_ptr() as *const u8, x.len()) }; - memchr::memchr(byte, bytes).is_some() + fn slice_contains_element(hs: &[Self], needle: &Self) -> bool { + // SAFETY: i8 and u8 have the same memory layout + u8::slice_contains_element(unsafe { bytes_of(hs) }, &(*needle as u8)) + } + + #[inline] + fn slice_contains_slice(hs: &[Self], needle: &[Self]) -> bool { + // SAFETY: i8 and u8 have the same memory layout + unsafe { u8::slice_contains_slice(bytes_of(hs), bytes_of(needle)) } + } +} + +impl SliceContains for bool { + #[inline] + fn slice_contains_element(hs: &[Self], needle: &Self) -> bool { + // SAFETY: bool and u8 have the same memory layout and all valid bool + // bit patterns are valid u8 bit patterns. + u8::slice_contains_element(unsafe { bytes_of(hs) }, &(*needle as u8)) + } + + #[inline] + fn slice_contains_slice(hs: &[Self], needle: &[Self]) -> bool { + // SAFETY: bool and u8 have the same memory layout and all valid bool + // bit patterns are valid u8 bit patterns. 
+ unsafe { u8::slice_contains_slice(bytes_of(hs), bytes_of(needle)) } + } +} + +fn default_slice_contains_slice(hs: &[T], needle: &[T]) -> bool { + super::pattern::NaiveSearcherState::new(hs.len()) + .next_match(hs, needle) + .is_some() +} + + +/// SIMD search for short needles based on +/// Wojciech Muła's "SIMD-friendly algorithms for substring searching"[0] +/// +/// It skips ahead by the vector width on each iteration (rather than the needle length as two-way +/// does) by probing the first and last byte of the needle for the whole vector width +/// and only doing full needle comparisons when the vectorized probe indicated potential matches. +/// +/// Since the x86_64 baseline only offers SSE2 we only use u8x16 here. +/// If we ever ship std with for x86-64-v3 or adapt this for other platforms then wider vectors +/// should be evaluated. +/// +/// For haystacks smaller than vector-size + needle length it falls back to +/// a naive O(n*m) search so this implementation should not be called on larger needles. +/// +/// [0]: http://0x80.pl/articles/simd-strfind.html#sse-avx2 +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +#[inline] +fn simd_contains(haystack: &[u8], needle: &[u8]) -> Option { + debug_assert!(needle.len() > 1); + + use crate::ops::BitAnd; + use crate::simd::mask8x16 as Mask; + use crate::simd::u8x16 as Block; + use crate::simd::{SimdPartialEq, ToBitMask}; + + let first_probe = needle[0]; + let last_byte_offset = needle.len() - 1; + + // the offset used for the 2nd vector + let second_probe_offset = if needle.len() == 2 { + // never bail out on len=2 needles because the probes will fully cover them and have + // no degenerate cases. 
+ 1 + } else { + // try a few bytes in case first and last byte of the needle are the same + let Some(second_probe_offset) = (needle.len().saturating_sub(4)..needle.len()).rfind(|&idx| needle[idx] != first_probe) else { + // fall back to other search methods if we can't find any different bytes + // since we could otherwise hit some degenerate cases + return None; + }; + second_probe_offset + }; + + // do a naive search if the haystack is too small to fit + if haystack.len() < Block::LANES + last_byte_offset { + return Some(haystack.windows(needle.len()).any(|c| c == needle)); + } + + let first_probe: Block = Block::splat(first_probe); + let second_probe: Block = Block::splat(needle[second_probe_offset]); + // first byte are already checked by the outer loop. to verify a match only the + // remainder has to be compared. + let trimmed_needle = &needle[1..]; + + // this #[cold] is load-bearing, benchmark before removing it... + let check_mask = #[cold] + |idx, mask: u16, skip: bool| -> bool { + if skip { + return false; + } + + // and so is this. optimizations are weird. + let mut mask = mask; + + while mask != 0 { + let trailing = mask.trailing_zeros(); + let offset = idx + trailing as usize + 1; + // SAFETY: mask is between 0 and 15 trailing zeroes, we skip one additional byte that was already compared + // and then take trimmed_needle.len() bytes. 
This is within the bounds defined by the outer loop + unsafe { + let sub = haystack.get_unchecked(offset..).get_unchecked(..trimmed_needle.len()); + if small_slice_eq(sub, trimmed_needle) { + return true; + } + } + mask &= !(1 << trailing); + } + return false; + }; + + let test_chunk = |idx| -> u16 { + // SAFETY: this requires at least LANES bytes being readable at idx + // that is ensured by the loop ranges (see comments below) + let a: Block = unsafe { haystack.as_ptr().add(idx).cast::().read_unaligned() }; + // SAFETY: this requires LANES + block_offset bytes being readable at idx + let b: Block = unsafe { + haystack.as_ptr().add(idx).add(second_probe_offset).cast::().read_unaligned() + }; + let eq_first: Mask = a.simd_eq(first_probe); + let eq_last: Mask = b.simd_eq(second_probe); + let both = eq_first.bitand(eq_last); + let mask = both.to_bitmask(); + + return mask; + }; + + let mut i = 0; + let mut result = false; + // The loop condition must ensure that there's enough headroom to read LANE bytes, + // and not only at the current index but also at the index shifted by block_offset + const UNROLL: usize = 4; + while i + last_byte_offset + UNROLL * Block::LANES < haystack.len() && !result { + let mut masks = [0u16; UNROLL]; + for j in 0..UNROLL { + masks[j] = test_chunk(i + j * Block::LANES); + } + for j in 0..UNROLL { + let mask = masks[j]; + if mask != 0 { + result |= check_mask(i + j * Block::LANES, mask, result); + } + } + i += UNROLL * Block::LANES; + } + while i + last_byte_offset + Block::LANES < haystack.len() && !result { + let mask = test_chunk(i); + if mask != 0 { + result |= check_mask(i, mask, result); + } + i += Block::LANES; + } + + // Process the tail that didn't fit into LANES-sized steps. + // This simply repeats the same procedure but as right-aligned chunk instead + // of a left-aligned one. The last byte must be exactly flush with the string end so + // we don't miss a single byte or read out of bounds. 
+ let i = haystack.len() - last_byte_offset - Block::LANES; + let mask = test_chunk(i); + if mask != 0 { + result |= check_mask(i, mask, result); + } + + Some(result) +} + +/// Compares short slices for equality. +/// +/// It avoids a call to libc's memcmp which is faster on long slices +/// due to SIMD optimizations but it incurs a function call overhead. +/// +/// # Safety +/// +/// Both slices must have the same length. +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] // only called on x86 +#[inline] +unsafe fn small_slice_eq(x: &[u8], y: &[u8]) -> bool { + debug_assert_eq!(x.len(), y.len()); + // This function is adapted from + // https://github.com/BurntSushi/memchr/blob/8037d11b4357b0f07be2bb66dc2659d9cf28ad32/src/memmem/util.rs#L32 + + // If we don't have enough bytes to do 4-byte at a time loads, then + // fall back to the naive slow version. + // + // Potential alternative: We could do a copy_nonoverlapping combined with a mask instead + // of a loop. Benchmark it. + if x.len() < 4 { + for (&b1, &b2) in x.iter().zip(y) { + if b1 != b2 { + return false; + } + } + return true; + } + // When we have 4 or more bytes to compare, then proceed in chunks of 4 at + // a time using unaligned loads. + // + // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is + // that this particular version of memcmp is likely to be called with tiny + // needles. That means that if we do 8 byte loads, then a higher proportion + // of memcmp calls will use the slower variant above. With that said, this + // is a hypothesis and is only loosely supported by benchmarks. There's + // likely some improvement that could be made here. The main thing here + // though is to optimize for latency, not throughput. + + // SAFETY: Via the conditional above, we know that both `px` and `py` + // have the same length, so `px < pxend` implies that `py < pyend`. + // Thus, dereferencing both `px` and `py` in the loop below is safe. 
+ // + // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual + // end of `px` and `py`. Thus, the final dereference outside of the + // loop is guaranteed to be valid. (The final comparison will overlap with + // the last comparison done in the loop for lengths that aren't multiples + // of four.) + // + // Finally, we needn't worry about alignment here, since we do unaligned + // loads. + unsafe { + let (mut px, mut py) = (x.as_ptr(), y.as_ptr()); + let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4)); + while px < pxend { + let vx = (px as *const u32).read_unaligned(); + let vy = (py as *const u32).read_unaligned(); + if vx != vy { + return false; + } + px = px.add(4); + py = py.add(4); + } + let vx = (pxend as *const u32).read_unaligned(); + let vy = (pyend as *const u32).read_unaligned(); + vx == vy } } diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs index 6ea16bf643071..0ffd2eb285384 100644 --- a/library/core/src/slice/mod.rs +++ b/library/core/src/slice/mod.rs @@ -15,6 +15,7 @@ use crate::num::NonZeroUsize; use crate::ops::{Bound, FnMut, OneSidedRange, Range, RangeBounds}; use crate::option::Option; use crate::option::Option::{None, Some}; +use crate::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use crate::ptr; use crate::result::Result; use crate::result::Result::{Err, Ok}; @@ -40,6 +41,7 @@ mod ascii; mod cmp; mod index; mod iter; +mod pattern; mod raw; mod rotate; mod specialize; @@ -2213,11 +2215,14 @@ impl [T] { RSplitNMut::new(self.rsplit_mut(pred), n) } - /// Returns `true` if the slice contains an element with the given value. + /// Returns `true` if the slice contains given pattern; returns `false` + /// otherwise. /// - /// This operation is *O*(*n*). + /// This may be used to look for a single element (in which case the + /// operation is *O*(*n*)) or with more complex patterns. /// - /// Note that if you have a sorted slice, [`binary_search`] may be faster. 
+ /// Note that if you have a sorted slice and are looking for a single + /// element, [`binary_search`] may be faster. /// /// [`binary_search`]: slice::binary_search /// @@ -2227,11 +2232,15 @@ impl [T] { /// let v = [10, 40, 30]; /// assert!(v.contains(&30)); /// assert!(!v.contains(&50)); + /// + /// assert!(v.contains(&[])); + /// assert!(v.contains(&[40, 30])); + /// assert!(!v.contains(&[30, 40])); /// ``` /// - /// If you do not have a `&T`, but some other value that you can compare - /// with one (for example, `String` implements `PartialEq`), you can - /// use `iter().any`: + /// If you’re looking for a single element and don’t have a `&T`, but some + /// other value that you can compare with one (for example, `String` + /// implements `PartialEq`), you can use `iter().any`: /// /// ``` /// let v = [String::from("hello"), String::from("world")]; // slice of `String` @@ -2241,44 +2250,42 @@ impl [T] { #[stable(feature = "rust1", since = "1.0.0")] #[inline] #[must_use] - pub fn contains(&self, x: &T) -> bool - where - T: PartialEq, - { - cmp::SliceContains::slice_contains(x, self) + pub fn contains<'a, P: Pattern<&'a [T]>>(&'a self, pat: P) -> bool { + pat.is_contained_in(self) } - /// Returns `true` if `needle` is a prefix of the slice. + /// Returns `true` if `pattern` matches at the beginning of the slice. 
/// /// # Examples /// /// ``` /// let v = [10, 40, 30]; + /// + /// assert!(v.starts_with(&[])); /// assert!(v.starts_with(&[10])); /// assert!(v.starts_with(&[10, 40])); /// assert!(!v.starts_with(&[50])); /// assert!(!v.starts_with(&[10, 50])); + /// + /// assert!(v.starts_with(&10)); + /// assert!(!v.starts_with(&30)); /// ``` /// - /// Always returns `true` if `needle` is an empty slice: + /// Always returns `true` if `pattern` is an empty slice: /// /// ``` /// let v = &[10, 40, 30]; - /// assert!(v.starts_with(&[])); + /// assert!(v.starts_with(&[])); /// let v: &[u8] = &[]; - /// assert!(v.starts_with(&[])); + /// assert!(v.starts_with(&[])); /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[must_use] - pub fn starts_with(&self, needle: &[T]) -> bool - where - T: PartialEq, - { - let n = needle.len(); - self.len() >= n && needle == &self[..n] + pub fn starts_with<'a, P: Pattern<&'a [T]>>(&'a self, pattern: P) -> bool { + pattern.is_prefix_of(self) } - /// Returns `true` if `needle` is a suffix of the slice. + /// Returns `true` if `pattern` matches at the end of the slice. /// /// # Examples /// @@ -2288,9 +2295,12 @@ impl [T] { /// assert!(v.ends_with(&[40, 30])); /// assert!(!v.ends_with(&[50])); /// assert!(!v.ends_with(&[50, 30])); + /// + /// assert!(v.ends_with(&30)); + /// assert!(!v.ends_with(&10)); /// ``` /// - /// Always returns `true` if `needle` is an empty slice: + /// Always returns `true` if `pattern` is an empty slice: /// /// ``` /// let v = &[10, 40, 30]; @@ -2300,20 +2310,15 @@ impl [T] { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[must_use] - pub fn ends_with(&self, needle: &[T]) -> bool - where - T: PartialEq, - { - let (m, n) = (self.len(), needle.len()); - m >= n && needle == &self[m - n..] + pub fn ends_with<'a, P>(&'a self, pattern: P) -> bool + where P: Pattern<&'a [T], Searcher: ReverseSearcher<&'a [T]>> { + pattern.is_suffix_of(self) } /// Returns a subslice with the prefix removed. 
/// - /// If the slice starts with `prefix`, returns the subslice after the prefix, wrapped in `Some`. - /// If `prefix` is empty, simply returns the original slice. - /// - /// If the slice does not start with `prefix`, returns `None`. + /// If `prefix` matches at the beginning of the slice, returns the subslice + /// after the prefix, wrapped in `Some`. Otherwise returns `None`. /// /// # Examples /// @@ -2324,34 +2329,20 @@ impl [T] { /// assert_eq!(v.strip_prefix(&[50]), None); /// assert_eq!(v.strip_prefix(&[10, 50]), None); /// - /// let prefix : &str = "he"; - /// assert_eq!(b"hello".strip_prefix(prefix.as_bytes()), + /// let prefix: &[u8] = b"he"; + /// assert_eq!(b"hello".strip_prefix(prefix), /// Some(b"llo".as_ref())); /// ``` #[must_use = "returns the subslice without modifying the original"] #[stable(feature = "slice_strip", since = "1.51.0")] - pub fn strip_prefix + ?Sized>(&self, prefix: &P) -> Option<&[T]> - where - T: PartialEq, - { - // This function will need rewriting if and when SlicePattern becomes more sophisticated. - let prefix = prefix.as_slice(); - let n = prefix.len(); - if n <= self.len() { - let (head, tail) = self.split_at(n); - if head == prefix { - return Some(tail); - } - } - None + pub fn strip_prefix<'a, P: Pattern<&'a [T]>>(&'a self, prefix: P) -> Option<&'a [T]> { + prefix.strip_prefix_of(self) } /// Returns a subslice with the suffix removed. /// - /// If the slice ends with `suffix`, returns the subslice before the suffix, wrapped in `Some`. - /// If `suffix` is empty, simply returns the original slice. - /// - /// If the slice does not end with `suffix`, returns `None`. + /// If `suffix` matches at the end of the slice, returns the subslice before + /// the suffix, wrapped in `Some`. Otherwise returns `None`. 
/// /// # Examples /// @@ -2364,20 +2355,200 @@ impl [T] { /// ``` #[must_use = "returns the subslice without modifying the original"] #[stable(feature = "slice_strip", since = "1.51.0")] - pub fn strip_suffix + ?Sized>(&self, suffix: &P) -> Option<&[T]> + pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a [T]> where - T: PartialEq, + P: Pattern<&'a [T]>, +

>::Searcher: ReverseSearcher<&'a [T]>, { - // This function will need rewriting if and when SlicePattern becomes more sophisticated. - let suffix = suffix.as_slice(); - let (len, n) = (self.len(), suffix.len()); - if n <= len { - let (head, tail) = self.split_at(len - n); - if tail == suffix { - return Some(head); - } + suffix.strip_suffix_of(self) + } + + /// Returns index of the first occurrence of the specified `pattern` in the + /// slice. + /// + /// Returns [`None`] if the pattern doesn't match. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// + /// let nums = &[10, 40, 30, 40]; + /// assert_eq!(nums.find(&40), Some(1)); + /// assert_eq!(nums.find(&[40, 30]), Some(1)); + /// assert_eq!(nums.find(&42), None); + /// + /// let s = b"The swift brown fox"; + /// + /// assert_eq!(s.find(b"w"), Some(5)); + /// assert_eq!(s.find(&b'w'), Some(5)); + /// assert_eq!(s.find(b"swift"), Some(4)); + /// assert_eq!(s.find(b"slow"), None); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn find<'a, P: Pattern<&'a [T]>>(&'a self, pattern: P) -> Option { + pattern.into_searcher(self).next_match().map(|(i, _)| i) + } + + /// Returns index of the last occurrence of the specified `pattern` in the + /// slice. + /// + /// Returns [`None`] if the pattern doesn't match. 
+ /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// + /// let nums = &[10, 40, 30, 40]; + /// assert_eq!(nums.rfind(&40), Some(3)); + /// assert_eq!(nums.rfind(&[40, 30]), Some(1)); + /// assert_eq!(nums.rfind(&42), None); + /// + /// let s = b"The swift brown fox"; + /// + /// assert_eq!(s.rfind(b"w"), Some(13)); + /// assert_eq!(s.rfind(&b'w'), Some(13)); + /// assert_eq!(s.rfind(b"swift"), Some(4)); + /// assert_eq!(s.rfind(b"slow"), None); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn rfind<'a, P>(&'a self, pat: P) -> Option + where + P: Pattern<&'a [T], Searcher: ReverseSearcher<&'a [T]>>, + { + pat.into_searcher(self).next_match_back().map(|(i, _)| i) + } + + /// Splits the slice on the first occurrence of the specified `delimiter` + /// [pattern] and returns prefix before delimiter and suffix after delimiter. + /// + /// Returns [`None`] if the pattern doesn't match. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// + /// let s = b"Durarara"; + /// + /// assert_eq!(s.split_once(b"ra"), Some((&b"Du"[..], &b"rara"[..]))); + /// assert_eq!(s.split_once(b"!"), None); + /// ``` + /// + /// [pattern]: crate::slice::pattern + #[unstable(feature = "pattern", issue = "27721")] + pub fn split_once<'a, P: Pattern<&'a [T]>>(&'a self, delimiter: P) -> Option<(&'a [T], &'a [T])> { + let (start, end) = delimiter.into_searcher(self).next_match()?; + // SAFETY: `Searcher` is known to return valid indices. + unsafe { Some((self.get_unchecked(..start), self.get_unchecked(end..))) } + } + + /// Splits the slice on the last occurrence of the specified `delimiter` + /// [pattern] and returns prefix before delimiter and suffix after delimiter. + /// + /// Returns [`None`] if the pattern doesn't match. 
+ /// + /// # Examples + /// + /// Simple patterns: + /// + /// ``` + /// # #![feature(pattern)] + /// + /// let s = b"Durarara"; + /// + /// assert_eq!(s.rsplit_once(b"ra"), Some((&b"Durara"[..], &b""[..]))); + /// assert_eq!(s.rsplit_once(b"!"), None); + /// ``` + /// + /// [pattern]: crate::slice::pattern + #[unstable(feature = "pattern", issue = "27721")] + pub fn rsplit_once<'a, P>(&'a self, delimiter: P) -> Option<(&'a [T], &'a [T])> + where + P: Pattern<&'a [T], Searcher: ReverseSearcher<&'a [T]>>, + { + let (start, end) = delimiter.into_searcher(self).next_match_back()?; + // SAFETY: `Searcher` is known to return valid indices. + unsafe { Some((self.get_unchecked(..start), self.get_unchecked(end..))) } + } + + /// Returns a slice with all prefixes and suffixes that match the `pattern` + /// repeatedly removed. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// + /// let s = b"111foo1bar111".as_ref(); + /// assert_eq!(s.trim_matches(&b'1'), &b"foo1bar"[..]); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn trim_matches<'a, P>(&'a self, pat: P) -> &'a [T] + where + P: Pattern<&'a [T], Searcher: DoubleEndedSearcher<&'a [T]>>, + { + let mut i = 0; + let mut j = 0; + let mut matcher = pat.into_searcher(self); + if let Some((a, b)) = matcher.next_reject() { + i = a; + j = b; // Remember earliest known match, correct it below if + // last match is different + } + if let Some((_, b)) = matcher.next_reject_back() { + j = b; } - None + // SAFETY: `Searcher` is known to return valid indices. 
+ unsafe { self.get_unchecked(i..j) } + } + + /// Returns a slice with all prefixes that match the pattern repeatedly removed. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// + /// let s = b"111foo1bar111".as_ref(); + /// assert_eq!(s.trim_start_matches(&b'1'), &b"foo1bar111"[..]); + /// assert_eq!(s.trim_start_matches(b"11".as_ref()), &b"1foo1bar111"[..]); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn trim_start_matches<'a, P: Pattern<&'a [T]>>(&'a self, pat: P) -> &'a [T] { + let mut i = self.len(); + let mut matcher = pat.into_searcher(self); + if let Some((a, _)) = matcher.next_reject() { + i = a; + } + // SAFETY: `Searcher` is known to return valid indices. + unsafe { self.get_unchecked(i..self.len()) } + } + + /// Returns a slice with all suffixes that match the pattern repeatedly removed. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// + /// let s = b"111foo1bar111".as_ref(); + /// assert_eq!(s.trim_end_matches(&b'1'), &b"111foo1bar"[..]); + /// assert_eq!(s.trim_end_matches(b"11".as_ref()), &b"111foo1bar1"[..]); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn trim_end_matches<'a, P>(&'a self, pat: P) -> &'a [T] + where + P: Pattern<&'a [T], Searcher: ReverseSearcher<&'a [T]>>, + { + let mut j = 0; + let mut matcher = pat.into_searcher(self); + if let Some((_, b)) = matcher.next_reject_back() { + j = b; + } + // SAFETY: `Searcher` is known to return valid indices. + unsafe { self.get_unchecked(0..j) } } /// Binary searches this slice for a given element. @@ -4407,38 +4578,6 @@ impl const Default for &mut [T] { } } -#[unstable(feature = "slice_pattern", reason = "stopgap trait for slice patterns", issue = "56345")] -/// Patterns in slices - currently, only used by `strip_prefix` and `strip_suffix`. At a future -/// point, we hope to generalise `core::str::Pattern` (which at the time of writing is limited to -/// `str`) to slices, and then this trait will be replaced or abolished. -pub trait SlicePattern { - /// The element type of the slice being matched on. 
- type Item; - - /// Currently, the consumers of `SlicePattern` need a slice. - fn as_slice(&self) -> &[Self::Item]; -} - -#[stable(feature = "slice_strip", since = "1.51.0")] -impl SlicePattern for [T] { - type Item = T; - - #[inline] - fn as_slice(&self) -> &[Self::Item] { - self - } -} - -#[stable(feature = "slice_strip", since = "1.51.0")] -impl SlicePattern for [T; N] { - type Item = T; - - #[inline] - fn as_slice(&self) -> &[Self::Item] { - self - } -} - /// This checks every index against each other, and against `len`. /// /// This will do `binomial(N + 1, 2) = N * (N + 1) / 2 = 0, 1, 3, 6, 10, ..` diff --git a/library/core/src/slice/pattern.rs b/library/core/src/slice/pattern.rs new file mode 100644 index 0000000000000..b0c4ddd8bbbaa --- /dev/null +++ b/library/core/src/slice/pattern.rs @@ -0,0 +1,777 @@ +#![unstable( + feature = "pattern", + reason = "API not fully fleshed out and ready to be stabilized", + issue = "27721" +)] + +use crate::pattern::{Haystack, Pattern, SearchStep}; +use crate::pattern; + +use super::cmp::SliceContains; + +///////////////////////////////////////////////////////////////////////////// +// Impl for Haystack +///////////////////////////////////////////////////////////////////////////// + +impl<'a, T> Haystack for &'a [T] { + type Cursor = usize; + + fn cursor_at_front(&self) -> usize { 0 } + fn cursor_at_back(&self) -> usize { self.len() } + + unsafe fn split_at_cursor_unchecked(self, pos: usize) -> (Self, Self) { + // SAFETY: Caller promises cursor is valid. + unsafe { (self.get_unchecked(..pos), self.get_unchecked(pos..)) } + } +} + +///////////////////////////////////////////////////////////////////////////// +// Impl Pattern for &T +///////////////////////////////////////////////////////////////////////////// + +/// Pattern implementation for searching for an element in a slice. +/// +/// The pattern matches a single element in a slice. 
+/// +/// # Examples +/// +/// ``` +/// # #![feature(pattern)] +/// +/// let nums = &[10, 40, 30, 40]; +/// assert_eq!(nums.find(&40), Some(1)); +/// assert_eq!(nums.find(&42), None); +/// ``` +impl<'hs, 'p, T: PartialEq> Pattern<&'hs [T]> for &'p T { + type Searcher = ElementSearcher<'hs, 'p, T>; + + fn into_searcher(self, haystack: &'hs [T]) -> Self::Searcher { + // TODO: We probably should specialise this for u8 and i8 the same way + // we specialise SliceContains + Self::Searcher::new(haystack, self) + } + + fn is_contained_in(self, haystack: &'hs [T]) -> bool { + T::slice_contains_element(haystack, self) + } + + fn is_prefix_of(self, haystack: &'hs [T]) -> bool { + haystack.first() == Some(self) + } + + fn is_suffix_of(self, haystack: &'hs [T]) -> bool { + haystack.last() == Some(self) + } + + fn strip_prefix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + match haystack.split_first() { + Some((first, tail)) if first == self => Some(tail), + _ => None, + } + } + + fn strip_suffix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + match haystack.split_last() { + Some((last, head)) if last == self => Some(head), + _ => None, + } + } +} + +#[derive(Clone, Debug)] +pub struct ElementSearcher<'hs, 'p, T> { + /// Haystack we’re searching in. + haystack: &'hs [T], + /// Element we’re searching for. + needle: &'p T, + /// Internal state of the searcher. 
+ state: PredicateSearchState, +} + +impl<'hs, 'p, T> ElementSearcher<'hs, 'p, T> { + fn new(haystack: &'hs [T], needle: &'p T) -> Self { + Self { + haystack, + needle, + state: PredicateSearchState::new(haystack.len()) + } + } +} + +unsafe impl<'hs, 'p, T: PartialEq> pattern::Searcher<&'hs [T]> for ElementSearcher<'hs, 'p, T> { + fn haystack(&self) -> &'hs [T] { self.haystack } + + fn next(&mut self) -> SearchStep { + self.state.next(self.haystack, &mut |element| element == self.needle) + } + + fn next_match(&mut self) -> Option<(usize, usize)> { + self.state.next_match(self.haystack, &mut |element| element == self.needle) + } + + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.state.next_reject(self.haystack, &mut |element| element == self.needle) + } +} + +unsafe impl<'hs, 'p, T: PartialEq> pattern::ReverseSearcher<&'hs [T]> for ElementSearcher<'hs, 'p, T> { + fn next_back(&mut self) -> SearchStep { + self.state.next_back(self.haystack, &mut |element| element == self.needle) + } + + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.state.next_match_back(self.haystack, &mut |element| element == self.needle) + } + + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.state.next_reject_back(self.haystack, &mut |element| element == self.needle) + } +} + +impl<'hs, 'p, T: PartialEq> pattern::DoubleEndedSearcher<&'hs [T]> for ElementSearcher<'hs, 'p, T> {} + +///////////////////////////////////////////////////////////////////////////// +// Impl Pattern for &mut FnMut(&T) and &mut FnMut(T) +///////////////////////////////////////////////////////////////////////////// + +/* + + XXX TODO those don’t actually work because the implementations conflict with + implementation for &T. This is actually kind of a pain. It may mean that we + will need some kind of core::pattern::Pred wrapper. I think that would work + then. + + +/// Pattern implementation for searching for an element matching given +/// predicate. 
+/// +/// # Examples +/// +/// ``` +/// # #![feature(pattern)] +/// +/// let nums = &[10, 40, 30, 40]; +/// assert_eq!(nums.find(|n| n % 3 == 0), Some(2)); +/// assert_eq!(nums.find(|n| n % 2 == 1), None); +/// ``` +impl<'hs, 'p, T, F: FnMut(&T) -> bool> Pattern<&'hs [T]> for &'p F { + type Searcher = PredicateSearcher<'hs, 'p, T, F>; + + fn into_searcher(self, haystack: &'hs [T]) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_contained_in(self, haystack: &'hs [T]) -> bool { + haystack.iter().any(self) + } + + fn is_prefix_of(self, haystack: &'hs [T]) -> bool { + haystack.first().filter(self).is_some() + } + + fn is_suffix_of(self, haystack: &'hs [T]) -> bool { + haystack.last().filter(self).is_some() + } + + fn strip_prefix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + match haystack.split_first() { + Some((first, tail)) if self(first) => Some(tail), + _ => None, + } + } + + fn strip_suffix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + match haystack.split_last() { + Some((last, head)) if self(last) => Some(head), + _ => None, + } + } +} + +pub struct PredicateSearcher<'hs, 'p, T, F> { + /// Haystack we’re searching in. + haystack: &'hs [T], + /// Predicate used to match elements. + pred: &'p mut F, + /// Internal state of the searcher. 
+ state: PredicateSearchState, +} + +impl<'hs, 'p, T, F> PredicateSearcher<'hs, 'p, T, F> { + fn new(haystack: &'hs [T], pred: &mut F) -> Self { + let state = PredicateSearchState::new(haystack.len()); + Self { haystack, pred, state } + } +} + +unsafe impl<'hs, 'p, T, F: FnMut(&T) -> bool> pattern::Searcher<&'hs [T]> for PredicateSearcher<'hs, 'p, T, F> { + fn haystack(&self) -> &'hs [T] { self.haystack } + + fn next(&mut self) -> SearchStep { + self.state.next(|idx| self.pred(&self.haystack[idx])) + } + + fn next_match(&mut self) -> Option<(usize, usize)> { + self.state.next_match(|idx| self.pred(&self.haystack[idx])) + } + + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.state.next_reject(|idx| self.pred(&self.haystack[idx])) + } +} + +unsafe impl<'hs, 'p, T, F: FnMut(&T) -> bool> pattern::ReverseSearcher<&'hs [T]> for PredicateSearcher<'hs, 'p, T, F> { + fn next_back(&mut self) -> SearchStep { + self.state.next_back(|idx| self.pred(&self.haystack[idx])) + } + + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.state.next_match_back(|idx| self.pred(&self.haystack[idx])) + } + + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.state.next_reject_back(|idx| self.pred(&self.haystack[idx])) + } +} + +*/ + +///////////////////////////////////////////////////////////////////////////// +// Impl Pattern for &[T] and &[T; N] +///////////////////////////////////////////////////////////////////////////// + +/// Pattern implementation for searching a subslice in a slice. +/// +/// The pattern matches a subslice of a larger slice. An empty pattern matches +/// around every element in a slice. +/// +/// Note: Unlike slice patterns matching `str`, this pattern matches +/// a subslice rather than a single element of haystack being equal to element +/// of the pattern. 
+/// +/// # Examples +/// +/// ``` +/// # #![feature(pattern)] +/// use core::pattern::{Pattern, Searcher}; +/// +/// // Simple usage +/// let nums: &[i32] = &[10, 40, 30, 40]; +/// assert_eq!(nums.find(&[40]), Some(1)); +/// assert_eq!(nums.find(&[40, 30]), Some(1)); +/// assert_eq!(nums.find(&[42, 30]), None); +/// +/// // Empty pattern +/// let empty: &[i32] = &[]; +/// let mut s = empty.into_searcher(nums); +/// assert_eq!(s.next_match(), Some((0, 0))); +/// assert_eq!(s.next_match(), Some((1, 1))); +/// assert_eq!(s.next_match(), Some((2, 2))); +/// assert_eq!(s.next_match(), Some((3, 3))); +/// assert_eq!(s.next_match(), Some((4, 4))); +/// assert_eq!(s.next_match(), None); +/// +/// // Difference with str patterns. +/// assert_eq!("Foo".find(&['f', 'o']), Some(1)); +/// // -- "Foo" contains letter 'o' at index 1. +/// assert_eq!(b"Foo".find(&[b'f', b'o']), None); +/// // -- b"Foo" doesn’t contain subslice b"fo". +/// ``` +impl<'hs, 'p, T: PartialEq> Pattern<&'hs [T]> for &'p [T] { + type Searcher = Searcher<'hs, 'p, T>; + + fn into_searcher(self, haystack: &'hs [T]) -> Self::Searcher { + Searcher::new(haystack, self) + } + + fn is_contained_in(self, haystack: &'hs [T]) -> bool { + if self.len() == 0 { + true + } else if self.len() == 1 { + T::slice_contains_element(haystack, &self[0]) + } else if self.len() < haystack.len() { + T::slice_contains_slice(haystack, self) + } else if self.len() == haystack.len() { + self == haystack + } else { + false + } + } + #[inline] + fn is_prefix_of(self, haystack: &'hs [T]) -> bool { + haystack.get(..self.len()).map_or(false, |prefix| prefix == self) + } + + + #[inline] + fn is_suffix_of(self, haystack: &'hs [T]) -> bool { + haystack + .len() + .checked_sub(self.len()) + .map_or(false, |n| &haystack[n..] == self) + } + + #[inline] + fn strip_prefix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + self.is_prefix_of(haystack).then(|| { + // SAFETY: prefix was just verified to exist. 
+ unsafe { haystack.get_unchecked(self.len()..) } + }) + } + + #[inline] + fn strip_suffix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + self.is_suffix_of(haystack).then(|| { + let n = haystack.len() - self.len(); + // SAFETY: suffix was just verified to exist. + unsafe { haystack.get_unchecked(..n) } + }) + } +} + +/// Pattern implementation for searching a subslice in a slice. +/// +/// This is identical to a slice pattern: the pattern matches a subslice of +/// a larger slice. An empty array matches around every element in a slice. +/// +/// Note: Unlike slice patterns matching `str`, this pattern matches +/// a subslice rather than a single element of haystack being equal to element +/// of the pattern. +/// +/// # Examples +/// +/// ``` +/// # #![feature(pattern)] +/// +/// let slice: &[u8] = b"The quick brown fox"; +/// assert_eq!(slice.find(b"quick"), Some(4)); +/// assert_eq!(slice.find(b"slow"), None); +/// assert_eq!(slice.find(b""), Some(0)); +/// ``` +impl<'hs, 'p, T: PartialEq, const N: usize> Pattern<&'hs [T]> for &'p [T; N] { + type Searcher = Searcher<'hs, 'p, T>; + + fn into_searcher(self, haystack: &'hs [T]) -> Searcher<'hs, 'p, T> { + Searcher::new(haystack, &self[..]) + } + + #[inline(always)] + fn is_contained_in(self, haystack: &'hs [T]) -> bool { + (&self[..]).is_contained_in(haystack) + } + + #[inline(always)] + fn is_prefix_of(self, haystack: &'hs [T]) -> bool { + (&self[..]).is_prefix_of(haystack) + } + + #[inline(always)] + fn is_suffix_of(self, haystack: &'hs [T]) -> bool { + (&self[..]).is_suffix_of(haystack) + } + + #[inline(always)] + fn strip_prefix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + (&self[..]).strip_prefix_of(haystack) + } + + #[inline(always)] + fn strip_suffix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + (&self[..]).strip_suffix_of(haystack) + } +} + +#[derive(Clone, Debug)] +/// Associated type for `<&'p [T] as Pattern<&'hs [T]>>::Searcher`. 
+pub struct Searcher<'hs, 'p, T> { + /// Haystack we’re searching in. + haystack: &'hs [T], + /// Subslice we’re searching for. + needle: &'p [T], + /// Internal state of the searcher. + state: SearcherState, +} + +#[derive(Clone, Debug)] +enum SearcherState { + Empty(EmptySearcherState), + Element(PredicateSearchState), + Naive(NaiveSearcherState), +} + +impl<'hs, 'p, T: PartialEq> Searcher<'hs, 'p, T> { + fn new(haystack: &'hs [T], needle: &'p [T]) -> Searcher<'hs, 'p, T> { + let state = match needle.len() { + 0 => SearcherState::Empty(EmptySearcherState::new(haystack.len())), + 1 => SearcherState::Element(PredicateSearchState::new(haystack.len())), + _ => SearcherState::Naive(NaiveSearcherState::new(haystack.len())), + }; + Searcher { haystack, needle, state } + } +} + +macro_rules! delegate { + ($method:ident -> $ret:ty) => { + fn $method(&mut self) -> $ret { + match &mut self.state { + SearcherState::Empty(state) => state.$method(), + SearcherState::Element(state) => state.$method(self.haystack, &mut |element| { + // SAFETY: SearcherState::Element is created if and only if + // needle.len() == 1. 
+ element == unsafe { self.needle.get_unchecked(0) } + }), + SearcherState::Naive(state) => state.$method(self.haystack, self.needle), + } + } + } +} + +unsafe impl<'hs, 'p, T: PartialEq> pattern::Searcher<&'hs [T]> for Searcher<'hs, 'p, T> { + fn haystack(&self) -> &'hs [T] { + self.haystack + } + + delegate!(next -> SearchStep); + delegate!(next_match -> Option<(usize, usize)>); + delegate!(next_reject -> Option<(usize, usize)>); +} + +unsafe impl<'hs, 'p, T: PartialEq> pattern::ReverseSearcher<&'hs [T]> for Searcher<'hs, 'p, T> { + delegate!(next_back -> SearchStep); + delegate!(next_match_back -> Option<(usize, usize)>); + delegate!(next_reject_back -> Option<(usize, usize)>); +} + +///////////////////////////////////////////////////////////////////////////// +// Searching for an empty pattern +///////////////////////////////////////////////////////////////////////////// + +#[derive(Clone, Debug)] +struct EmptySearcherState { + start: usize, + end: usize, + is_match_fw: bool, + is_match_bw: bool, + // Needed in case of an empty haystack, see #85462 + is_finished: bool, +} + +impl EmptySearcherState { + fn new(haystack_length: usize) -> Self { + Self { + start: 0, + end: haystack_length, + is_match_fw: true, + is_match_bw: true, + is_finished: false, + } + } + + fn next(&mut self) -> SearchStep { + if self.is_finished { + return SearchStep::Done; + } + let is_match = self.is_match_fw; + self.is_match_fw = !self.is_match_fw; + let pos = self.start; + if is_match { + SearchStep::Match(pos, pos) + } else if self.start < self.end { + self.start += 1; + SearchStep::Reject(pos, pos + 1) + } else { + self.is_finished = true; + SearchStep::Done + } + } + + fn next_back(&mut self) -> SearchStep { + if self.is_finished { + return SearchStep::Done; + } + let is_match = self.is_match_bw; + self.is_match_bw = !self.is_match_bw; + let end = self.end; + if is_match { + SearchStep::Match(end, end) + } else if self.end <= self.start { + self.is_finished = true; + 
+ SearchStep::Done + } else { + self.end -= 1; + SearchStep::Reject(end - 1, end) + } + } + + fn next_match(&mut self) -> Option<(usize, usize)> { + pattern::loop_next::(|| self.next()) + } + + fn next_reject(&mut self) -> Option<(usize, usize)> { + pattern::loop_next::(|| self.next()) + } + + fn next_match_back(&mut self) -> Option<(usize, usize)> { + pattern::loop_next::(|| self.next_back()) + } + + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + pattern::loop_next::(|| self.next_back()) + } +} + +///////////////////////////////////////////////////////////////////////////// +// Searching for a single element +///////////////////////////////////////////////////////////////////////////// + +/// State of a searcher which tests one element at a time using a provided +/// predicate. +/// +/// Matches are always one-element long. Rejects can be arbitrarily long. +#[derive(Clone, Debug)] +struct PredicateSearchState { + /// Position to start searching from. Updated as we find new matches. + start: usize, + /// Position to end searching at. Updated as we find new matches. + end: usize, + /// If true, we’re finished searching or haystack[start] is a match. + is_match_fw: bool, + /// If true, we’re finished searching or haystack[end-1] is a match. 
+ is_match_bw: bool +} + +impl PredicateSearchState { + fn new(haystack_length: usize) -> Self { + Self { + start: 0, + end: haystack_length, + is_match_fw: false, + is_match_bw: false, + } + } + + fn next(&mut self, hs: &[T], pred: &mut F) -> SearchStep + where F: FnMut(&T) -> bool, + { + if self.start >= self.end { + return SearchStep::Done; + } + let count = if self.is_match_fw { + self.is_match_fw = false; + 0 + } else { + self.count(false, hs, pred) + }; + if count == 0 { + self.start += 1; + SearchStep::Match(self.start - 1, self.start) + } else { + self.is_match_fw = true; + let pos = self.start; + self.start += count; + SearchStep::Reject(pos, self.start) + } + } + + fn next_match(&mut self, hs: &[T], pred: &mut F) -> Option<(usize, usize)> + where F: FnMut(&T) -> bool, + { + pattern::loop_next::(|| self.next(hs, pred)) + } + + fn next_reject(&mut self, hs: &[T], pred: &mut F) -> Option<(usize, usize)> + where F: FnMut(&T) -> bool, + { + if self.start >= self.end { + return None; + } + + if self.is_match_fw { + self.start += 1; + } + self.start += self.count(true, hs, pred); + + let count = self.count(false, hs, pred); + if count == 0 { + None + } else { + self.is_match_fw = true; + let pos = self.start; + self.start += count; + Some((pos, self.start)) + } + } + + fn next_back(&mut self, hs: &[T], pred: &mut F) -> SearchStep + where F: FnMut(&T) -> bool + Copy, + { + if self.start >= self.end { + return SearchStep::Done + } + let count = if self.is_match_bw { + self.is_match_bw = false; + 0 + } else { + self.count_back(false, hs, pred) + }; + let pos = self.end; + if count == 0 { + self.end -= 1; + SearchStep::Match(self.end, pos) + } else { + self.is_match_bw = true; + self.end -= count; + SearchStep::Reject(self.end, pos) + } + } + + fn next_match_back(&mut self, hs: &[T], pred: &mut F) -> Option<(usize, usize)> + where F: FnMut(&T) -> bool + Copy, + { + pattern::loop_next::(|| self.next_back(hs, pred)) + } + + fn next_reject_back(&mut self, hs: &[T], 
pred: &mut F) -> Option<(usize, usize)> + where F: FnMut(&T) -> bool + Copy, + { + if self.start >= self.end { + return None; + } + + if self.is_match_bw { + self.end -= 1; + } + self.end -= self.count_back(true, hs, pred); + + let count = self.count_back(false, hs, pred); + if count == 0 { + None + } else { + self.is_match_bw = true; + let pos = self.end; + self.end -= count; + Some((self.end, pos)) + } + } + + fn count(&self, want: bool, hs: &[T], pred: &mut F) -> usize + where F: FnMut(&T) -> bool, + { + hs[self.start..self.end] + .iter() + .map(pred) + .take_while(|&matches| matches == want) + .count() + } + + fn count_back(&self, want: bool, hs: &[T], pred: &mut F) -> usize + where F: FnMut(&T) -> bool, + { + hs[self.start..self.end] + .iter() + .rev() + .map(pred) + .take_while(|&matches| matches == want) + .count() + } +} + +///////////////////////////////////////////////////////////////////////////// +// Searching for a subslice element +///////////////////////////////////////////////////////////////////////////// + +// TODO: Implement something smarter perhaps? Or have specialisation for +// different T? We’re not using core::str::pattern::TwoWaySearcher because it +// requires PartialOrd elements. Specifically, TwoWaySearcher::maximal_suffix +// and TwoWaySearcher::reverse_maximal_suffix methods compare elements. For the +// time being, use naive O(nk) search. 
+#[derive(Clone, Debug)] +pub(super) struct NaiveSearcherState { + start: usize, + end: usize, + is_match_fw: bool, + is_match_bw: bool, +} + +impl NaiveSearcherState { + pub(super) fn new(haystack_length: usize) -> Self { + Self { + start: 0, + end: haystack_length, + is_match_fw: false, + is_match_bw: false, + } + } + + pub(super) fn next(&mut self, haystack: &[T], needle: &[T]) -> SearchStep { + if self.end - self.start < needle.len() { + SearchStep::Done + } else if self.is_match_fw { + let pos = self.start; + self.start += needle.len(); + self.is_match_fw = false; + SearchStep::Match(pos, self.start) + } else { + let count = haystack[self.start..self.end] + .windows(needle.len()) + .take_while(|window| *window != needle) + .count(); + let pos = self.start; + if count == 0 { + self.start += needle.len(); + SearchStep::Match(pos, self.start) + } else { + let pos = self.start; + self.start += count; + // We’ve either reached the end of the haystack or start + // where it matches so mark is_match_fw. + self.is_match_fw = true; + SearchStep::Reject(pos, self.start) + } + } + } + + pub(super) fn next_back(&mut self, haystack: &[T], needle: &[T]) -> SearchStep { + if self.end - self.start < needle.len() { + SearchStep::Done + } else if self.is_match_bw { + let pos = self.end; + self.end -= needle.len(); + self.is_match_bw = false; + SearchStep::Match(self.end, pos) + } else { + let count = haystack[self.start..self.end] + .windows(needle.len()) + .rev() + .take_while(|window| *window != needle) + .count(); + let pos = self.end; + if count == 0 { + self.end -= needle.len(); + SearchStep::Match(self.end, pos) + } else { + self.end -= count; + // We’ve either reached the end of the haystack or start + // where it matches so mark is_match_bw. 
+ self.is_match_bw = true; + SearchStep::Reject(self.end, pos) + } + } + } + + pub(super) fn next_match(&mut self, haystack: &[T], needle: &[T]) -> Option<(usize, usize)> { + pattern::loop_next::(|| self.next(haystack, needle)) + } + + pub(super) fn next_reject(&mut self, haystack: &[T], needle: &[T]) -> Option<(usize, usize)> { + pattern::loop_next::(|| self.next(haystack, needle)) + } + + pub(super) fn next_match_back(&mut self, haystack: &[T], needle: &[T]) -> Option<(usize, usize)> { + pattern::loop_next::(|| self.next_back(haystack, needle)) + } + + pub(super) fn next_reject_back(&mut self, haystack: &[T], needle: &[T]) -> Option<(usize, usize)> { + pattern::loop_next::(|| self.next_back(haystack, needle)) + } +} diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index f12f0c77f2207..d98780f960da1 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -50,7 +50,6 @@ )] use crate::cmp; -use crate::cmp::Ordering; use crate::fmt; use crate::pattern::{DoubleEndedSearcher, Haystack, Pattern, ReverseSearcher, Searcher, SearchStep}; use crate::slice::memchr; @@ -289,34 +288,37 @@ impl<'a> Pattern<&'a str> for char { haystack.as_bytes().contains(&(self as u8)) } else { let mut buffer = [0u8; 4]; - self.encode_utf8(&mut buffer).is_contained_in(haystack) + let chr: &str = self.encode_utf8(&mut buffer); + chr.is_contained_in(haystack) } } #[inline] fn is_prefix_of(self, haystack: &'a str) -> bool { - self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack) + let mut buffer = [0u8; 4]; + let chr: &str = self.encode_utf8(&mut buffer); + chr.is_prefix_of(haystack) } #[inline] fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { - self.encode_utf8(&mut [0u8; 4]).strip_prefix_of(haystack) + let mut buffer = [0u8; 4]; + let chr: &str = self.encode_utf8(&mut buffer); + chr.strip_prefix_of(haystack) } #[inline] fn is_suffix_of(self, haystack: &'a str) -> bool - where - Self::Searcher: ReverseSearcher<&'a 
str>, - { - self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack) + fn is_suffix_of(self, haystack: &'a str) -> bool { + let mut buffer = [0u8; 4]; + let chr: &str = self.encode_utf8(&mut buffer); + chr.is_suffix_of(haystack) } #[inline] - fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> - where - Self::Searcher: ReverseSearcher<&'a str>, - { - self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack) + fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> { + let mut buffer = [0u8; 4]; + let chr: &str = self.encode_utf8(&mut buffer); + chr.strip_suffix_of(haystack) } } @@ -680,27 +682,7 @@ impl<'a, 'b> Pattern<&'a str> for &'b str { /// Checks whether the pattern matches anywhere in the haystack #[inline] fn is_contained_in(self, haystack: &'a str) -> bool { - if self.len() == 0 { - return true; - } - - match self.len().cmp(&haystack.len()) { - Ordering::Less => { - if self.len() == 1 { - return haystack.as_bytes().contains(&self.as_bytes()[0]); - } - - #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] - if self.len() <= 32 { - if let Some(result) = simd_contains(self, haystack) { - return result; - } - } - - self.into_searcher(haystack).next_match().is_some() - } - _ => self == haystack, - } + self.as_bytes().is_contained_in(haystack.as_bytes()) } /// Removes the pattern from the front of haystack, if it matches. @@ -1441,210 +1423,3 @@ impl TwoWayStrategy for RejectAndMatch { SearchStep::Match(a, b) } } - -/// SIMD search for short needles based on -/// Wojciech Muła's "SIMD-friendly algorithms for substring searching"[0] -/// -/// It skips ahead by the vector width on each iteration (rather than the needle length as two-way -/// does) by probing the first and last byte of the needle for the whole vector width -/// and only doing full needle comparisons when the vectorized probe indicated potential matches. -/// -/// Since the x86_64 baseline only offers SSE2 we only use u8x16 here. 
-/// If we ever ship std with for x86-64-v3 or adapt this for other platforms then wider vectors -/// should be evaluated. -/// -/// For haystacks smaller than vector-size + needle length it falls back to -/// a naive O(n*m) search so this implementation should not be called on larger needles. -/// -/// [0]: http://0x80.pl/articles/simd-strfind.html#sse-avx2 -#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] -#[inline] -fn simd_contains(needle: &str, haystack: &str) -> Option { - let needle = needle.as_bytes(); - let haystack = haystack.as_bytes(); - - debug_assert!(needle.len() > 1); - - use crate::ops::BitAnd; - use crate::simd::mask8x16 as Mask; - use crate::simd::u8x16 as Block; - use crate::simd::{SimdPartialEq, ToBitMask}; - - let first_probe = needle[0]; - let last_byte_offset = needle.len() - 1; - - // the offset used for the 2nd vector - let second_probe_offset = if needle.len() == 2 { - // never bail out on len=2 needles because the probes will fully cover them and have - // no degenerate cases. - 1 - } else { - // try a few bytes in case first and last byte of the needle are the same - let Some(second_probe_offset) = (needle.len().saturating_sub(4)..needle.len()).rfind(|&idx| needle[idx] != first_probe) else { - // fall back to other search methods if we can't find any different bytes - // since we could otherwise hit some degenerate cases - return None; - }; - second_probe_offset - }; - - // do a naive search if the haystack is too small to fit - if haystack.len() < Block::LANES + last_byte_offset { - return Some(haystack.windows(needle.len()).any(|c| c == needle)); - } - - let first_probe: Block = Block::splat(first_probe); - let second_probe: Block = Block::splat(needle[second_probe_offset]); - // first byte are already checked by the outer loop. to verify a match only the - // remainder has to be compared. - let trimmed_needle = &needle[1..]; - - // this #[cold] is load-bearing, benchmark before removing it... 
- let check_mask = #[cold] - |idx, mask: u16, skip: bool| -> bool { - if skip { - return false; - } - - // and so is this. optimizations are weird. - let mut mask = mask; - - while mask != 0 { - let trailing = mask.trailing_zeros(); - let offset = idx + trailing as usize + 1; - // SAFETY: mask is between 0 and 15 trailing zeroes, we skip one additional byte that was already compared - // and then take trimmed_needle.len() bytes. This is within the bounds defined by the outer loop - unsafe { - let sub = haystack.get_unchecked(offset..).get_unchecked(..trimmed_needle.len()); - if small_slice_eq(sub, trimmed_needle) { - return true; - } - } - mask &= !(1 << trailing); - } - return false; - }; - - let test_chunk = |idx| -> u16 { - // SAFETY: this requires at least LANES bytes being readable at idx - // that is ensured by the loop ranges (see comments below) - let a: Block = unsafe { haystack.as_ptr().add(idx).cast::().read_unaligned() }; - // SAFETY: this requires LANES + block_offset bytes being readable at idx - let b: Block = unsafe { - haystack.as_ptr().add(idx).add(second_probe_offset).cast::().read_unaligned() - }; - let eq_first: Mask = a.simd_eq(first_probe); - let eq_last: Mask = b.simd_eq(second_probe); - let both = eq_first.bitand(eq_last); - let mask = both.to_bitmask(); - - return mask; - }; - - let mut i = 0; - let mut result = false; - // The loop condition must ensure that there's enough headroom to read LANE bytes, - // and not only at the current index but also at the index shifted by block_offset - const UNROLL: usize = 4; - while i + last_byte_offset + UNROLL * Block::LANES < haystack.len() && !result { - let mut masks = [0u16; UNROLL]; - for j in 0..UNROLL { - masks[j] = test_chunk(i + j * Block::LANES); - } - for j in 0..UNROLL { - let mask = masks[j]; - if mask != 0 { - result |= check_mask(i + j * Block::LANES, mask, result); - } - } - i += UNROLL * Block::LANES; - } - while i + last_byte_offset + Block::LANES < haystack.len() && !result { - let 
mask = test_chunk(i); - if mask != 0 { - result |= check_mask(i, mask, result); - } - i += Block::LANES; - } - - // Process the tail that didn't fit into LANES-sized steps. - // This simply repeats the same procedure but as right-aligned chunk instead - // of a left-aligned one. The last byte must be exactly flush with the string end so - // we don't miss a single byte or read out of bounds. - let i = haystack.len() - last_byte_offset - Block::LANES; - let mask = test_chunk(i); - if mask != 0 { - result |= check_mask(i, mask, result); - } - - Some(result) -} - -/// Compares short slices for equality. -/// -/// It avoids a call to libc's memcmp which is faster on long slices -/// due to SIMD optimizations but it incurs a function call overhead. -/// -/// # Safety -/// -/// Both slices must have the same length. -#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] // only called on x86 -#[inline] -unsafe fn small_slice_eq(x: &[u8], y: &[u8]) -> bool { - debug_assert_eq!(x.len(), y.len()); - // This function is adapted from - // https://github.com/BurntSushi/memchr/blob/8037d11b4357b0f07be2bb66dc2659d9cf28ad32/src/memmem/util.rs#L32 - - // If we don't have enough bytes to do 4-byte at a time loads, then - // fall back to the naive slow version. - // - // Potential alternative: We could do a copy_nonoverlapping combined with a mask instead - // of a loop. Benchmark it. - if x.len() < 4 { - for (&b1, &b2) in x.iter().zip(y) { - if b1 != b2 { - return false; - } - } - return true; - } - // When we have 4 or more bytes to compare, then proceed in chunks of 4 at - // a time using unaligned loads. - // - // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is - // that this particular version of memcmp is likely to be called with tiny - // needles. That means that if we do 8 byte loads, then a higher proportion - // of memcmp calls will use the slower variant above. 
With that said, this - // is a hypothesis and is only loosely supported by benchmarks. There's - // likely some improvement that could be made here. The main thing here - // though is to optimize for latency, not throughput. - - // SAFETY: Via the conditional above, we know that both `px` and `py` - // have the same length, so `px < pxend` implies that `py < pyend`. - // Thus, derefencing both `px` and `py` in the loop below is safe. - // - // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual - // end of `px` and `py`. Thus, the final dereference outside of the - // loop is guaranteed to be valid. (The final comparison will overlap with - // the last comparison done in the loop for lengths that aren't multiples - // of four.) - // - // Finally, we needn't worry about alignment here, since we do unaligned - // loads. - unsafe { - let (mut px, mut py) = (x.as_ptr(), y.as_ptr()); - let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4)); - while px < pxend { - let vx = (px as *const u32).read_unaligned(); - let vy = (py as *const u32).read_unaligned(); - if vx != vy { - return false; - } - px = px.add(4); - py = py.add(4); - } - let vx = (pxend as *const u32).read_unaligned(); - let vy = (pyend as *const u32).read_unaligned(); - vx == vy - } -} From 587e3355d2754d7d105290c5dbb6014750e561b3 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Thu, 16 Feb 2023 05:11:26 +0100 Subject: [PATCH 5/6] Predicate --- library/core/src/pattern.rs | 20 ++++++ library/core/src/slice/pattern.rs | 100 ++++++++++++++---------------- 2 files changed, 66 insertions(+), 54 deletions(-) diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs index 955ddef854dec..b403dfd530469 100644 --- a/library/core/src/pattern.rs +++ b/library/core/src/pattern.rs @@ -36,6 +36,8 @@ issue = "27721" )] +use crate::marker::PhantomData; + /// A pattern which can be matched against a [`Haystack`]. 
/// /// A `Pattern` expresses that the implementing type can be used as a pattern @@ -325,6 +327,24 @@ pub unsafe trait ReverseSearcher: Searcher { pub trait DoubleEndedSearcher: ReverseSearcher {} +/// Wraps a predicate `F` over elements of type `T` so it can be used as a pattern. +#[derive(Clone, Debug)] +pub struct Predicate(F, PhantomData<*const T>); + +/// Wraps `pred` in a [`Predicate`]. +pub fn predicate bool>(pred: F) -> Predicate { + Predicate(pred, PhantomData) +} + +impl bool> Predicate { + /// Calls the wrapped predicate with `element` and returns its result. + pub fn test(&mut self, element: T) -> bool { self.0(element) } + + /// Returns a mutable reference to the wrapped predicate. + pub fn as_fn(&mut self) -> &mut F { &mut self.0 } +} + + /// Calls callback until it returns `SearchStep::Done` or either `Match` or /// `Reject` depending no `MATCH` generic argument. pub(super) fn loop_next( diff --git a/library/core/src/slice/pattern.rs b/library/core/src/slice/pattern.rs index b0c4ddd8bbbaa..baa81ad007001 100644 --- a/library/core/src/slice/pattern.rs +++ b/library/core/src/slice/pattern.rs @@ -4,7 +4,7 @@ issue = "27721" )] -use crate::pattern::{Haystack, Pattern, SearchStep}; +use crate::pattern::{Haystack, Pattern, Predicate, SearchStep}; use crate::pattern; use super::cmp::SliceContains; @@ -131,17 +131,9 @@ unsafe impl<'hs, 'p, T: PartialEq> pattern::ReverseSearcher<&'hs [T]> for Elemen impl<'hs, 'p, T: PartialEq> pattern::DoubleEndedSearcher<&'hs [T]> for ElementSearcher<'hs, 'p, T> {} ///////////////////////////////////////////////////////////////////////////// -// Impl Pattern for &mut FnMut(&T) and &mut FnMut(T) +// Impl Pattern for Predicate ///////////////////////////////////////////////////////////////////////////// -/* - - XXX TODO those don’t actually work because the implementations conflict with - implementation for &T. This is actually kind of a pain. It may mean that we - will need some kind of core::pattern::Pred wrapper. I think that would work - then. - - /// Pattern implementation for searching for an element matching given /// predicate. 
/// @@ -149,93 +141,93 @@ impl<'hs, 'p, T: PartialEq> pattern::DoubleEndedSearcher<&'hs [T]> for ElementSe /// /// ``` /// # #![feature(pattern)] +/// use core::pattern::predicate; /// /// let nums = &[10, 40, 30, 40]; -/// assert_eq!(nums.find(|n| n % 3 == 0), Some(2)); -/// assert_eq!(nums.find(|n| n % 2 == 1), None); +/// assert_eq!(nums.find(predicate(|n| n % 3 == 0)), Some(2)); +/// assert_eq!(nums.find(predicate(|n| n % 2 == 1)), None); /// ``` -impl<'hs, 'p, T, F: FnMut(&T) -> bool> Pattern<&'hs [T]> for &'p F { - type Searcher = PredicateSearcher<'hs, 'p, T, F>; +impl<'hs, T, F: FnMut(&'hs T) -> bool> Pattern<&'hs [T]> for Predicate<&'hs T, F> { + type Searcher = PredicateSearcher<'hs, T, F>; fn into_searcher(self, haystack: &'hs [T]) -> Self::Searcher { Self::Searcher::new(haystack, self) } - fn is_contained_in(self, haystack: &'hs [T]) -> bool { - haystack.iter().any(self) + fn is_contained_in(mut self, haystack: &'hs [T]) -> bool { + haystack.iter().any(|element| self.test(element)) } - fn is_prefix_of(self, haystack: &'hs [T]) -> bool { - haystack.first().filter(self).is_some() + fn is_prefix_of(mut self, haystack: &'hs [T]) -> bool { + haystack.first().filter(|element| self.test(element)).is_some() } - fn is_suffix_of(self, haystack: &'hs [T]) -> bool { - haystack.last().filter(self).is_some() + fn is_suffix_of(mut self, haystack: &'hs [T]) -> bool { + haystack.last().filter(|element| self.test(element)).is_some() } - fn strip_prefix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + fn strip_prefix_of(mut self, haystack: &'hs [T]) -> Option<&'hs [T]> { match haystack.split_first() { - Some((first, tail)) if self(first) => Some(tail), + Some((first, tail)) if self.test(first) => Some(tail), _ => None, } } - fn strip_suffix_of(self, haystack: &'hs [T]) -> Option<&'hs [T]> { + fn strip_suffix_of(mut self, haystack: &'hs [T]) -> Option<&'hs [T]> { match haystack.split_last() { - Some((last, head)) if self(last) => Some(head), + Some((last, head)) if 
self.test(last) => Some(head), _ => None, } } } -pub struct PredicateSearcher<'hs, 'p, T, F> { +#[derive(Clone, Debug)] +pub struct PredicateSearcher<'hs, T, F> { /// Haystack we’re searching in. haystack: &'hs [T], /// Predicate used to match elements. - pred: &'p mut F, + pred: Predicate<&'hs T, F>, /// Internal state of the searcher. state: PredicateSearchState, } -impl<'hs, 'p, T, F> PredicateSearcher<'hs, 'p, T, F> { - fn new(haystack: &'hs [T], pred: &mut F) -> Self { +impl<'hs, T, F> PredicateSearcher<'hs, T, F> { + fn new(haystack: &'hs [T], pred: Predicate<&'hs T, F>) -> Self { let state = PredicateSearchState::new(haystack.len()); Self { haystack, pred, state } } } -unsafe impl<'hs, 'p, T, F: FnMut(&T) -> bool> pattern::Searcher<&'hs [T]> for PredicateSearcher<'hs, 'p, T, F> { +unsafe impl<'hs, T, F: FnMut(&'hs T) -> bool> pattern::Searcher<&'hs [T]> for PredicateSearcher<'hs, T, F> { fn haystack(&self) -> &'hs [T] { self.haystack } fn next(&mut self) -> SearchStep { - self.state.next(|idx| self.pred(&self.haystack[idx])) + self.state.next(self.haystack, self.pred.as_fn()) } fn next_match(&mut self) -> Option<(usize, usize)> { - self.state.next_match(|idx| self.pred(&self.haystack[idx])) + self.state.next_match(self.haystack, self.pred.as_fn()) } fn next_reject(&mut self) -> Option<(usize, usize)> { - self.state.next_reject(|idx| self.pred(&self.haystack[idx])) + self.state.next_reject(self.haystack, self.pred.as_fn()) } } -unsafe impl<'hs, 'p, T, F: FnMut(&T) -> bool> pattern::ReverseSearcher<&'hs [T]> for PredicateSearcher<'hs, 'p, T, F> { +unsafe impl<'hs, T, F: FnMut(&'hs T) -> bool> pattern::ReverseSearcher<&'hs [T]> for PredicateSearcher<'hs, T, F> { fn next_back(&mut self) -> SearchStep { - self.state.next_back(|idx| self.pred(&self.haystack[idx])) + self.state.next_back(self.haystack, self.pred.as_fn()) } fn next_match_back(&mut self) -> Option<(usize, usize)> { - self.state.next_match_back(|idx| self.pred(&self.haystack[idx])) + 
self.state.next_match_back(self.haystack, self.pred.as_fn()) } fn next_reject_back(&mut self) -> Option<(usize, usize)> { - self.state.next_reject_back(|idx| self.pred(&self.haystack[idx])) + self.state.next_reject_back(self.haystack, self.pred.as_fn()) } } -*/ - ///////////////////////////////////////////////////////////////////////////// // Impl Pattern for &[T] and &[T; N] ///////////////////////////////////////////////////////////////////////////// @@ -550,8 +542,8 @@ impl PredicateSearchState { } } - fn next(&mut self, hs: &[T], pred: &mut F) -> SearchStep - where F: FnMut(&T) -> bool, + fn next<'hs, T, F>(&mut self, hs: &'hs [T], pred: &mut F) -> SearchStep + where F: FnMut(&'hs T) -> bool, { if self.start >= self.end { return SearchStep::Done; @@ -573,14 +565,14 @@ impl PredicateSearchState { } } - fn next_match(&mut self, hs: &[T], pred: &mut F) -> Option<(usize, usize)> - where F: FnMut(&T) -> bool, + fn next_match<'hs, T, F>(&mut self, hs: &'hs [T], pred: &mut F) -> Option<(usize, usize)> + where F: FnMut(&'hs T) -> bool, { pattern::loop_next::(|| self.next(hs, pred)) } - fn next_reject(&mut self, hs: &[T], pred: &mut F) -> Option<(usize, usize)> - where F: FnMut(&T) -> bool, + fn next_reject<'hs, T, F>(&mut self, hs: &'hs [T], pred: &mut F) -> Option<(usize, usize)> + where F: FnMut(&'hs T) -> bool, { if self.start >= self.end { return None; @@ -602,8 +594,8 @@ impl PredicateSearchState { } } - fn next_back(&mut self, hs: &[T], pred: &mut F) -> SearchStep - where F: FnMut(&T) -> bool + Copy, + fn next_back<'hs, T, F>(&mut self, hs: &'hs [T], pred: &mut F) -> SearchStep + where F: FnMut(&'hs T) -> bool, { if self.start >= self.end { return SearchStep::Done @@ -625,14 +617,14 @@ impl PredicateSearchState { } } - fn next_match_back(&mut self, hs: &[T], pred: &mut F) -> Option<(usize, usize)> - where F: FnMut(&T) -> bool + Copy, + fn next_match_back<'hs, T, F>(&mut self, hs: &'hs [T], pred: &mut F) -> Option<(usize, usize)> + where F: FnMut(&'hs T) -> bool, 
{ pattern::loop_next::(|| self.next_back(hs, pred)) } - fn next_reject_back(&mut self, hs: &[T], pred: &mut F) -> Option<(usize, usize)> - where F: FnMut(&T) -> bool + Copy, + fn next_reject_back<'hs, T, F>(&mut self, hs: &'hs [T], pred: &mut F) -> Option<(usize, usize)> + where F: FnMut(&'hs T) -> bool, { if self.start >= self.end { return None; @@ -654,8 +646,8 @@ impl PredicateSearchState { } } - fn count(&self, want: bool, hs: &[T], pred: &mut F) -> usize - where F: FnMut(&T) -> bool, + fn count<'hs, T, F>(&self, want: bool, hs: &'hs [T], pred: &mut F) -> usize + where F: FnMut(&'hs T) -> bool, { hs[self.start..self.end] .iter() @@ -664,8 +656,8 @@ impl PredicateSearchState { .count() } - fn count_back(&self, want: bool, hs: &[T], pred: &mut F) -> usize - where F: FnMut(&T) -> bool, + fn count_back<'hs, T, F>(&self, want: bool, hs: &'hs [T], pred: &mut F) -> usize + where F: FnMut(&'hs T) -> bool, { hs[self.start..self.end] .iter() From 3c887088fd8e21ffd65df28ead47d1e02cee04a7 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Fri, 17 Feb 2023 14:28:11 +0100 Subject: [PATCH 6/6] unix pattern --- library/core/src/pattern.rs | 1 + library/core/src/str/mod.rs | 2 +- library/core/src/str/validations.rs | 194 ++++++---- library/std/src/ffi/os_str.rs | 169 +++++++++ library/std/src/lib.rs | 2 + library/std/src/sys/unix/mod.rs | 1 + library/std/src/sys/unix/os_str.rs | 1 - library/std/src/sys/unix/os_str_pattern.rs | 419 +++++++++++++++++++++ 8 files changed, 718 insertions(+), 71 deletions(-) create mode 100644 library/std/src/sys/unix/os_str_pattern.rs diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs index b403dfd530469..dfc633e774a3c 100644 --- a/library/core/src/pattern.rs +++ b/library/core/src/pattern.rs @@ -89,6 +89,7 @@ use crate::marker::PhantomData; /// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); /// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); /// ``` +#[rustc_has_incoherent_inherent_impls] pub 
trait Pattern: Sized { /// Associated searcher for this pattern type Searcher: Searcher; diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 02bb1de1c522d..778a0e23c0407 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -68,7 +68,7 @@ pub use iter::SplitAsciiWhitespace; pub use iter::SplitInclusive; #[unstable(feature = "str_internals", issue = "none")] -pub use validations::{next_code_point, utf8_char_width}; +pub use validations::{next_code_point, try_first_code_point, utf8_char_width}; use iter::MatchIndicesInternal; use iter::SplitInternal; diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 2acef432f2063..0d40b536863c0 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -120,6 +120,38 @@ const fn contains_nonascii(x: usize) -> bool { (x & NONASCII_MASK) != 0 } +/// Reads the first code point out of a byte slice validating whether it’s +/// valid. +/// +/// This is different than [`next_code_point`] in that it doesn’t assume +/// argument is well-formed UTF-8-like string. Together with the character its +/// encoded length is returned. 
+/// +/// ``` +/// #![feature(str_internals)] +/// use core::str::try_first_code_point; +/// +/// assert_eq!(Some(('f', 1)), try_first_code_point(b"foo".as_ref())); +/// assert_eq!(Some(('Ż', 2)), try_first_code_point("Żółw".as_bytes())); +/// assert_eq!(None, try_first_code_point(b"\xffoo".as_ref())); +/// ``` +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub const fn try_first_code_point(bytes: &[u8]) -> Option<(char, usize)> { + let first = match bytes.first() { + Some(&byte) => byte, + None => return None, + }; + let (value, length) = if first < 0x80 { + (first as u32, 1) + } else if let Ok((cp, len)) = try_finish_byte_sequence(first, bytes, 0) { + (cp, len) + } else { + return None + }; + Some((unsafe { char::from_u32_unchecked(value) }, length)) +} + /// Walks through `v` checking that it's a valid UTF-8 sequence, /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. #[inline(always)] @@ -134,78 +166,13 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { let align = v.as_ptr().align_offset(usize_bytes); while index < len { - let old_offset = index; - macro_rules! err { - ($error_len: expr) => { - return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }) - }; - } - - macro_rules! next { - () => {{ - index += 1; - // we needed data, but there was none: error! 
- if index >= len { - err!(None) - } - v[index] - }}; - } - + let valid_up_to = index; let first = v[index]; if first >= 128 { - let w = utf8_char_width(first); - // 2-byte encoding is for codepoints \u{0080} to \u{07ff} - // first C2 80 last DF BF - // 3-byte encoding is for codepoints \u{0800} to \u{ffff} - // first E0 A0 80 last EF BF BF - // excluding surrogates codepoints \u{d800} to \u{dfff} - // ED A0 80 to ED BF BF - // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff - // first F0 90 80 80 last F4 8F BF BF - // - // Use the UTF-8 syntax from the RFC - // - // https://tools.ietf.org/html/rfc3629 - // UTF8-1 = %x00-7F - // UTF8-2 = %xC2-DF UTF8-tail - // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - // %xF4 %x80-8F 2( UTF8-tail ) - match w { - 2 => { - if next!() as i8 >= -64 { - err!(Some(1)) - } - } - 3 => { - match (first, next!()) { - (0xE0, 0xA0..=0xBF) - | (0xE1..=0xEC, 0x80..=0xBF) - | (0xED, 0x80..=0x9F) - | (0xEE..=0xEF, 0x80..=0xBF) => {} - _ => err!(Some(1)), - } - if next!() as i8 >= -64 { - err!(Some(2)) - } - } - 4 => { - match (first, next!()) { - (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} - _ => err!(Some(1)), - } - if next!() as i8 >= -64 { - err!(Some(2)) - } - if next!() as i8 >= -64 { - err!(Some(3)) - } - } - _ => err!(Some(1)), + match try_finish_byte_sequence(first, v, index) { + Ok((_value, length)) => index += length, + Err(error_len) => return Err(Utf8Error { valid_up_to, error_len }), } - index += 1; } else { // Ascii case, try to skip forward quickly. // When the pointer is aligned, read 2 words of data per iteration @@ -241,6 +208,95 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { Ok(()) } +/// Try to finish an UTF-8 byte sequence. 
+/// +/// Assumes that `bytes[index] == first` and that `first >= 128`, i.e. that +/// `index` points at the beginning of a non-ASCII UTF-8 sequence in `bytes`. +/// +/// If the byte sequence at the index is correct, returns decoded code point and +/// length of the sequence. If it was invalid returns number of invalid bytes +/// or None if read was cut short. +#[inline(always)] +#[rustc_const_unstable(feature = "str_internals", issue = "none")] +const fn try_finish_byte_sequence( + first: u8, + bytes: &[u8], + index: usize, +) -> Result<(u32, usize), Option<u8>> { + macro_rules! get { + (raw $offset:expr) => { + match bytes.get(index + $offset) { + Some(byte) => *byte, + None => return Err(None), + } + }; + (cont $offset:expr) => {{ + let byte = get!(raw $offset); + if !utf8_is_cont_byte(byte) { + return Err(Some($offset as u8)) + } + byte + }} + } + + // 2-byte encoding is for codepoints \u{0080} to \u{07ff} + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u{0800} to \u{ffff} + // first E0 A0 80 last EF BF BF + // excluding surrogates codepoints \u{d800} to \u{dfff} + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match utf8_char_width(first) { + 2 => { + let second = get!(cont 1); + // A 2-byte lead byte is 110xxxxx: keep all five payload bits (width 2). + let value = utf8_first_byte(first, 2); + let value = utf8_acc_cont_byte(value, second); + Ok((value, 2)) + } + 3 => { + let second = get!(raw 1); + match (first, second) { + (0xE0 , 0xA0..=0xBF) | + (0xE1..=0xEC, 0x80..=0xBF) | + (0xED , 0x80..=0x9F) | + (0xEE..=0xEF, 0x80..=0xBF) => {} + _ => return
Err(Some(1)), + } + let value = utf8_first_byte(first, 3); + let value = utf8_acc_cont_byte(value, second); + let value = utf8_acc_cont_byte(value, get!(cont 2)); + Ok((value, 3)) + } + 4 => { + let second = get!(raw 1); + match (first, second) { + (0xF0 , 0x90..=0xBF) | + (0xF1..=0xF3, 0x80..=0xBF) | + (0xF4 , 0x80..=0x8F) => {} + _ => return Err(Some(1)), + } + let value = utf8_first_byte(first, 4); + let value = utf8_acc_cont_byte(value, second); + let value = utf8_acc_cont_byte(value, get!(cont 2)); + let value = utf8_acc_cont_byte(value, get!(cont 3)); + Ok((value, 4)) + } + _ => Err(Some(1)), + } +} + + // https://tools.ietf.org/html/rfc3629 const UTF8_CHAR_WIDTH: &[u8; 256] = &[ // 1 2 3 4 5 6 7 8 9 A B C D E F diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index 80ed34157e6dc..4c3a72b662bfe 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -8,11 +8,13 @@ use crate::fmt; use crate::hash::{Hash, Hasher}; use crate::iter::Extend; use crate::ops; +use crate::pattern::{DoubleEndedSearcher, Pattern, Searcher, SearchStep, ReverseSearcher}; use crate::rc::Rc; use crate::str::FromStr; use crate::sync::Arc; use crate::sys::os_str::{Buf, Slice}; +use crate::sys::os_str_pattern::Slice as Slice2; use crate::sys_common::{AsInner, FromInner, IntoInner}; /// A type that can represent owned, mutable platform-native strings, but is @@ -979,6 +981,82 @@ impl OsStr { pub fn eq_ignore_ascii_case>(&self, other: S) -> bool { self.inner.eq_ignore_ascii_case(&other.as_ref().inner) } + + + /// XXX placeholder + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert!(OsStr::new("foo").starts_with('f')); + /// assert!(!OsStr::new("foo").starts_with('F')); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn starts_with<'a, P: Pattern<&'a OsStr>>(&'a self, pat: P) -> bool { + pat.is_prefix_of(self) + } + + /// XXX placeholder + /// 
+ /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert!(OsStr::new("foo").ends_with('o')); + /// assert!(!OsStr::new("foo").ends_with('O')); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn ends_with<'a, P>(&'a self, pat: P) -> bool + where P: Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>, + { + pat.is_suffix_of(self) + } + + /// Returns the rest of this `OsStr` with the prefix matched by `prefix` removed. + /// + /// Returns `None` if the pattern does not match at the start of the string. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert_eq!(Some(OsStr::new("oo")), OsStr::new("foo").strip_prefix('f')); + /// assert_eq!(None, OsStr::new("foo").strip_prefix('F')); + /// ``` + #[must_use = "this returns the remaining substring as a new slice, \ + without modifying the original"] + #[unstable(feature = "pattern", issue = "27721")] + pub fn strip_prefix<'a, P>(&'a self, prefix: P) -> Option<&'a OsStr> + where P: Pattern<&'a OsStr> + { + prefix.strip_prefix_of(self) + } + + /// Returns the rest of this `OsStr` with the suffix matched by `suffix` removed. + /// + /// Returns `None` if the pattern does not match at the end of the string. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert_eq!(Some(OsStr::new("fo")), OsStr::new("foo").strip_suffix('o')); + /// assert_eq!(None, OsStr::new("foo").strip_suffix('O')); + /// ``` + #[must_use = "this returns the remaining substring as a new slice, \ + without modifying the original"] + #[unstable(feature = "pattern", issue = "27721")] + pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a OsStr> + where P: Pattern<&'a OsStr>, + <P as Pattern<&'a OsStr>
>::Searcher: ReverseSearcher<&'a OsStr>, + { + suffix.strip_suffix_of(self) + } } #[stable(feature = "box_from_os_str", since = "1.17.0")] @@ -1446,3 +1524,94 @@ impl<'a> FromIterator> for OsString { } } } + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs> crate::pattern::Haystack for &'hs OsStr { + type Cursor = usize; + + fn cursor_at_front(&self) -> usize { 0 } + fn cursor_at_back(&self) -> usize { self.inner.inner.len() } + + unsafe fn split_at_cursor_unchecked(self, cursor: usize) -> (Self, Self) { + let bytes = &self.inner.inner; + unsafe { + let head = bytes.get_unchecked(..cursor); + // XXX + let head = core::mem::transmute(head); + + let tail = bytes.get_unchecked(cursor..); + // XXX + let tail = core::mem::transmute(tail); + + (head, tail) + } + } +} + +macro_rules! define_pattern { + ($pattern:ty) => { + #[unstable(feature = "pattern", issue = "27721")] + impl<'hs> Pattern<&'hs OsStr> for $pattern { + type Searcher = SearcherImpl<<$pattern as Pattern<&'hs Slice2>>::Searcher>; + + fn into_searcher(self, haystack: &'hs OsStr) -> Self::Searcher { + let haystack: &'hs Slice2 = unsafe { core::mem::transmute(haystack) }; + Self::Searcher::new(self.into_searcher(haystack)) + } + + fn is_contained_in(self, haystack: &'hs OsStr) -> bool { + let haystack: &'hs Slice2 = unsafe { core::mem::transmute(haystack) }; + self.is_contained_in(haystack) + } + + fn is_prefix_of(self, haystack: &'hs OsStr) -> bool { + let haystack: &'hs Slice2 = unsafe { core::mem::transmute(haystack) }; + self.is_prefix_of(haystack) + } + + fn is_suffix_of(self, haystack: &'hs OsStr) -> bool { + let haystack: &'hs Slice2 = unsafe { core::mem::transmute(haystack) }; + self.is_suffix_of(haystack) + } + + fn strip_prefix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> { + let haystack: &'hs Slice2 = unsafe { core::mem::transmute(haystack) }; + self.strip_prefix_of(haystack).map(core::mem::transmute) + } + + fn strip_suffix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> { 
+ let haystack: &'hs Slice2 = unsafe { core::mem::transmute(haystack) }; + self.strip_suffix_of(haystack).map(core::mem::transmute) + } + } + + #[unstable(feature = "pattern", issue = "27721")] + pub struct SearcherImpl(S); + + #[unstable(feature = "pattern", issue = "27721")] + unsafe impl<'hs, S> Searcher<&'hs OsStr> for SearcherImpl + where S: Searcher<&'hs Slice2> + { + fn haystack(&self) -> &'hs OsStr { core::mem::transmute(self.0.haystack()) } + + fn next(&mut self) -> SearchStep { self.0.next() } + fn next_match(&mut self) -> Option<(usize, usize)> { self.0.next_match() } + fn next_reject(&mut self) -> Option<(usize, usize)> { self.0.next_reject() } + } + + #[unstable(feature = "pattern", issue = "27721")] + unsafe impl<'hs, S> ReverseSearcher<&'hs OsStr> for SearcherImpl + where S: ReverseSearcher<&'hs Slice2> + { + fn next_back(&mut self) -> SearchStep { self.0.next_back() } + fn next_match_back(&mut self) -> Option<(usize, usize)> { self.0.next_match_back() } + fn next_reject_back(&mut self) -> Option<(usize, usize)> { self.0.next_reject_back() } + } + + #[unstable(feature = "pattern", issue = "27721")] + impl<'hs, S> DoubleEndedSearcher<&'hs OsStr> for SearcherImpl + where S: DoubleEndedSearcher<&'hs Slice2> {} + } +} + +define_pattern!(char); diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index e247185e1bf65..836e35f427b14 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -238,6 +238,7 @@ #![feature(allocator_internals)] #![feature(allow_internal_unsafe)] #![feature(allow_internal_unstable)] +#![feature(associated_type_bounds)] #![feature(c_unwind)] #![feature(cfg_target_thread_local)] #![feature(concat_idents)] @@ -294,6 +295,7 @@ #![feature(panic_can_unwind)] #![feature(panic_info_message)] #![feature(panic_internals)] +#![feature(pattern)] #![feature(pointer_byte_offsets)] #![feature(pointer_is_aligned)] #![feature(portable_simd)] diff --git a/library/std/src/sys/unix/mod.rs b/library/std/src/sys/unix/mod.rs index 
30a96be14300a..3a6c4ca3b1211 100644 --- a/library/std/src/sys/unix/mod.rs +++ b/library/std/src/sys/unix/mod.rs @@ -31,6 +31,7 @@ pub mod net; pub use self::l4re::net; pub mod os; pub mod os_str; +pub mod os_str_pattern; pub mod path; pub mod pipe; pub mod process; diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs index 017e2af29d4f4..5c2480f6b94aa 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -14,7 +14,6 @@ use crate::sys_common::{AsInner, IntoInner}; use core::str::Utf8Chunks; #[cfg(test)] -#[path = "../unix/os_str/tests.rs"] mod tests; #[derive(Hash)] diff --git a/library/std/src/sys/unix/os_str_pattern.rs b/library/std/src/sys/unix/os_str_pattern.rs new file mode 100644 index 0000000000000..22fbb655b2061 --- /dev/null +++ b/library/std/src/sys/unix/os_str_pattern.rs @@ -0,0 +1,419 @@ +#![unstable( + feature = "pattern", + reason = "API not fully fleshed out and ready to be stabilized", + issue = "27721" +)] + +use core::pattern::{Haystack, Pattern, SearchStep}; +use core::pattern; +use core::str::try_first_code_point; + +#[derive(Debug)] +pub struct Slice { + pub inner: [u8], +} + +impl Slice { + #[inline] + fn from_u8_slice(s: &[u8]) -> &Slice { + unsafe { core::mem::transmute(s) } + } +} + +///////////////////////////////////////////////////////////////////////////// +// Impl for Haystack +///////////////////////////////////////////////////////////////////////////// + +impl<'hs> Haystack for &'hs Slice { + type Cursor = usize; + + fn cursor_at_front(&self) -> usize { 0 } + fn cursor_at_back(&self) -> usize { self.inner.len() } + + unsafe fn split_at_cursor_unchecked(self, pos: usize) -> (Self, Self) { + // SAFETY: Caller promises cursor is valid. 
+ unsafe { (get_unchecked(&self, ..pos), get_unchecked(&self, pos..)) } + } +} + +///////////////////////////////////////////////////////////////////////////// +// Impl Pattern for char +///////////////////////////////////////////////////////////////////////////// + +impl<'hs> Pattern<&'hs Slice> for char { + type Searcher = CharSearcher<'hs>; + + fn into_searcher(self, slice: &'hs Slice) -> Self::Searcher { + Self::Searcher::new(slice, self) + } + + fn is_contained_in(self, slice: &'hs Slice) -> bool { + let mut buf = [0; 4]; + slice.inner.contains(self.encode_utf8(&mut buf).as_bytes()) + } + + fn is_prefix_of(self, slice: &'hs Slice) -> bool { + let mut buf = [0; 4]; + slice.inner.starts_with(self.encode_utf8(&mut buf).as_bytes()) + } + + fn is_suffix_of(self, slice: &'hs Slice) -> bool { + let mut buf = [0; 4]; + slice.inner.ends_with(self.encode_utf8(&mut buf).as_bytes()) + } + + fn strip_prefix_of(self, slice: &'hs Slice) -> Option<&'hs Slice> { + let mut buf = [0; 4]; + let needle = self.encode_utf8(&mut buf).as_bytes(); + slice.inner.starts_with(needle).then(|| { + // SAFETY: We’ve just checked slice starts with needle. + unsafe { get_unchecked(slice, needle.len()..) } + }) + } + + fn strip_suffix_of(self, slice: &'hs Slice) -> Option<&'hs Slice> { + let mut buf = [0; 4]; + let needle = self.encode_utf8(&mut buf).as_bytes(); + slice.inner.ends_with(needle).then(|| { + // SAFETY: We’ve just checked slice ends with needle, so the slice is at + // least `needle.len()` bytes long and the subtraction cannot underflow. + unsafe { get_unchecked(slice, ..slice.inner.len() - needle.len()) } + }) + } +} + +// NOTE(review): `searcher` borrows from the heap buffer owned by `_needle` +// (see `new`), so the derived `Clone` looks like it leaves the clone’s +// searcher pointing into the original’s allocation. Confirm before relying +// on `Clone`. +#[derive(Clone, Debug)] +pub struct CharSearcher<'hs> { + /// Zero-padded UTF-8 encoded character we’re searching for. + _needle: Box<[u8; 4]>, + /// Slice searcher over the slice. + searcher: <&'hs [u8] as Pattern<&'hs [u8]>>::Searcher, +} + +impl<'hs> CharSearcher<'hs> { + fn new(slice: &'hs Slice, chr: char) -> Self { + let mut buf = [0; 4]; + let len = chr.encode_utf8(&mut buf).len(); + let needle = Box::new(buf); + // XXX: This is potentially unsound?
We’re transmuting needle’s + // lifetime to 'hs which is definitely not true, but at the same time + // Searcher dies when needle dies so it won’t reference it after it + // dies. + let pattern: &'hs [u8] = unsafe { core::mem::transmute(&needle[..len]) }; + Self { + _needle: needle, + searcher: pattern.into_searcher(&slice.inner) + } + } +} + +unsafe impl<'hs> pattern::Searcher<&'hs Slice> for CharSearcher<'hs> { + fn haystack(&self) -> &'hs Slice { + Slice::from_u8_slice(self.searcher.haystack()) + } + + fn next(&mut self) -> SearchStep { + self.searcher.next() + } + + fn next_match(&mut self) -> Option<(usize, usize)> { + self.searcher.next_match() + } + + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.searcher.next_match() + } +} + +unsafe impl<'hs> pattern::ReverseSearcher<&'hs Slice> for CharSearcher<'hs> { + fn next_back(&mut self) -> SearchStep { + self.searcher.next_back() + } + + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.searcher.next_match_back() + } + + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.searcher.next_match_back() + } +} + +impl<'hs> pattern::DoubleEndedSearcher<&'hs Slice> for CharSearcher<'hs> {} + +///////////////////////////////////////////////////////////////////////////// +// Impl Pattern for &FnMut(char) +///////////////////////////////////////////////////////////////////////////// + +// XXX TODO +// This is work-around of the following: +// error[E0210]: type parameter `F` must be covered by another type when it +// appears before the first local type (`pattern::Slice`) +// --> library/std/src/sys/unix/os_str/pattern.rs:148:11 +// | +// 148 | impl<'hs, F: FnMut(char) -> bool> Pattern<&'hs Slice> for F { +// | ^ type parameter `F` must be covered by another type when +// it appears before the first local type (`pattern::Slice`) +// | +pub struct Predicate(F); + +#[rustc_has_incoherent_inherent_impls] +impl<'hs, F: FnMut(char) -> bool> Pattern<&'hs Slice> for F { + type Searcher = 
PredicateSearcher<'hs, F>; + + fn into_searcher(self, slice: &'hs Slice) -> Self::Searcher { + Self::Searcher::new(slice, self) + } + + fn is_prefix_of(mut self, slice: &'hs Slice) -> bool { + matches!(try_first_code_point(&slice.inner), + Some((chr, _)) if self(chr)) + } + + fn is_suffix_of(mut self, slice: &'hs Slice) -> bool { + matches!(try_last_code_point(&slice.inner), + Some((chr, _)) if self(chr)) + } + + fn strip_prefix_of(mut self, slice: &'hs Slice) -> Option<&'hs Slice> { + let bytes = &slice.inner; + if let Some((chr, len)) = try_first_code_point(bytes) { + if self(chr) { + return Some(Slice::from_u8_slice(&bytes[len..])); + } + } + None + } + + fn strip_suffix_of(mut self, slice: &'hs Slice) -> Option<&'hs Slice> { + let bytes = &slice.inner; + if let Some((chr, len)) = try_last_code_point(bytes) { + if self(chr) { + return Some(Slice::from_u8_slice(&bytes[..bytes.len() - len])); + } + } + None + } +} + +#[derive(Clone, Debug)] +pub struct PredicateSearcher<'hs, F> { + slice: &'hs Slice, + pred: F, + + start: usize, + end: usize, + fw_match_len: usize, + bw_match_len: usize, +} + +impl<'hs, F: FnMut(char) -> bool> PredicateSearcher<'hs, F> { + fn new(slice: &'hs Slice, pred: F) -> Self { + Self { + slice: slice, + pred, + start: 0, + // The search window `start..end` initially covers the whole haystack; + // with `end: 0` every search loop would terminate immediately. + end: slice.inner.len(), + fw_match_len: 0, + bw_match_len: 0, + } + } + + /// Looks for the next match and returns its position and length. Doesn’t + /// update searcher’s state. + fn next_match_impl(&mut self) -> Option<(usize, usize)> { + let bytes = &self.slice.inner[..self.end]; + let mut pos = self.start; + while pos < bytes.len() { + pos += count_utf8_cont_bytes(bytes[pos..].iter()); + if let Some((chr, len)) = try_first_code_point(&bytes[pos..]) { + if (self.pred)(chr) { + return Some((pos, len)) + } + pos += len; + } else { + pos += 1; + } + } + None + } + + /// Implementation of Searcher::next and Searcher::next_match functions.
+ fn next_impl(&mut self) -> R { + while self.start < self.end { + if self.fw_match_len == 0 { + let (pos, len) = self.next_match_impl().unwrap_or((self.end, 0)); + self.fw_match_len = len; + let start = self.start; + if pos != start { + self.start = pos; + if let Some(ret) = R::rejecting(start, pos) { + return ret; + } + } + } + + debug_assert_ne!(0, self.fw_match_len); + let pos = self.start; + self.start += self.fw_match_len; + self.fw_match_len = 0; + if let Some(ret) = R::matching(pos, self.start) { + return ret; + } + } + R::DONE + } + + /// Looks for the next match back and returns its position and length. + /// Doesn’t update searcher’s state. + fn next_match_back_impl(&mut self) -> Option<(usize, usize)> { + let mut bytes = &self.slice.inner[self.start..self.end]; + while !bytes.is_empty() { + let pos = bytes.len() - count_utf8_cont_bytes(bytes.iter().rev()); + let pos = pos.checked_sub(1)?; + if let Some((chr, len)) = try_first_code_point(&bytes[pos..]) { + if (self.pred)(chr) { + return Some((pos + self.start, len)) + } + } + bytes = &bytes[..pos] + } + None + } + + /// Implementation of ReverseSearcher::next and ReverseSearcher::next_match + /// functions. 
+ fn next_back_impl(&mut self) -> R { + while self.start < self.end { + if self.bw_match_len == 0 { + let end = self.end; + let (pos, len) = self.next_match_back_impl().unwrap_or((end, 0)); + self.bw_match_len = len; + if pos + len != end { + self.end = pos + len; + if let Some(ret) = R::rejecting(self.end, end) { + return ret; + } + } + } + + debug_assert_ne!(0, self.bw_match_len); + let end = self.end; + self.end -= self.bw_match_len; + self.bw_match_len = 0; + if let Some(ret) = R::matching(self.end, end) { + return ret; + } + } + R::DONE + } +} + +unsafe impl<'hs, F: FnMut(char) -> bool> pattern::Searcher<&'hs Slice> for PredicateSearcher<'hs, F> { + fn haystack(&self) -> &'hs Slice { self.slice } + + fn next(&mut self) -> SearchStep { + self.next_impl() + } + + fn next_match(&mut self) -> Option<(usize, usize)> { + self.next_impl::().0 + } + + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.next_impl::().0 + } +} + +unsafe impl<'hs, F: FnMut(char) -> bool> pattern::ReverseSearcher<&'hs Slice> for PredicateSearcher<'hs, F> { + fn next_back(&mut self) -> SearchStep { + self.next_back_impl() + } + + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.next_back_impl::().0 + } + + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.next_back_impl::().0 + } +} + +impl<'hs, F: FnMut(char) -> bool> pattern::DoubleEndedSearcher<&'hs Slice> for PredicateSearcher<'hs, F> {} + +///////////////////////////////////////////////////////////////////////////// + +/// Possible return type of a search. +/// +/// It abstract differences between `next`, `next_match` and `next_reject` +/// methods. Depending on return type an implementation for those functions +/// will generate matches and rejects, only matches or only rejects. 
+trait SearchReturn: Sized { + const DONE: Self; + fn matching(start: usize, end: usize) -> Option<Self>; + fn rejecting(start: usize, end: usize) -> Option<Self>; +} + +struct MatchOnly(Option<(usize, usize)>); +struct RejectOnly(Option<(usize, usize)>); + +impl SearchReturn for SearchStep { + const DONE: Self = SearchStep::Done; + fn matching(s: usize, e: usize) -> Option<Self> { + Some(SearchStep::Match(s, e)) + } + fn rejecting(s: usize, e: usize) -> Option<Self> { + Some(SearchStep::Reject(s, e)) + } +} + +impl SearchReturn for MatchOnly { + const DONE: Self = Self(None); + fn matching(s: usize, e: usize) -> Option<Self> { Some(Self(Some((s, e)))) } + fn rejecting(_s: usize, _e: usize) -> Option<Self> { None } +} + +impl SearchReturn for RejectOnly { + const DONE: Self = Self(None); + fn matching(_s: usize, _e: usize) -> Option<Self> { None } + fn rejecting(s: usize, e: usize) -> Option<Self> { Some(Self(Some((s, e)))) } +} + + +unsafe fn get_unchecked<I>(slice: &Slice, index: I) -> &Slice +where I: core::slice::SliceIndex<[u8], Output = [u8]>, +{ + // SAFETY: Caller Promises index is valid. + Slice::from_u8_slice(unsafe { slice.inner.get_unchecked(index) }) +} + + +/// Tries to extract UTF-8 sequence from the end of the slice. +/// +/// If last bytes of the slice don’t form a valid UTF-8 sequence (or if slice is +/// empty), returns `None`. If they do, decodes the character and returns its +/// encoded length. +fn try_last_code_point(bytes: &[u8]) -> Option<(char, usize)> { + // Fast path: ASCII + let last = *bytes.last()?; + if last < 0x80 { + // SAFETY: `last` is below 0x80, i.e. an ASCII value, which is a valid `char`. + return Some((unsafe { char::from_u32_unchecked(last as u32) }, 1)); + } + + // Count how many continuation bytes there are at the end. + let count = count_utf8_cont_bytes(bytes.iter().rev().take(4)); + if count == bytes.len() || count >= 4 { + return None; + } + let pos = bytes.len() - count - 1; + + // Try decode. If length matches, we have ourselves a character.
+ let (chr, len) = try_first_code_point(&bytes[pos..])?; + (len == count + 1).then_some((chr, len)) +} + + +/// Counts UTF-8 continuation bytes at the beginning of the iterator. +/// +/// A continuation byte has the form `10xxxxxx`, i.e. it is below -64 when +/// reinterpreted as `i8`. +#[inline] +fn count_utf8_cont_bytes<'a>(bytes: impl Iterator<Item = &'a u8>) -> usize { + bytes.take_while(|&&byte| (byte as i8) < -64).count() +}