Skip to content

optimize str.replace #130223

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 17, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion library/alloc/src/str.rs
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@ pub use core::str::SplitInclusive;
pub use core::str::SplitWhitespace;
#[stable(feature = "rust1", since = "1.0.0")]
pub use core::str::pattern;
use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher};
use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher, Utf8Pattern};
#[stable(feature = "rust1", since = "1.0.0")]
pub use core::str::{Bytes, CharIndices, Chars, from_utf8, from_utf8_mut};
#[stable(feature = "str_escape", since = "1.34.0")]
@@ -268,6 +268,18 @@ impl str {
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn replace<P: Pattern>(&self, from: P, to: &str) -> String {
// Fast path for ASCII to ASCII case.

if let Some(from_byte) = match from.as_utf8_pattern() {
Some(Utf8Pattern::StringPattern([from_byte])) => Some(*from_byte),
Some(Utf8Pattern::CharPattern(c)) => c.as_ascii().map(|ascii_char| ascii_char.to_u8()),
_ => None,
} {
if let [to_byte] = to.as_bytes() {
return unsafe { replace_ascii(self.as_bytes(), from_byte, *to_byte) };
}
}

let mut result = String::new();
let mut last_end = 0;
for (start, part) in self.match_indices(from) {
@@ -661,3 +673,14 @@ fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8) -> Vec<u8> {

out
}
#[inline]
#[cfg(not(test))]
#[cfg(not(no_global_oom_handling))]
#[allow(dead_code)]
/// Faster implementation of string replacement for ASCII to ASCII cases.
/// Should produce fast vectorized code.
///
/// Builds a new `String` with the same length as `utf8_bytes` in which every
/// byte equal to `from` is replaced by `to`; all other bytes are copied
/// unchanged.
///
/// # Safety
///
/// The output is created with `String::from_utf8_unchecked`, so nothing is
/// validated here. The caller must guarantee that:
///
/// * `utf8_bytes` is valid UTF-8, and
/// * both `from` and `to` are ASCII bytes.
///
/// ASCII bytes never occur inside a multi-byte UTF-8 sequence, so swapping
/// one ASCII byte for another keeps the buffer valid UTF-8.
unsafe fn replace_ascii(utf8_bytes: &[u8], from: u8, to: u8) -> String {
// Single pass over the input: substitute matching bytes, copy the rest.
let result: Vec<u8> = utf8_bytes.iter().map(|b| if *b == from { to } else { *b }).collect();
// SAFETY: We replaced ascii with ascii on valid utf8 strings.
unsafe { String::from_utf8_unchecked(result) }
}
7 changes: 6 additions & 1 deletion library/alloc/src/string.rs
Original file line number Diff line number Diff line change
@@ -53,7 +53,7 @@ use core::ops::AddAssign;
#[cfg(not(no_global_oom_handling))]
use core::ops::Bound::{Excluded, Included, Unbounded};
use core::ops::{self, Range, RangeBounds};
use core::str::pattern::Pattern;
use core::str::pattern::{Pattern, Utf8Pattern};
use core::{fmt, hash, ptr, slice};

#[cfg(not(no_global_oom_handling))]
@@ -2424,6 +2424,11 @@ impl<'b> Pattern for &'b String {
{
self[..].strip_suffix_of(haystack)
}

/// Exposes the string's UTF-8 bytes as a [`Utf8Pattern::StringPattern`].
#[inline]
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
    let bytes = self.as_bytes();
    Some(Utf8Pattern::StringPattern(bytes))
}
}

macro_rules! impl_eq {
33 changes: 33 additions & 0 deletions library/core/src/str/pattern.rs
Original file line number Diff line number Diff line change
@@ -160,6 +160,19 @@ pub trait Pattern: Sized {
None
}
}

/// Returns the pattern as UTF-8 bytes if possible.
///
/// This is an optional optimization hook: callers may inspect the returned
/// [`Utf8Pattern`] to take a specialized fast path. The default
/// implementation returns `None`, so implementors only need to override it
/// when their pattern has a UTF-8 representation to expose.
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
    None
}
Copy link
Member

@BurntSushi BurntSushi Oct 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why doesn't this return None as a default implementation? As it stands, this seems to cause needless breakage to crates that implement this trait. And even aside from the breakage, as a matter of API design, given that this is an optional optimization, it seems like this should return None by default.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes 100%, sorry. Fix: #132113

}
/// Borrowed view of a [`Pattern`]'s contents, as produced by
/// [`Pattern::as_utf8_pattern()`].
///
/// Lets callers inspect a pattern whose underlying representation can be
/// expressed as UTF-8.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Utf8Pattern<'a> {
    /// The bytes of a string pattern; returned by `String` and `str` types.
    StringPattern(&'a [u8]),
    /// The character of a single-char pattern; returned by `char` types.
    CharPattern(char),
}

// Searcher
@@ -599,6 +612,11 @@ impl Pattern for char {
{
self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack)
}

/// Exposes the character itself as a [`Utf8Pattern::CharPattern`].
#[inline]
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
    let ch = *self;
    Some(Utf8Pattern::CharPattern(ch))
}
}

/////////////////////////////////////////////////////////////////////////////
@@ -657,6 +675,11 @@ impl<C: MultiCharEq> Pattern for MultiCharEqPattern<C> {
fn into_searcher(self, haystack: &str) -> MultiCharEqSearcher<'_, C> {
MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() }
}

/// This pattern type exposes no UTF-8 representation, so it always
/// reports `None`.
#[inline]
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
None
}
}

unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
@@ -747,6 +770,11 @@ macro_rules! pattern_methods {
{
($pmap)(self).strip_suffix_of(haystack)
}

// Patterns generated through this macro expose no UTF-8 representation,
// so the method always reports `None`.
#[inline]
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
None
}
};
}

@@ -1022,6 +1050,11 @@ impl<'b> Pattern for &'b str {
None
}
}

/// Exposes the string slice's UTF-8 bytes as a [`Utf8Pattern::StringPattern`].
#[inline]
fn as_utf8_pattern(&self) -> Option<Utf8Pattern<'_>> {
    let bytes = self.as_bytes();
    Some(Utf8Pattern::StringPattern(bytes))
}
}

/////////////////////////////////////////////////////////////////////////////