Allow StringArray[python] to be backed by numpy StringDType in numpy 2.0 #58578


Draft: lithomas1 wants to merge 73 commits into base: main
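For context, NumPy 2.0 ships the variable-width UTF-8 string dtype from NEP 55; its arrays report dtype kind "T", which is the check this diff relies on throughout. A minimal sketch of the dtype being targeted (assuming numpy >= 2.0; the pandas-facing API in this draft is still in flux):

```python
import numpy as np  # requires numpy >= 2.0

# NEP 55 variable-width string dtype; na_object opts in to missing values
dt = np.dtypes.StringDType(na_object=np.nan)
arr = np.array(["pandas", np.nan, "stringdtype"], dtype=dt)

print(arr.dtype.kind)  # "T" -- the kind checked throughout this diff
print(np.isnan(arr))   # [False  True False]
```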
Commits (73)
56ae252 WIP: preliminary support for stringdtype (ngoldbaum, Apr 4, 2023)
206d2f0 add NumpyStringArray and string[numpy] dtype (ngoldbaum, Apr 26, 2023)
5adadfa WIP: making progress (ngoldbaum, May 15, 2023)
a1175f2 fix factorize (ngoldbaum, May 19, 2023)
d6d21c8 Merge branch 'main' into stringdtype (ngoldbaum, May 26, 2023)
7426cd5 adapt to new PandasStringDType and circular dependency on pandas (ngoldbaum, May 31, 2023)
2f4ab45 Merge branch 'main' into stringdtype (ngoldbaum, May 31, 2023)
d92f0cb Merge branch 'main' into stringdtype (ngoldbaum, Jun 1, 2023)
8e59bba fix more tests (ngoldbaum, Jun 7, 2023)
f2f0798 Merge branch 'main' into stringdtype (ngoldbaum, Jun 7, 2023)
64f85d3 fix remaining ExtensionArray tests (ngoldbaum, Jun 14, 2023)
82922d0 Merge branch 'main' into stringdtype (ngoldbaum, Jun 14, 2023)
1654f8b deal with stringdtype not coercing NaN and None to NA (ngoldbaum, Jun 16, 2023)
2d736ae Merge branch 'main' into stringdtype (ngoldbaum, Jun 23, 2023)
87e2d14 adapt to stringdtype getting rid of PandasStringDType (ngoldbaum, Jul 11, 2023)
c68ea5a Merge branch 'main' into stringdtype (ngoldbaum, Jul 20, 2023)
0f0589e support latest version of stringdtype (ngoldbaum, Aug 1, 2023)
ca39aaf Merge branch 'main' into stringdtype (ngoldbaum, Aug 1, 2023)
88c7d5d Merge branch 'main' into stringdtype (ngoldbaum, Aug 10, 2023)
41ab894 adapt to changes in pandas and stringdtype (ngoldbaum, Aug 10, 2023)
43b3ce7 avoid copy when loading numpy string data (ngoldbaum, Aug 29, 2023)
13cf458 Merge branch 'main' into stringdtype (ngoldbaum, Aug 29, 2023)
ffb5ab7 Merge branch 'main' into stringdtype (ngoldbaum, Nov 15, 2023)
e6a6d6d Merge branch 'main' into stringdtype (ngoldbaum, Dec 6, 2023)
8cf1081 Merge branch 'main' into stringdtype (ngoldbaum, Feb 19, 2024)
7e5ea63 update to work with stringdtype in numpy (ngoldbaum, Feb 20, 2024)
6a5563f Merge branch 'main' into stringdtype (ngoldbaum, Feb 20, 2024)
65abaa6 some fixes for numpy support (ngoldbaum, Mar 11, 2024)
43a6a2a Merge branch 'main' into stringdtype (ngoldbaum, Mar 11, 2024)
23f594b Merge branch 'main' into stringdtype (ngoldbaum, Mar 14, 2024)
85609ca fix coercion tests (ngoldbaum, Mar 14, 2024)
86ffe1c more test fixes (ngoldbaum, Mar 15, 2024)
dc9419d fix memory usage test (ngoldbaum, Mar 18, 2024)
155ec68 Avoid copying in NumpyStringArray initializer (ngoldbaum, Mar 19, 2024)
8dadaf9 more fixes (ngoldbaum, Mar 26, 2024)
aad5f32 fix SyntaxError (ngoldbaum, Mar 27, 2024)
190ffe3 fix comparisons with scalars (ngoldbaum, Mar 27, 2024)
dcf2cec Implement some ufuncs (ngoldbaum, Apr 2, 2024)
b5cdea8 Add index/rindex (ngoldbaum, Apr 2, 2024)
ba0a8b4 drop unnecessary type annotations in map_infer_mask (ngoldbaum, Apr 2, 2024)
10437a0 Merge branch 'main' into stringdtype (ngoldbaum, Apr 19, 2024)
5691409 Add more string method implementations (ngoldbaum, Apr 19, 2024)
4b3e48b delete unnecessary input sanitization (ngoldbaum, Apr 19, 2024)
1e1d651 hotfix issue with hashing (ngoldbaum, Apr 24, 2024)
d27816c Avoid unnecessary copies in NumpyStringArray initializer (ngoldbaum, Apr 26, 2024)
19d85bb copy to hotfix issue in groupby (ngoldbaum, Apr 26, 2024)
11778ed Add stringdtype to more test fixtures (ngoldbaum, Apr 26, 2024)
2034a25 revert unnecessary changes to ObjectStringArrayMixin._str_map (ngoldbaum, Apr 26, 2024)
151fe64 handle NA values for inputs that might be coerced to string (ngoldbaum, Apr 26, 2024)
8394495 remove implementations for string methods that won't be available unt… (ngoldbaum, Apr 26, 2024)
aa7cec9 delegate to superclass for some startswith and endswith parameters (ngoldbaum, Apr 26, 2024)
dfedd1e fix null entries in findlike ufuncs (ngoldbaum, Apr 26, 2024)
d64dcf8 revert np min API version and try to fix tests (lithomas1, Apr 29, 2024)
8e32211 modify base object string array instead (lithomas1, May 4, 2024)
187d068 go for green (lithomas1, May 5, 2024)
3626c63 try again for green (lithomas1, May 5, 2024)
908c9e1 hopefully fix hashtable stuff (lithomas1, May 6, 2024)
70be1f6 wip (lithomas1, May 7, 2024)
ffe133b Update test for directly passing in numpy StringDType arrays (ngoldbaum, May 10, 2024)
b684da0 xfail memory usage test (ngoldbaum, May 10, 2024)
a202f1b Merge branch 'stringdtype2' of github.com:lithomas1/pandas into strin… (lithomas1, May 17, 2024)
1a0e783 fixup merge conflict (lithomas1, May 17, 2024)
7e0649f update (lithomas1, May 19, 2024)
fd2ba65 fix ci (lithomas1, May 20, 2024)
f301506 try to fix rest (lithomas1, May 20, 2024)
2c46b75 avoid nanops test failures (ngoldbaum, May 24, 2024)
4a538a0 fix ruff lints (ngoldbaum, May 24, 2024)
c88884a fix cython lints (ngoldbaum, May 24, 2024)
d0e3f1e fix more fuff lints (ngoldbaum, May 24, 2024)
a175c7a run ruff-format (ngoldbaum, May 24, 2024)
961a67c tweak for nanops case (ngoldbaum, May 24, 2024)
37143be Merge branch 'main' into stringdtype2 (ngoldbaum, May 30, 2024)
fbabedc Merge branch 'main' of github.com:pandas-dev/pandas into stringdtype2 (lithomas1, Aug 30, 2024)
1 change: 1 addition & 0 deletions asv_bench/asv.conf.json
@@ -42,6 +42,7 @@
// followed by the pip installed packages).
"matrix": {
"pip+build": [],
"numpy": ["2.0rc1"],
"Cython": ["3.0"],
"matplotlib": [],
"sqlalchemy": [],
3 changes: 3 additions & 0 deletions pandas/_libs/hashtable.pyx
@@ -25,6 +25,9 @@ from pandas._libs.khash cimport (
are_equivalent_float64_t,
are_equivalent_khcomplex64_t,
are_equivalent_khcomplex128_t,
kh_end,
kh_exist,
kh_key,
kh_needed_n_buckets,
kh_python_hash_equal,
kh_python_hash_func,
80 changes: 71 additions & 9 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -5,6 +5,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
from cpython.unicode cimport PyUnicode_AsUTF8

from numpy cimport (
flatiter,
PyArray_GETITEM,
PyArray_ITER_DATA,
PyArray_ITER_NEXT,
PyArray_IterNew,
)


from libc.string cimport strdup

{{py:

# name
@@ -970,7 +981,12 @@ cdef class StringHashTable(HashTable):
kh_resize_str(self.table, size_hint)

def __dealloc__(self):
cdef:
khiter_t k
if self.table is not NULL:
for k in range(kh_end(self.table)):
if kh_exist(self.table, k):
free(<char*>kh_key(self.table, k))
kh_destroy_str(self.table)
self.table = NULL

@@ -1013,6 +1029,8 @@ cdef class StringHashTable(HashTable):

v = PyUnicode_AsUTF8(key)

v = strdup(v)

k = kh_put_str(self.table, v, &ret)
if kh_exist_str(self.table, k):
self.table.vals[k] = val
@@ -1051,7 +1069,7 @@ cdef class StringHashTable(HashTable):
return labels

@cython.boundscheck(False)
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
def lookup(self, ndarray values, object mask = None) -> ndarray:
# -> np.ndarray[np.intp]
# mask not yet implemented
cdef:
@@ -1061,22 +1079,34 @@
const char *v
khiter_t k
intp_t[::1] locs = np.empty(n, dtype=np.intp)
flatiter it = PyArray_IterNew(values)

# these by-definition *must* be strings
vecs = <const char **>malloc(n * sizeof(char *))
if vecs is NULL:
raise MemoryError()
for i in range(n):
val = values[i]
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))

if isinstance(val, str):
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize
# it as a str, even though isinstance does.
v = PyUnicode_AsUTF8(<str>val)
else:
v = PyUnicode_AsUTF8(self.na_string_sentinel)

# Need to copy result from PyUnicode_AsUTF8 when we have
# numpy strings
# Since numpy strings aren't backed by object arrays
# the buffer returned by PyUnicode_AsUTF8 will get freed
# in the next iteration when the created str object is GC'ed,
# clobbering the value of v
v = strdup(v)
Review comment (Member):

I'm a bit wary of managing the lifecycle this way - so the existing implementation has no ownership of the string lifecycle then, right? It's probably easier to make this a StringView hash table, then, and create a dedicated String hash table which does copy.

This is another case where using C++ would be a better language choice than tempita (see also https://github.com/pandas-dev/pandas/pull/57730/files).

Review comment (Member Author):

Yeah, editing anything in tempita kinda sucks in general.

But yes, I think the existing implementation doesn't have ownership of the Python string objects.

Turning this into StringViewHashTable, and subclassing it, sounds good to me.

Review comment (Contributor):

You could get the UTF-8 string data from the array entry directly, without going through PyArray_GETITEM, via the NumPy C API:

https://numpy.org/neps/nep-0055-string_dtype.html#packing-and-loading-strings

There aren't Cython bindings for this API yet in the numpy Cython bindings, but it's on my list of things to do. It probably makes sense to manage the allocators with a context manager, for example.

I also see that the new C API isn't yet covered in the C API docs, and I need to make sure there are docs for the stringdtype C API before the 2.0 release happens. Thank you for prompting me to notice that oversight!
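A minimal Python illustration of the lifetime issue the strdup() calls above work around (assuming numpy >= 2.0): indexing an object array hands back the stored str, whose UTF-8 buffer lives as long as the array does, while indexing a StringDType array materializes a fresh temporary str on every access.

```python
import numpy as np

obj = np.array(["hello"], dtype=object)
# Object arrays return the stored str itself, so a pointer obtained
# from PyUnicode_AsUTF8 on it stays valid while the array is alive.
assert obj[0] is obj[0]

sdt = np.array(["hello"], dtype=np.dtypes.StringDType())
# StringDType arrays build a new str on each access; once that
# temporary is garbage-collected, a PyUnicode_AsUTF8 pointer into it
# dangles, which is why the loops in this diff copy with strdup().
assert sdt[0] is not sdt[0]
```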


vecs[i] = v

PyArray_ITER_NEXT(it)

with nogil:
for i in range(n):
v = vecs[i]
@@ -1086,11 +1116,16 @@ cdef class StringHashTable(HashTable):
else:
locs[i] = -1

if values.dtype.kind == "T":
# free copied strings
for i in range(n):
free(vecs[i])

free(vecs)
return np.asarray(locs)

@cython.boundscheck(False)
def map_locations(self, ndarray[object] values, object mask = None) -> None:
def map_locations(self, ndarray values, object mask = None) -> None:
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
@@ -1099,32 +1134,45 @@ cdef class StringHashTable(HashTable):
const char *v
const char **vecs
khiter_t k
flatiter it = PyArray_IterNew(values)

# these by-definition *must* be strings
vecs = <const char **>malloc(n * sizeof(char *))
if vecs is NULL:
raise MemoryError()
for i in range(n):
val = values[i]
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))

if isinstance(val, str):
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize
# it as a str, even though isinstance does.
v = PyUnicode_AsUTF8(<str>val)
else:
v = PyUnicode_AsUTF8(self.na_string_sentinel)

# Need to copy result from PyUnicode_AsUTF8 when we have
# numpy strings
# Since numpy strings aren't backed by object arrays
# the buffer returned by PyUnicode_AsUTF8 will get freed
# in the next iteration when the created str object is GC'ed,
# clobbering the value of v
v = strdup(v)
Review comment (Member):

This probably leaks in the current implementation.

Review comment (Member Author, lithomas1, May 8, 2024):

I should be freeing these strings in __dealloc__ if I didn't mess this up.

EDIT: Nevermind, I'm stupid 😓

Review comment (Member):

No, I wouldn't put it there either - __dealloc__ is the inverse of __cinit__; any memory allocations performed outside of those functions need to be managed with their own explicit lifecycle.


vecs[i] = v

PyArray_ITER_NEXT(it)

with nogil:
for i in range(n):
v = vecs[i]
k = kh_put_str(self.table, v, &ret)
self.table.vals[k] = i

free(vecs)

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique(self, ndarray[object] values, ObjectVector uniques,
def _unique(self, ndarray values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
@@ -1171,11 +1219,13 @@ cdef class StringHashTable(HashTable):
const char **vecs
khiter_t k
bint use_na_value
flatiter it = PyArray_IterNew(values)
bint non_null_na_value

if return_inverse:
labels = np.zeros(n, dtype=np.intp)
uindexer = np.empty(n, dtype=np.int64)

use_na_value = na_value is not None
non_null_na_value = not checknull(na_value)

@@ -1184,7 +1234,7 @@
if vecs is NULL:
raise MemoryError()
for i in range(n):
val = values[i]
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))

if (ignore_na
and (not isinstance(val, str)
Expand All @@ -1202,10 +1252,22 @@ cdef class StringHashTable(HashTable):
# if ignore_na is False, we also stringify NaN/None/etc.
try:
v = PyUnicode_AsUTF8(<str>val)
except UnicodeEncodeError:
except (UnicodeEncodeError, TypeError):
# pd.NA will raise TypeError
v = PyUnicode_AsUTF8(<str>repr(val))

# Need to copy result from PyUnicode_AsUTF8 when we have
# numpy strings
# Since numpy strings aren't backed by object arrays
# the buffer returned by PyUnicode_AsUTF8 will get freed
# in the next iteration when the created str object is GC'ed,
# clobbering the value of v
v = strdup(v)

vecs[i] = v

PyArray_ITER_NEXT(it)

# compute
with nogil:
for i in range(n):
@@ -1239,7 +1301,7 @@ cdef class StringHashTable(HashTable):
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None):
def unique(self, ndarray values, *, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

@@ -1264,7 +1326,7 @@
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
def factorize(self, ndarray values, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None, ignore_na=True):
"""
Calculate unique values and labels (no sorting!)
8 changes: 8 additions & 0 deletions pandas/_libs/khash.pxd
@@ -125,5 +125,13 @@ cdef extern from "pandas/vendored/klib/khash_python.h":

khuint_t kh_needed_n_buckets(khuint_t element_n) nogil

# Needed to free the strings we copied in StringHashTable

khuint_t kh_end(kh_str_t* h) nogil

int kh_exist(kh_str_t* h, khuint_t x) nogil

void* kh_key(kh_str_t* h, khuint_t x) nogil


include "khash_for_primitive_helper.pxi"
51 changes: 27 additions & 24 deletions pandas/_libs/lib.pyx
@@ -675,41 +675,36 @@ def is_sequence_range(const int6432_t[:] sequence, int64_t step) -> bool:
return True


ctypedef fused ndarr_object:
ndarray[object, ndim=1]
ndarray[object, ndim=2]

# TODO: get rid of this in StringArray and modify
# and go through ensure_string_array instead


@cython.wraparound(False)
@cython.boundscheck(False)
def convert_nans_to_NA(ndarr_object arr) -> ndarray:
def convert_nans_to_NA(ndarray arr) -> ndarray:
"""
Helper for StringArray that converts null values that
are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements
have already been validated as null.
"""
cdef:
Py_ssize_t i, m, n
Py_ssize_t i
Py_ssize_t n = len(arr)
object val
ndarr_object result
result = np.asarray(arr, dtype="object")
if arr.ndim == 2:
m, n = arr.shape[0], arr.shape[1]
for i in range(m):
for j in range(n):
val = arr[i, j]
if not isinstance(val, str):
result[i, j] = <object>C_NA
else:
n = len(arr)
for i in range(n):
val = arr[i]
if not isinstance(val, str):
result[i] = <object>C_NA
return result
flatiter it = cnp.PyArray_IterNew(arr)

for i in range(n):
# The PyArray_GETITEM and PyArray_ITER_NEXT are faster
# equivalents to `val = values[i]`
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))

# Not string so has to be null since they're already validated
if not isinstance(val, str):
val = <object>C_NA

PyArray_SETITEM(arr, PyArray_ITER_DATA(it), val)

PyArray_ITER_NEXT(it)
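A rough pure-Python equivalent of the rewritten loop, to show what the helper does (a sketch only; the real implementation is the Cython above, which also covers n-dimensional input via the flat iterator):

```python
import numpy as np
import pandas as pd

# Input is assumed pre-validated: every non-str entry is some null value.
arr = np.array(["a", np.nan, None, pd.NA], dtype=object)
for i, val in enumerate(arr.flat):
    if not isinstance(val, str):
        arr.flat[i] = pd.NA  # normalize np.nan/None/etc. to pd.NA

print(arr)  # ['a' <NA> <NA> <NA>]
```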


@cython.wraparound(False)
Expand Down Expand Up @@ -1475,6 +1470,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
- mixed
- unknown-array

Returns a dtype object for non-legacy numpy dtypes

Raises
------
TypeError
@@ -1585,6 +1582,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
if inferred is not None:
# Anything other than object-dtype should return here.
return inferred
elif values.dtype.kind == "T":
# NumPy StringDType
return values.dtype

if values.descr.type_num != NPY_OBJECT:
# i.e. values.dtype != np.object_
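Per the docstring note and the kind == "T" branch added above, under this draft inference on a StringDType array short-circuits to the dtype object itself rather than a string label (a sketch of the draft behavior, assuming numpy >= 2.0):

```python
import numpy as np
from pandas.api.types import infer_dtype

arr = np.array(["x", "y"], dtype=np.dtypes.StringDType())
# Draft behavior per the branch above: the dtype object comes back
# for non-legacy numpy dtypes, instead of a label like "string".
print(infer_dtype(arr))  # StringDType()
```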
@@ -1600,7 +1600,7 @@
it = PyArray_IterNew(values)
for i in range(n):
# The PyArray_GETITEM and PyArray_ITER_NEXT are faster
# equivalents to `val = values[i]`
# equivalents to `val = values[i]`
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
PyArray_ITER_NEXT(it)

@@ -1911,7 +1911,10 @@ cdef class StringValidator(Validator):
return isinstance(value, str)

cdef bint is_array_typed(self) except -1:
return self.dtype.type_num == cnp.NPY_UNICODE
if self.dtype.char == "T" or self.dtype.char == "U":
return True
# this lets user-defined string DTypes through
return issubclass(<object>self.dtype.typeobj, (np.str_, str))


cpdef bint is_string_array(ndarray values, bint skipna=False):
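With the StringValidator change above, is_string_array accepts fixed-width unicode arrays (kind "U"), NEP 55 arrays (kind "T"), and user-defined string dtypes whose scalar type subclasses str. A quick sketch of the intended behavior under this draft:

```python
import numpy as np
from pandas._libs import lib

# Fixed-width unicode passes today ("U" dtype)
print(lib.is_string_array(np.array(["a", "b"])))  # True
# Under this draft, NEP 55 StringDType arrays ("T") pass the same check
print(lib.is_string_array(np.array(["a", "b"], dtype=np.dtypes.StringDType())))
```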
9 changes: 8 additions & 1 deletion pandas/_libs/missing.pyi
@@ -1,3 +1,5 @@
from typing import overload

import numpy as np
from numpy import typing as npt

@@ -12,5 +14,10 @@ def is_matching_na(
def isposinf_scalar(val: object) -> bool: ...
def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object) -> bool: ...
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
@overload
def isnaobj(arr: np.ndarray, check_for_any_na=...) -> npt.NDArray[np.bool_]: ...
@overload
def isnaobj(
arr: np.ndarray, check_for_any_na=True
) -> tuple[npt.NDArray[np.bool_], bool]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
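As the updated stub suggests, isnaobj grows an opt-in second return value. A hypothetical usage sketch of the draft API (check_for_any_na is the parameter added in this PR, not part of released pandas):

```python
import numpy as np
from pandas._libs import missing

arr = np.array(["a", None], dtype=object)
mask = missing.isnaobj(arr)  # ndarray[bool], unchanged default behavior
# Draft addition: also report whether any NA was encountered at all.
mask, has_na = missing.isnaobj(arr, check_for_any_na=True)
```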
1 change: 1 addition & 0 deletions pandas/arrays/__init__.py
@@ -33,5 +33,6 @@
"PeriodArray",
"SparseArray",
"StringArray",
"ObjectStringArray",
"TimedeltaArray",
]
1 change: 0 additions & 1 deletion pandas/conftest.py
@@ -150,7 +150,6 @@ def pytest_collection_modifyitems(items, config) -> None:
("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is deprecated"),
("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"),
("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"),
("to_pytimedelta", "The behavior of TimedeltaProperties.to_pytimedelta"),
("NDFrame.reindex_like", "keyword argument 'method' is deprecated"),
# Docstring divides by zero to show behavior difference
("missing.mask_zero_div_zero", "divide by zero encountered"),