ENH: Reimplement DataFrame.lookup #61185

Open

wants to merge 20 commits into main
Changes from 1 commit
dev setup
stevenae committed Mar 26, 2025
commit a4057e5649e9d5488f1bc5d2ce8a22e342af6ae2
39 changes: 39 additions & 0 deletions dev_attempts.py
@@ -0,0 +1,39 @@
import pandas as pd
import numpy as np
import timeit

np.random.seed(43)
# also tested for n = 1000, 10_000, 100_000
n = 1_000_000
cols = list('abcdef')
df = pd.DataFrame(np.random.randint(0, 10, size=(n, len(cols))), columns=cols)
df['col'] = np.random.choice(cols, n)
# row labels and per-row column labels to feed to lookup
idx = df['col'].index.to_numpy()
cols = df['col'].to_numpy()  # note: rebinds `cols` from the column list to per-row labels


def og_lookup(idx, cols):
    # 'og' selects the original per-cell path in the dev version of DataFrame.lookup
    return df.lookup(idx, cols, 'og')


# def melt_lookup():
#     melt = df.melt('col')
#     melt = melt.loc[lambda x: x['col'] == x['variable'], 'value']
#     melt = melt.reset_index(drop=True)
#     return melt

# def quan_lookup(idx, cols):
#     return df.reindex(cols, axis=1).to_numpy()[np.arange(df.shape[0]), idx]

# def quan_lookup2(idx, cols):
#     return df.reindex(cols, axis=1).to_numpy()[np.arange(df.shape[0]), idx]

# def marco_lookup():
#     return df.melt('col', ignore_index=False).query('col==variable')['value'].reindex(df.index).to_numpy()


timeit.timeit(lambda: og_lookup(idx, cols), number=10)
# timeit.timeit(lambda: melt_lookup(idx, cols), number=10)
# timeit.timeit(lambda: quan_lookup(idx, cols), number=10)
# timeit.timeit(lambda: quan_lookup2(idx, cols), number=10)
# timeit.timeit(lambda: marco_lookup(idx, cols), number=10)

# idx, cols = pd.factorize(df['col'])
# df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
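
For context, a self-contained sketch of the factorize-based variant hinted at in the two commented lines above. The tiny frame and the names (df_small, codes, uniques, vals) are illustrative only and not part of the committed benchmark:

import numpy as np
import pandas as pd

# Build a tiny frame where each row names the column it wants a value from.
df_small = pd.DataFrame({'a': [10, 11, 12], 'b': [20, 21, 22]})
df_small['col'] = ['a', 'b', 'a']

# Factorize the per-row column labels into integer codes over the unique labels,
# reorder the columns to match those uniques, then pick one value per row.
codes, uniques = pd.factorize(df_small['col'])
vals = df_small.reindex(uniques, axis=1).to_numpy()
result = vals[np.arange(len(df_small)), codes]
print(result)  # [10 21 12]
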
55 changes: 55 additions & 0 deletions pandas/core/frame.py
@@ -101,6 +101,7 @@
     is_integer_dtype,
     is_iterator,
     is_list_like,
     is_object_dtype,
     is_scalar,
     is_sequence,
     needs_i8_conversion,
@@ -5135,6 +5136,60 @@ def _series(self):
    # ----------------------------------------------------------------------
    # Reindexing and alignment

    def lookup(self, row_labels, col_labels, dev_version) -> np.ndarray:
        """
        Label-based "fancy indexing" function for DataFrame.

        Given equal-length arrays of row and column labels, return an
        array of the values corresponding to each (row, col) pair.

        Parameters
        ----------
        row_labels : sequence
            The row labels to use for lookup.
        col_labels : sequence
            The column labels to use for lookup.
        dev_version : str
            Temporary development flag selecting the implementation to
            benchmark; 'og' (the original per-cell path) is the only value
            handled here.

        Returns
        -------
        numpy.ndarray
            The found values.
        """
Contributor:

I think it would be really useful to have an example here in the docs for the API.

Author:

Added, please take a look.
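
To illustrate, a minimal sketch of the kind of usage example being discussed (illustrative only; the exact example added to the docstring is not shown in this commit, and against this commit the temporary dev_version argument still has to be passed):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=['x', 'y', 'z'])
# One value per (row, column) pair: ('x', 'a'), ('y', 'b'), ('z', 'a')
df.lookup(['x', 'y', 'z'], ['a', 'b', 'a'], 'og')
# expected: array([1, 5, 3])
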

        n = len(row_labels)
        if n != len(col_labels):
            raise ValueError("Row labels must have same size as column labels")
        if not (self.index.is_unique and self.columns.is_unique):
            # GH#33041
            raise ValueError("DataFrame.lookup requires unique index and columns")

        thresh = 1000
        if not self._is_mixed_type or n > thresh:
            # Fast path: map labels to integer positions, then take everything
            # in one vectorized step via flat (row-major) indexing.
            values = self.values
            ridx = self.index.get_indexer(row_labels)
            cidx = self.columns.get_indexer(col_labels)
            if (ridx == -1).any():
                raise KeyError("One or more row labels was not found")
            if (cidx == -1).any():
                raise KeyError("One or more column labels was not found")
            flat_index = ridx * len(self.columns) + cidx
            result = values.flat[flat_index]
        else:
            # Small mixed-dtype frames: look up each cell individually.
            # 'og' is the only dev_version implemented in this commit.
            if dev_version == "og":
                result = np.empty(n, dtype="O")
                for i, (r, c) in enumerate(zip(row_labels, col_labels)):
                    result[i] = self._get_value(r, c)

        if is_object_dtype(result):
            result = lib.maybe_convert_objects(result)

        return result
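
As a side note, a standalone sketch of the flat-index step used in the fast path above: for a C-ordered 2-D array, element (i, j) lives at position i * n_cols + j of the flattened array, so a single values.flat take replaces a Python-level loop. The array and names here (arr, ridx, cidx) are illustrative, not from the pandas source:

import numpy as np

arr = np.arange(12).reshape(3, 4)        # 3 rows x 4 columns, C-ordered
ridx = np.array([0, 2, 1])               # row positions
cidx = np.array([3, 0, 2])               # column positions
flat_index = ridx * arr.shape[1] + cidx  # flattened positions: [3, 8, 6]
print(arr.flat[flat_index])              # [3 8 6]
print(arr[ridx, cidx])                   # same values via direct fancy indexing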

    def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame:
        """
        We are guaranteed non-Nones in the axes.