Skip to content

Commit a9bf1fd

Browse files
committed
BUG: bug in many-to-one join in left_join cython routine
1 parent 43a1d95 commit a9bf1fd

File tree

3 files changed

+40
-15
lines changed

3 files changed

+40
-15
lines changed

pandas/src/generate_code.py

+8 -3
Original file line number | Diff line number | Diff line change
@@ -354,6 +354,8 @@ def arrmap_%(name)s(ndarray[%(c_type)s] index, object func):
354354
#----------------------------------------------------------------------
355355
# Joins on ordered, unique indices
356356

357+
# right might contain non-unique values
358+
357359
left_join_template = """@cython.wraparound(False)
358360
@cython.boundscheck(False)
359361
def left_join_indexer_%(name)s(ndarray[%(c_type)s] left,
@@ -378,17 +380,20 @@ def left_join_indexer_%(name)s(ndarray[%(c_type)s] left,
378380
i += 1
379381
continue
380382
381-
lval = left[i]
382383
rval = right[j]
383384
384-
if lval == rval:
385+
while i < nleft - 1 and left[i] == rval:
386+
indexer[i] = j
387+
i += 1
388+
389+
if left[i] == right[j]:
385390
indexer[i] = j
386391
i += 1
387392
while i < nleft - 1 and left[i] == rval:
388393
indexer[i] = j
389394
i += 1
390395
j += 1
391-
elif lval > rval:
396+
elif left[i] > rval:
392397
indexer[i] = -1
393398
j += 1
394399
else:

pandas/src/generated.pyx

+24 -12
Original file line number | Diff line number | Diff line change
@@ -1427,17 +1427,20 @@ def left_join_indexer_float64(ndarray[float64_t] left,
14271427
i += 1
14281428
continue
14291429

1430-
lval = left[i]
14311430
rval = right[j]
14321431

1433-
if lval == rval:
1432+
while i < nleft - 1 and left[i] == rval:
1433+
indexer[i] = j
1434+
i += 1
1435+
1436+
if left[i] == right[j]:
14341437
indexer[i] = j
14351438
i += 1
14361439
while i < nleft - 1 and left[i] == rval:
14371440
indexer[i] = j
14381441
i += 1
14391442
j += 1
1440-
elif lval > rval:
1443+
elif left[i] > rval:
14411444
indexer[i] = -1
14421445
j += 1
14431446
else:
@@ -1469,17 +1472,20 @@ def left_join_indexer_object(ndarray[object] left,
14691472
i += 1
14701473
continue
14711474

1472-
lval = left[i]
14731475
rval = right[j]
14741476

1475-
if lval == rval:
1477+
while i < nleft - 1 and left[i] == rval:
1478+
indexer[i] = j
1479+
i += 1
1480+
1481+
if left[i] == right[j]:
14761482
indexer[i] = j
14771483
i += 1
14781484
while i < nleft - 1 and left[i] == rval:
14791485
indexer[i] = j
14801486
i += 1
14811487
j += 1
1482-
elif lval > rval:
1488+
elif left[i] > rval:
14831489
indexer[i] = -1
14841490
j += 1
14851491
else:
@@ -1511,17 +1517,20 @@ def left_join_indexer_int32(ndarray[int32_t] left,
15111517
i += 1
15121518
continue
15131519

1514-
lval = left[i]
15151520
rval = right[j]
15161521

1517-
if lval == rval:
1522+
while i < nleft - 1 and left[i] == rval:
1523+
indexer[i] = j
1524+
i += 1
1525+
1526+
if left[i] == right[j]:
15181527
indexer[i] = j
15191528
i += 1
15201529
while i < nleft - 1 and left[i] == rval:
15211530
indexer[i] = j
15221531
i += 1
15231532
j += 1
1524-
elif lval > rval:
1533+
elif left[i] > rval:
15251534
indexer[i] = -1
15261535
j += 1
15271536
else:
@@ -1553,17 +1562,20 @@ def left_join_indexer_int64(ndarray[int64_t] left,
15531562
i += 1
15541563
continue
15551564

1556-
lval = left[i]
15571565
rval = right[j]
15581566

1559-
if lval == rval:
1567+
while i < nleft - 1 and left[i] == rval:
1568+
indexer[i] = j
1569+
i += 1
1570+
1571+
if left[i] == right[j]:
15601572
indexer[i] = j
15611573
i += 1
15621574
while i < nleft - 1 and left[i] == rval:
15631575
indexer[i] = j
15641576
i += 1
15651577
j += 1
1566-
elif lval > rval:
1578+
elif left[i] > rval:
15671579
indexer[i] = -1
15681580
j += 1
15691581
else:

pandas/tests/test_tseries.py

+8
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,14 @@ def test_pad(self):
7272
expect_filler = [-1, -1, -1, -1, -1]
7373
self.assert_(np.array_equal(filler, expect_filler))
7474

75+
def test_left_join_indexer():
76+
a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
77+
b = np.array([2, 2, 3, 4, 4], dtype=np.int64)
78+
79+
result = lib.left_join_indexer_int64(b, a)
80+
expected = np.array([1, 1, 2, 3, 3], dtype='i4')
81+
assert(np.array_equal(result, expected))
82+
7583
def test_inner_join_indexer():
7684
a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
7785
b = np.array([0, 3, 5, 7, 9], dtype=np.int64)

0 commit comments

Comments
 (0)