Skip to content

Commit 43a1d95

Browse files
committed
ENH: starting on general multi-append/join stuff, refactor groupby code
1 parent 41e682d commit 43a1d95

File tree

4 files changed

+169
-129
lines changed

4 files changed

+169
-129
lines changed

doc/source/basics.rst

+11-1
Original file line numberDiff line numberDiff line change
@@ -731,7 +731,17 @@ For example:
731731
iterrows
732732
~~~~~~~~
733733

734-
New in v0.7 is the ability to iterate efficiently through rows of a DataFrame.
734+
New in v0.7 is the ability to iterate efficiently through rows of a
735+
DataFrame. It returns an iterator yielding each index value along with a Series
736+
containing the data in each row:
737+
738+
.. ipython::
739+
740+
In [0]: for row_index, row in df2.iterrows():
741+
...: print '%s\n%s' % (row_index, row)
742+
...:
743+
744+
735745
For instance, a contrived way to transpose the DataFrame would be:
736746

737747
.. ipython:: python

doc/source/reshaping.rst

+6-8
Original file line numberDiff line numberDiff line change
@@ -205,14 +205,12 @@ For instance,
205205

206206
.. ipython:: python
207207
208-
df = DataFrame({'first' : ['John', 'Mary'],
209-
'last' : ['Doe', 'Bo'],
210-
'height' : [5.5, 6.0],
211-
'weight' : [130, 150]})
212-
213-
df
214-
215-
melt(df, id_vars=['first', 'last'])
208+
cheese = DataFrame({'first' : ['John', 'Mary'],
209+
'last' : ['Doe', 'Bo'],
210+
'height' : [5.5, 6.0],
211+
'weight' : [130, 150]})
212+
cheese
213+
melt(cheese, id_vars=['first', 'last'])
216214
217215
Combining with stats and GroupBy
218216
--------------------------------

pandas/core/groupby.py

+10-120
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55

66
from pandas.core.frame import DataFrame
7-
from pandas.core.generic import NDFrame, PandasObject
7+
from pandas.core.generic import NDFrame
88
from pandas.core.index import Index, MultiIndex
99
from pandas.core.internals import BlockManager
1010
from pandas.core.series import Series
@@ -446,12 +446,14 @@ def _wrap_applied_output(self, *args, **kwargs):
446446
raise NotImplementedError
447447

448448
def _wrap_frames(self, keys, values, not_indexed_same=False):
    """
    Glue the per-group result frames in `values` back into one DataFrame.

    Parameters
    ----------
    keys : sequence
        Group key(s), one entry per frame in `values`.
    values : list of DataFrame
        The per-group pieces to reassemble.
    not_indexed_same : bool, default False
        When True the pieces do not share the original axis labels, so the
        result is labeled hierarchically with the group keys as the outer
        level(s). When False the pieces are simply stacked and reindexed
        back to the original object's index.

    Returns
    -------
    DataFrame
    """
    # NOTE(review): function-level import — presumably to avoid a circular
    # import between pandas.core.groupby and pandas.tools.merge; confirm.
    from pandas.tools.merge import concat, _concat_frames_hierarchical

    if not_indexed_same:
        # Pieces have distinct labels: prepend the group keys as outer
        # MultiIndex level(s) so rows remain distinguishable.
        result = _concat_frames_hierarchical(values, keys,
                                             self.groupings,
                                             axis=self.axis)
    else:
        # Pieces cover the original index: stack them and restore the
        # original row order.
        result = concat(values, axis=0).reindex(self.obj.index)

    return result
457459

@@ -1115,6 +1117,8 @@ def transform(self, func, *args, **kwargs):
11151117
>>> grouped = df.groupby(lambda x: mapping[x])
11161118
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
11171119
"""
1120+
import pandas.tools.merge as merge
1121+
11181122
applied = []
11191123

11201124
obj = self._obj_with_exclusions
@@ -1138,124 +1142,10 @@ def transform(self, func, *args, **kwargs):
11381142
else:
11391143
applied.append(res)
11401144

1141-
return _concat_frames(applied, obj.index, obj.columns,
1142-
axis=self.axis)
1143-
1144-
def _concat_frames(frames, index, columns=None, axis=0):
1145-
if len(frames) == 1:
1146-
return frames[0]
1147-
1148-
if axis == 0:
1149-
new_index = _concat_indexes([x.index for x in frames])
1150-
if columns is None:
1151-
new_columns = frames[0].columns
1152-
else:
1153-
new_columns = columns
1154-
else:
1155-
new_columns = _concat_indexes([x.columns for x in frames])
1156-
new_index = index
1157-
1158-
if frames[0]._is_mixed_type:
1159-
new_data = {}
1160-
for col in new_columns:
1161-
new_data[col] = np.concatenate([x[col].values for x in frames])
1162-
return DataFrame(new_data, index=new_index, columns=new_columns)
1163-
else:
1164-
new_values = np.concatenate([x.values for x in frames], axis=axis)
1165-
result = DataFrame(new_values, index=new_index, columns=new_columns)
1166-
return result.reindex(index=index, columns=columns)
1167-
1168-
def _concat_indexes(indexes):
1169-
return indexes[0].append(indexes[1:])
1170-
1171-
def _concat_frames_hierarchical(frames, keys, groupings, axis=0):
1172-
if axis == 0:
1173-
indexes = [x.index for x in frames]
1174-
new_index = _make_concat_multiindex(indexes, keys, groupings)
1175-
new_columns = frames[0].columns
1176-
else:
1177-
all_columns = [x.columns for x in frames]
1178-
new_columns = _make_concat_multiindex(all_columns, keys, groupings)
1179-
new_index = frames[0].index
1180-
1181-
if frames[0]._is_mixed_type:
1182-
new_data = {}
1183-
for col in new_columns:
1184-
new_data[col] = np.concatenate([x[col].values for x in frames])
1185-
return DataFrame(new_data, index=new_index, columns=new_columns)
1186-
else:
1187-
new_values = np.concatenate([x.values for x in frames], axis=axis)
1188-
return DataFrame(new_values, index=new_index, columns=new_columns)
1189-
1190-
def _make_concat_multiindex(indexes, keys, groupings):
1191-
names = [ping.name for ping in groupings]
1192-
1193-
if not _all_indexes_same(indexes):
1194-
label_list = []
1195-
1196-
# things are potentially different sizes, so compute the exact labels
1197-
# for each level and pass those to MultiIndex.from_arrays
1198-
if len(groupings) == 1:
1199-
zipped = [keys]
1200-
else:
1201-
zipped = zip(*keys)
1202-
1203-
for hlevel in zipped:
1204-
to_concat = []
1205-
for k, index in zip(hlevel, indexes):
1206-
to_concat.append(np.repeat(k, len(index)))
1207-
label_list.append(np.concatenate(to_concat))
1208-
1209-
concat_index = _concat_indexes(indexes)
1210-
1211-
# these go at the end
1212-
if isinstance(concat_index, MultiIndex):
1213-
for level in range(concat_index.nlevels):
1214-
label_list.append(concat_index.get_level_values(level))
1215-
else:
1216-
label_list.append(concat_index.values)
1217-
1218-
consensus_name = indexes[0].names
1219-
for index in indexes[1:]:
1220-
if index.names != consensus_name:
1221-
consensus_name = [None] * index.nlevels
1222-
break
1223-
names.extend(consensus_name)
1224-
1225-
return MultiIndex.from_arrays(label_list, names=names)
1226-
1227-
new_index = indexes[0]
1228-
n = len(new_index)
1229-
1230-
names.append(indexes[0].name)
1231-
1232-
# do something a bit more speedy
1233-
levels = [ping.group_index for ping in groupings]
1234-
levels.append(new_index)
1235-
1236-
# construct labels
1237-
labels = []
1238-
1239-
if len(groupings) == 1:
1240-
zipped = [keys]
1241-
else:
1242-
zipped = zip(*keys)
1243-
1244-
for hlevel, ping in zip(zipped, groupings):
1245-
get_id = ping.reverse_ids.__getitem__
1246-
mapped = [get_id(x) for x in hlevel]
1247-
labels.append(np.repeat(mapped, n))
1248-
1249-
# last labels for the new level
1250-
labels.append(np.tile(np.arange(n), len(indexes)))
1251-
return MultiIndex(levels=levels, labels=labels, names=names)
1252-
1253-
def _all_indexes_same(indexes):
1254-
first = indexes[0]
1255-
for index in indexes[1:]:
1256-
if not first.equals(index):
1257-
return False
1258-
return True
1145+
concat_index = obj.columns if self.axis == 0 else obj.index
1146+
concatenated = merge.concat(applied, join_index=concat_index,
1147+
axis=self.axis)
1148+
return concatenated.reindex_like(obj)
12591149

12601150
class PanelGroupBy(GroupBy):
12611151

pandas/tools/merge.py

+142
Original file line numberDiff line numberDiff line change
@@ -574,3 +574,145 @@ def _upcast_blocks(blocks):
574574

575575
# use any ref_items
576576
return _consolidate(new_blocks, newb.ref_items)
577+
578+
579+
#----------------------------------------------------------------------
580+
# Concatenate DataFrame objects
581+
582+
def concat(frames, axis=0, join='outer', join_index=None):
    """
    Concatenate DataFrame objects either row-wise or column-wise.

    Parameters
    ----------
    frames : list
        DataFrame objects to glue together.
    axis : {0, 1}, default 0
        Axis to concatenate along: 0 stacks rows, 1 stacks columns.
    join : {'inner', 'outer'}, default 'outer'
        How to handle indexes on the other axis.
    join_index : index-like, optional
        Labels to use for the non-concatenation axis.

    Returns
    -------
    concatenated : DataFrame
    """
    return _concat_frames(frames, axis=axis, join_index=join_index)
599+
600+
def _concat_frames(frames, join_index=None, axis=0):
601+
if len(frames) == 1:
602+
return frames[0]
603+
604+
if axis == 0:
605+
new_index = _concat_indexes([x.index for x in frames])
606+
if join_index is None:
607+
new_columns = frames[0].columns
608+
else:
609+
new_columns = join_index
610+
else:
611+
new_columns = _concat_indexes([x.columns for x in frames])
612+
new_index = join_index
613+
614+
if frames[0]._is_mixed_type:
615+
new_data = {}
616+
for col in new_columns:
617+
new_data[col] = np.concatenate([x[col].values for x in frames])
618+
return DataFrame(new_data, index=new_index, columns=new_columns)
619+
else:
620+
new_values = np.concatenate([x.values for x in frames], axis=axis)
621+
return DataFrame(new_values, index=new_index, columns=new_columns)
622+
623+
def _concat_indexes(indexes):
624+
return indexes[0].append(indexes[1:])
625+
626+
def _concat_frames_hierarchical(frames, keys, groupings, axis=0):
    """
    Stack `frames` along `axis`, labeling the result with a MultiIndex
    whose outer level(s) come from the group keys and whose inner level(s)
    come from the frames' own axis labels.

    Parameters
    ----------
    frames : list of DataFrame
    keys : sequence
        Group key(s), one per frame.
    groupings : list
        Grouping objects supplying level names and level values.
    axis : {0, 1}, default 0

    Returns
    -------
    DataFrame
    """
    group_names = [ping.name for ping in groupings]
    group_levels = [ping.group_index for ping in groupings]

    first = frames[0]
    if axis == 0:
        new_index = _make_concat_multiindex([f.index for f in frames],
                                            keys, group_levels, group_names)
        new_columns = first.columns
    else:
        new_columns = _make_concat_multiindex([f.columns for f in frames],
                                              keys, group_levels, group_names)
        new_index = first.index

    if first._is_mixed_type:
        # Concatenate column-by-column so each column keeps its own dtype.
        pieces = {}
        for col in new_columns:
            pieces[col] = np.concatenate([f[col].values for f in frames])
        return DataFrame(pieces, index=new_index, columns=new_columns)

    stacked = np.concatenate([f.values for f in frames], axis=axis)
    return DataFrame(stacked, index=new_index, columns=new_columns)
648+
649+
def _make_concat_multiindex(indexes, keys, levels, names):
    """
    Build the MultiIndex for a hierarchical concatenation: `keys` become
    the outer level(s) and the concatenated `indexes` become the inner
    level(s).

    Parameters
    ----------
    indexes : list of Index
        Axis labels of the pieces being concatenated, one per key.
    keys : sequence
        Group key(s), one per index; tuples when there are multiple levels.
    levels : list of Index
        Distinct values for each outer level.
    names : list
        Names for the outer level(s).

    Returns
    -------
    MultiIndex

    NOTE(review): `names` is mutated in place (extend/append below) —
    callers passing a list they reuse should be aware; confirm intended.
    """
    single_level = len(levels) == 1

    if not _all_indexes_same(indexes):
        label_list = []

        # things are potentially different sizes, so compute the exact labels
        # for each level and pass those to MultiIndex.from_arrays
        if single_level:
            # one grouping level: each key is a scalar, wrap as one "column"
            zipped = [keys]
        else:
            # multiple levels: transpose the key tuples into per-level rows
            zipped = zip(*keys)

        for hlevel in zipped:
            to_concat = []
            # repeat each key once per row of its corresponding piece
            for k, index in zip(hlevel, indexes):
                to_concat.append(np.repeat(k, len(index)))
            label_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            # flatten each existing level of the pieces' MultiIndex
            for level in range(concat_index.nlevels):
                label_list.append(concat_index.get_level_values(level))
        else:
            label_list.append(concat_index.values)

        # keep the pieces' level names only if every piece agrees on them
        consensus_name = indexes[0].names
        for index in indexes[1:]:
            if index.names != consensus_name:
                consensus_name = [None] * index.nlevels
                break
        names.extend(consensus_name)

        return MultiIndex.from_arrays(label_list, names=names)

    # Fast path: all pieces share identical labels, so levels/labels can be
    # built with repeat/tile instead of materializing full label arrays.
    new_index = indexes[0]
    n = len(new_index)

    names.append(indexes[0].name)

    new_levels = list(levels)

    # do something a bit more speedy
    new_levels.append(new_index)

    # construct labels
    labels = []

    if single_level:
        zipped = [keys]
    else:
        zipped = zip(*keys)

    for hlevel, level in zip(zipped, levels):
        # map each key to its integer position within the level
        mapped = level.get_indexer(hlevel)
        # each key labels a contiguous run of n rows
        labels.append(np.repeat(mapped, n))

    # last labels for the new level
    labels.append(np.tile(np.arange(n), len(indexes)))
    return MultiIndex(levels=new_levels, labels=labels, names=names)
711+
712+
def _all_indexes_same(indexes):
713+
first = indexes[0]
714+
for index in indexes[1:]:
715+
if not first.equals(index):
716+
return False
717+
return True
718+

0 commit comments

Comments
 (0)