diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
index e6c442159336b..16033dd75204c 100644
--- a/doc/source/v0.15.0.txt
+++ b/doc/source/v0.15.0.txt
@@ -162,7 +162,7 @@ previously results in ``Exception`` or ``TypeError`` (:issue:`7812`)
     didx
     didx.tz_localize(None)
 
-- ``DataFrame.tz_localize`` and ``DataFrame.tz_convert`` now accepts an optional ``level`` argument 
+- ``DataFrame.tz_localize`` and ``DataFrame.tz_convert`` now accept an optional ``level`` argument
   for localizing a specific level of a MultiIndex (:issue:`7846`)
 
 .. _whatsnew_0150.refactoring:
@@ -302,6 +302,7 @@ Performance
 
 - Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`)
 - Performance improvements in ``Period`` creation (and ``PeriodIndex`` setitem) (:issue:`5155`)
+- Improvements in Series.transform for significant performance gains (revised) (:issue:`6496`)
 
 
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 484d1d413c6c6..f26a7269772a3 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2270,14 +2270,21 @@ def transform(self, func, *args, **kwargs):
         -------
         transformed : Series
         """
-        dtype = self._selected_obj.dtype
 
+        # if string function
         if isinstance(func, compat.string_types):
-            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
-        else:
-            wrapper = lambda x: func(x, *args, **kwargs)
+            return self._transform_fast(lambda : getattr(self, func)(*args, **kwargs))
+
+        # do we have a cython function
+        cyfunc = _intercept_cython(func)
+        if cyfunc and not args and not kwargs:
+            return self._transform_fast(cyfunc)
 
+        # regular transform
+        dtype = self._selected_obj.dtype
         result = self._selected_obj.values.copy()
+
+        wrapper = lambda x: func(x, *args, **kwargs)
         for i, (name, group) in enumerate(self):
 
             object.__setattr__(group, 'name', name)
@@ -2302,6 +2309,46 @@ def transform(self, func, *args, **kwargs):
                       index=self._selected_obj.index,
                       name=self._selected_obj.name)
 
+    def _transform_fast(self, func):
+        """
+        fast version of transform, only applicable to
+        builtin/cythonizable reductions (one value per group)
+
+        Parameters
+        ----------
+        func : string or callable
+            name of a groupby method, or a no-argument callable
+            returning the per-group result
+
+        Returns
+        -------
+        transformed : Series
+            per-group values repeated for every row of the group
+        """
+        if isinstance(func, compat.string_types):
+            func = getattr(self, func)
+        values = func().values
+
+        # use size (not count): count skips NaN, which would shorten
+        # the repeated values for any group holding missing data
+        counts = self.size().fillna(0).astype(np.int64).values
+        values = np.repeat(values, counts)
+
+        # the values/counts are repeated according to the group index
+        indices = self.indices
+
+        # shortcut if we have an already ordered grouper
+        if Index(self.grouper.group_info[0]).is_monotonic:
+            result = Series(values, index=self.obj.index,
+                            name=self.obj.name)
+        else:
+            index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ]))
+            result = Series(values, index=index).sort_index()
+            result.index = self.obj.index
+            result.name = self.obj.name
+
+        return result
+
     def filter(self, func, dropna=True, *args, **kwargs):
         """
         Return a copy of a Series excluding elements from groups that
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index c88ba836886bf..f621b0fb94eaf 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -795,6 +795,30 @@ def test_transform(self):
         transformed = grouped.transform(lambda x: x * x.sum())
         self.assertEqual(transformed[7], 12)
 
+    def test_transform_fast(self):
+
+        # floor division keeps three rows per group on Python 3 as well
+        df = DataFrame( { 'id' : np.arange( 100000 ) // 3,
+                          'val': np.random.randn( 100000) } )
+
+        grp=df.groupby('id')['val']
+
+        expected = pd.Series(np.repeat(grp.mean().values, grp.size().values),
+                             index=df.index, name='val')
+        result = grp.transform(np.mean)
+        assert_series_equal(result,expected)
+
+        result = grp.transform('mean')
+        assert_series_equal(result,expected)
+
+        # a group containing NaN is still broadcast to all of its rows
+        df.loc[1, 'val'] = np.nan
+        grp = df.groupby('id')['val']
+        expected = pd.Series(np.repeat(grp.mean().values, grp.size().values),
+                             index=df.index, name='val')
+        result = grp.transform(np.mean)
+        assert_series_equal(result, expected)
+
     def test_transform_broadcast(self):
         grouped = self.ts.groupby(lambda x: x.month)
         result = grouped.transform(np.mean)
@@ -858,12 +882,14 @@ def test_transform_select_columns(self):
         assert_frame_equal(result, expected)
 
     def test_transform_exclude_nuisance(self):
+
+        # this also tests orderings in transform between
+        # series/frame to make sure it's consistent
         expected = {}
         grouped = self.df.groupby('A')
         expected['C'] = grouped['C'].transform(np.mean)
         expected['D'] = grouped['D'].transform(np.mean)
         expected = DataFrame(expected)
-
         result = self.df.groupby('A').transform(np.mean)
 
         assert_frame_equal(result, expected)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index 9f520df122c2f..788f228c81edd 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -444,5 +444,13 @@ def f(g):
 
 df = DataFrame({ 'signal' : np.random.rand(N)})
 """
-
 groupby_transform_series = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)
+
+setup = common_setup + """
+np.random.seed(0)
+
+df=DataFrame( { 'id' : np.arange( 100000 ) // 3,
+                'val': np.random.randn( 100000) } )
+"""
+
+groupby_transform_series2 = Benchmark("df.groupby('id')['val'].transform(np.mean)", setup)