8th day of python challenges 111-117
This commit is contained in:
@@ -0,0 +1,562 @@
|
||||
"""
|
||||
test .agg behavior / note that .apply is tested generally in test_groupby.py
|
||||
"""
|
||||
from collections import OrderedDict
|
||||
import functools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
|
||||
from pandas.core.base import SpecificationError
|
||||
from pandas.core.groupby.generic import _maybe_mangle_lambdas
|
||||
from pandas.core.groupby.grouper import Grouping
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_agg_regression1(tsframe):
    """``agg(np.mean)`` must agree with the dedicated ``mean`` reducer.

    The frame is grouped by two callables that extract the year and the
    month of each label they are handed.
    """
    by_year_month = tsframe.groupby([lambda x: x.year, lambda x: x.month])

    expected = by_year_month.mean()
    result = by_year_month.agg(np.mean)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_must_agg(df):
    """``agg`` must raise when the callable does not reduce to one value."""
    grouped = df.groupby("A")["C"]
    msg = "Must produce aggregated value"

    # Both callables return more than a single value per group.
    non_reducers = (
        lambda x: x.describe(),
        lambda x: x.index[:2],
    )
    for func in non_reducers:
        with pytest.raises(Exception, match=msg):
            grouped.agg(func)
|
||||
|
||||
|
||||
def test_agg_ser_multi_key(df):
    """Aggregating a Series grouped by two key Series matches the frame path.

    The expected value is obtained by grouping the whole frame on ``A`` and
    ``B``, summing, and selecting column ``C``.
    """
    # The previous version bound ``ser = df.C`` and never used it (silenced
    # with ``# noqa``); the dead local has been dropped.
    results = df.C.groupby([df.A, df.B]).aggregate(lambda x: x.sum())
    expected = df.groupby(["A", "B"]).sum()["C"]
    tm.assert_series_equal(results, expected)
|
||||
|
||||
|
||||
def test_groupby_aggregation_mixed_dtype():
    """Group means work when the key columns mix integers and strings.

    Regression test for GH 6212.
    """
    df = DataFrame(
        {
            "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
            "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
            "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
            "by2": [
                "wet",
                "dry",
                99,
                95,
                np.nan,
                "damp",
                95,
                99,
                "red",
                99,
                np.nan,
                np.nan,
            ],
        }
    )

    # One row per (by1, by2) pair expected in the result.
    expected_index = MultiIndex.from_tuples(
        [
            (1, 95),
            (1, 99),
            (2, 95),
            (2, 99),
            ("big", "damp"),
            ("blue", "dry"),
            ("red", "red"),
            ("red", "wet"),
        ],
        names=["by1", "by2"],
    )
    expected = DataFrame(
        {
            "v1": [5, 5, 7, np.nan, 3, 3, 4, 1],
            "v2": [55, 55, 77, np.nan, 33, 33, 44, 11],
        },
        index=expected_index,
    )

    result = df.groupby(["by1", "by2"])[["v1", "v2"]].mean()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_apply_corner(ts, tsframe):
    """``sum``/``agg``/``apply`` on an all-NaN grouping key give empty results."""
    # nothing to group, all NA
    series_groups = ts.groupby(ts * np.nan)
    assert ts.dtype == np.float64

    # groupby float64 values results in Float64Index
    empty_series = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64))
    tm.assert_series_equal(series_groups.sum(), empty_series)
    tm.assert_series_equal(series_groups.agg(np.sum), empty_series)
    tm.assert_series_equal(
        series_groups.apply(np.sum), empty_series, check_index_type=False
    )

    # DataFrame
    frame_groups = tsframe.groupby(tsframe["A"] * np.nan)
    empty_frame = DataFrame(
        columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64)
    )
    tm.assert_frame_equal(frame_groups.sum(), empty_frame, check_names=False)
    tm.assert_frame_equal(frame_groups.agg(np.sum), empty_frame, check_names=False)
    # the apply result is compared against a zero-column slice of the frame
    tm.assert_frame_equal(
        frame_groups.apply(np.sum), empty_frame.iloc[:, :0], check_names=False
    )
|
||||
|
||||
|
||||
def test_agg_grouping_is_list_tuple(ts):
    """``agg(np.mean)`` equals ``mean`` when the grouping holds a list or tuple."""
    df = tm.makeTimeDataFrame()

    grouped = df.groupby(lambda x: x.year)
    grouper = grouped.grouper.groupings[0].grouper

    # Swap the first grouping for one backed by a list, then by a tuple.
    for container in (list, tuple):
        grouped.grouper.groupings[0] = Grouping(ts.index, container(grouper))

        result = grouped.agg(np.mean)
        expected = grouped.mean()
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_python_multiindex(mframe):
    """``agg(np.mean)`` equals ``mean`` when grouping on ``A`` and ``B``."""
    by_a_b = mframe.groupby(["A", "B"])

    expected = by_a_b.mean()
    result = by_a_b.agg(np.mean)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
)
def test_aggregate_str_func(tsframe, groupbyfunc):
    """Reducer names given as strings dispatch to the matching groupby method."""
    grouped = tsframe.groupby(groupbyfunc)

    # single series
    tm.assert_series_equal(grouped["A"].agg("std"), grouped["A"].std())

    # group frame by function name
    tm.assert_frame_equal(grouped.aggregate("var"), grouped.var())

    # group frame by function dict
    spec = OrderedDict([["A", "var"], ["B", "std"], ["C", "mean"], ["D", "sem"]])
    result = grouped.agg(spec)
    # Build the expectation column by column by calling the named method.
    expected = DataFrame(
        OrderedDict([[col, getattr(grouped[col], how)()] for col, how in spec.items()])
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_item_by_item(df):
    """A size-returning callable yields the group size for every column."""
    grouped = df.groupby("A")

    result = grouped.agg(lambda ser: ser.size)
    n_foo = (df.A == "foo").sum()
    n_bar = (df.A == "bar").sum()
    n_cols = len(result.columns)

    # GH5782
    # odd comparisons can result here, so cast to make easy
    exp = pd.Series(
        np.array([n_foo] * n_cols), index=list("BCD"), dtype=np.float64, name="foo"
    )
    tm.assert_series_equal(result.xs("foo"), exp)

    exp = pd.Series(
        np.array([n_bar] * n_cols), index=list("BCD"), dtype=np.float64, name="bar"
    )
    tm.assert_almost_equal(result.xs("bar"), exp)

    def aggfun(ser):
        return ser.size

    # Aggregating an empty frame still returns an (empty) DataFrame.
    result = DataFrame().groupby(df.A).agg(aggfun)
    assert isinstance(result, DataFrame)
    assert len(result) == 0
|
||||
|
||||
|
||||
def test_wrap_agg_out(three_group):
    """Aggregating all columns equals aggregating with column ``C`` dropped.

    ``func`` raises ``TypeError`` for object-dtype input and sums anything
    else; the test expects the result of aggregating the full frame to match
    the result of aggregating the frame without column ``C``.
    """
    grouped = three_group.groupby(["A", "B"])

    def func(ser):
        # Compare against the builtin ``object``: ``np.object`` was only an
        # alias for it, deprecated in NumPy 1.20 and removed in NumPy 1.24.
        if ser.dtype == object:
            raise TypeError
        return ser.sum()

    result = grouped.aggregate(func)
    exp_grouped = three_group.loc[:, three_group.columns != "C"]
    expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_multiple_functions_maintain_order(df):
    """Result columns follow the order of the ``(name, func)`` pairs given.

    Regression test for GH #610.
    """
    named_funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
    result = df.groupby("A")["C"].agg(named_funcs)

    tm.assert_index_equal(result.columns, Index(["mean", "max", "min"]))
|
||||
|
||||
|
||||
def test_multiple_functions_tuples_and_non_tuples(df):
    """A bare name in the function list behaves like a ``(name, name)`` pair.

    Regression test for #1359.
    """
    funcs = [("foo", "mean"), "std"]
    ex_funcs = [("foo", "mean"), ("std", "std")]

    # Series groupby
    series_groups = df.groupby("A")["C"]
    tm.assert_frame_equal(series_groups.agg(funcs), df.groupby("A")["C"].agg(ex_funcs))

    # DataFrame groupby
    frame_groups = df.groupby("A")
    tm.assert_frame_equal(frame_groups.agg(funcs), df.groupby("A").agg(ex_funcs))
|
||||
|
||||
|
||||
def test_more_flexible_frame_multi_function(df):
    """Dict specs may map a column to one function, a list, or a nested dict."""
    grouped = df.groupby("A")
    columns = ("C", "D")

    # Per-function results, glued together to form the expected two-level frame.
    exmean = grouped.agg(OrderedDict([[col, np.mean] for col in columns]))
    exstd = grouped.agg(OrderedDict([[col, np.std] for col in columns]))

    expected = concat([exmean, exstd], keys=["mean", "std"], axis=1)
    expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

    result = grouped.aggregate(
        OrderedDict([[col, [np.mean, np.std]] for col in columns])
    )
    tm.assert_frame_equal(result, expected)

    # be careful
    result = grouped.aggregate(OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]]))
    expected = grouped.aggregate(
        OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]])
    )
    tm.assert_frame_equal(result, expected)

    def foo(x):
        return np.mean(x)

    def bar(x):
        return np.std(x, ddof=1)

    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        renaming_spec = OrderedDict(
            [["C", np.mean], ["D", OrderedDict([["foo", np.mean], ["bar", np.std]])]]
        )
        result = grouped.aggregate(renaming_spec)

    # Named functions ``foo``/``bar`` are expected to give the same frame.
    expected = grouped.aggregate(OrderedDict([["C", [np.mean]], ["D", [foo, bar]]]))

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_function_flexible_mix(df):
    """A nested-dict spec may be mixed with a string or list spec.

    Regression test for GH #1268.
    """
    grouped = df.groupby("A")

    def make_spec(d_spec):
        # Column "C" always uses the renaming dict; only "D" varies.
        return OrderedDict(
            [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", d_spec]]
        )

    # Expected
    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = grouped.aggregate(make_spec({"sum": "sum"}))

    # Test 1 passes a plain string for "D", Test 2 a one-element list.
    for d_spec in ("sum", ["sum"]):
        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = grouped.aggregate(make_spec(d_spec))
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_agg_coercing_bools():
    """Boolean results of a custom aggregator are returned as booleans.

    Regression test for issue 14873.
    """
    dat = pd.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]})
    gp = dat.groupby("a")
    group_index = Index([1, 2], name="a")

    # column "b": are all values in the group non-zero?
    all_nonzero = gp["b"].aggregate(lambda x: (x != 0).all())
    tm.assert_series_equal(
        all_nonzero, Series([False, True], index=group_index, name="b")
    )

    # column "c": are all values in the group null?
    all_null = gp["c"].aggregate(lambda x: x.isnull().all())
    tm.assert_series_equal(
        all_null, Series([True, False], index=group_index, name="c")
    )
|
||||
|
||||
|
||||
def test_order_aggregate_multiple_funcs():
    """The second column level lists the functions in the order requested.

    Regression test for GH 25692.
    """
    func_names = ["sum", "max", "mean", "ohlc", "min"]
    df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})

    aggregated = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"])

    tm.assert_index_equal(aggregated.columns.levels[1], pd.Index(func_names))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
def test_uint64_type_handling(dtype, how):
    """Aggregating a large integer column gives the same value for both dtypes.

    Regression test for GH 26310.
    """
    df = pd.DataFrame({"x": 6903052872240755750, "y": [1, 2]})

    # Reference result, computed before the column is cast.
    expected = df.groupby("y").agg({"x": how})

    df["x"] = df["x"].astype(dtype)
    result = df.groupby("y").agg({"x": how})
    # Cast back so the comparison is made on a common dtype.
    result["x"] = result["x"].astype(np.int64)

    tm.assert_frame_equal(result, expected, check_exact=True)
|
||||
|
||||
|
||||
class TestNamedAggregationSeries:
    """Keyword-style (named) aggregation on a Series groupby."""

    def test_series_named_agg(self):
        """Each keyword becomes an output column holding that aggregation."""
        ser = pd.Series([1, 2, 3, 4])
        gr = ser.groupby([0, 0, 1, 1])

        expected = pd.DataFrame(
            {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=[0, 1]
        )
        tm.assert_frame_equal(gr.agg(a="sum", b="min"), expected)

        result = gr.agg(b="min", a="sum")
        # sort for 35 and earlier
        if compat.PY36:
            expected = expected[["b", "a"]]
        tm.assert_frame_equal(result, expected)

    def test_no_args_raises(self):
        """``agg()`` with no arguments raises; an empty list is accepted."""
        gr = pd.Series([1, 2]).groupby([0, 1])
        with pytest.raises(TypeError, match="Must provide"):
            gr.agg()

        # but we do allow this
        tm.assert_frame_equal(gr.agg([]), pd.DataFrame())

    def test_series_named_agg_duplicates_raises(self):
        """Using the same function under two names raises."""
        # This is a limitation of the named agg implementation reusing
        # aggregate_multiple_funcs. It could maybe be lifted in the future.
        gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
        with pytest.raises(SpecificationError):
            gr.agg(a="sum", b="sum")

    def test_mangled(self):
        """Two lambdas can be used as long as each has its own keyword."""
        gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
        result = gr.agg(a=lambda x: 0, b=lambda x: 1)
        expected = pd.DataFrame({"a": [0, 0], "b": [1, 1]})
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestNamedAggregationDataFrame:
    """Keyword-style (named) aggregation on a DataFrame groupby."""

    def test_agg_relabel(self):
        """``name=(column, func)`` keywords select, aggregate and relabel."""
        df = pd.DataFrame(
            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
        )
        group_index = pd.Index(["a", "b"], name="group")

        result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max"))
        expected = pd.DataFrame(
            {"a_max": [1, 3], "b_max": [6, 8]},
            index=group_index,
            columns=["a_max", "b_max"],
        )
        tm.assert_frame_equal(result, expected)

        # order invariance
        p98 = functools.partial(np.percentile, q=98)
        result = df.groupby("group").agg(
            b_min=("B", "min"),
            a_min=("A", min),
            a_mean=("A", np.mean),
            a_max=("A", "max"),
            b_max=("B", "max"),
            a_98=("A", p98),
        )
        expected = pd.DataFrame(
            {
                "b_min": [5, 7],
                "a_min": [0, 2],
                "a_mean": [0.5, 2.5],
                "a_max": [1, 3],
                "b_max": [6, 8],
                "a_98": [0.98, 2.98],
            },
            index=group_index,
            columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"],
        )
        # When ``compat.PY36`` is false the columns are expected in sorted order.
        if not compat.PY36:
            expected = expected[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]]
        tm.assert_frame_equal(result, expected)

    def test_agg_relabel_non_identifier(self):
        """Output names that are not identifiers can be passed via ``**``."""
        df = pd.DataFrame(
            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
        )

        spec = {"my col": ("A", "max")}
        result = df.groupby("group").agg(**spec)
        expected = pd.DataFrame(
            {"my col": [1, 3]}, index=pd.Index(["a", "b"], name="group")
        )
        tm.assert_frame_equal(result, expected)

    def test_duplicate_raises(self):
        """The same ``(column, func)`` pair under two names raises."""
        # TODO: we currently raise on multiple lambdas. We could *maybe*
        # update com.get_callable_name to append `_i` to each lambda.
        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        with pytest.raises(SpecificationError, match="Function names"):
            df.groupby("A").agg(a=("A", "min"), b=("A", "min"))

    def test_agg_relabel_with_level(self):
        """Named aggregation also works when grouping by an index level."""
        df = pd.DataFrame(
            {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
            index=pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]),
        )
        by_level = df.groupby(level=0)
        result = by_level.agg(aa=("A", "max"), bb=("A", "min"), cc=("B", "mean"))
        expected = pd.DataFrame(
            {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_relabel_other_raises(self):
        """Malformed keyword specs raise ``TypeError``."""
        df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
        grouped = df.groupby("A")
        match = "Must provide"

        # a non-tuple value
        with pytest.raises(TypeError, match=match):
            grouped.agg(foo=1)

        # nothing at all
        with pytest.raises(TypeError, match=match):
            grouped.agg()

        # a tuple of the wrong length
        with pytest.raises(TypeError, match=match):
            grouped.agg(a=("B", "max"), b=(1, 2, 3))

    def test_missing_raises(self):
        """Referring to a column that is not in the frame raises ``KeyError``."""
        df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        with pytest.raises(KeyError, match="Column 'C' does not exist"):
            df.groupby("A").agg(c=("C", "sum"))

    def test_agg_namedtuple(self):
        """``pd.NamedAgg`` is interchangeable with a plain 2-tuple."""
        df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        result = df.groupby("A").agg(
            b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
        )
        expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
        tm.assert_frame_equal(result, expected)

    def test_mangled(self):
        """Lambdas applied to different columns do not clash."""
        df = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
        result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
        expected = pd.DataFrame(
            {"b": [0, 0], "c": [1, 1]}, index=pd.Index([0, 1], name="A")
        )
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestLambdaMangling:
    """Behaviour of ``_maybe_mangle_lambdas`` and of lambda lists in ``agg``."""

    def test_maybe_mangle_lambdas_passthrough(self):
        """Strings and single lambdas come back with their name unchanged."""
        assert _maybe_mangle_lambdas("mean") == "mean"
        assert _maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>"
        # don't mangle single lambda.
        assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>"

    def test_maybe_mangle_lambdas_listlike(self):
        """Several lambdas in a list are renamed with a positional suffix."""
        aggfuncs = [lambda x: 1, lambda x: 2]
        result = _maybe_mangle_lambdas(aggfuncs)

        for pos, (original, mangled) in enumerate(zip(aggfuncs, result)):
            if pos == 0:
                assert mangled.__name__ == "<lambda_0>"
            else:
                assert mangled.__name__ == "<lambda_1>"
        # the renamed callables still return what the originals return
        assert aggfuncs[0](None) == result[0](None)
        assert aggfuncs[1](None) == result[1](None)

    def test_maybe_mangle_lambdas(self):
        """Lambda lists nested in a dict spec are renamed as well."""
        func = {"A": [lambda x: 0, lambda x: 1]}
        mangled = _maybe_mangle_lambdas(func)["A"]
        assert mangled[0].__name__ == "<lambda_0>"
        assert mangled[1].__name__ == "<lambda_1>"

    def test_maybe_mangle_lambdas_args(self):
        """Lambdas taking extra arguments are renamed; originals stay callable."""
        func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]}
        result = _maybe_mangle_lambdas(func)
        assert result["A"][0].__name__ == "<lambda_0>"
        assert result["A"][1].__name__ == "<lambda_1>"

        first = func["A"][0]
        assert first(0, 1) == (0, 1, 1)
        assert first(0, 1, 2) == (0, 1, 2)
        assert first(0, 2, b=3) == (0, 2, 3)

    def test_maybe_mangle_lambdas_named(self):
        """A spec without lambdas compares equal to the input."""
        func = OrderedDict(
            [("C", np.mean), ("D", OrderedDict([("foo", np.mean), ("bar", np.mean)]))]
        )
        assert _maybe_mangle_lambdas(func) == func

    def test_basic(self):
        """Two lambdas on one column yield ``<lambda_0>``/``<lambda_1>`` columns."""
        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})

        expected = pd.DataFrame(
            {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
            index=pd.Index([0, 1], name="A"),
        )
        tm.assert_frame_equal(result, expected)

    def test_mangle_series_groupby(self):
        """The same renaming applies to a Series groupby."""
        gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
        result = gr.agg([lambda x: 0, lambda x: 1])
        expected = pd.DataFrame({"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
    def test_with_kwargs(self):
        """Extra positional/keyword arguments with a list of lambdas (xfail)."""
        f1 = lambda x, y, b=1: x.sum() + y + b
        f2 = lambda x, y, b=2: x.sum() + y * b

        # positional argument only
        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
        expected = pd.DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
        tm.assert_frame_equal(result, expected)

        # positional argument plus keyword override
        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
        expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
        tm.assert_frame_equal(result, expected)
|
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
test cython .agg behavior
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range
|
||||
from pandas.core.groupby.groupby import DataError
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Groupby reducers match the same reducer applied group by group."""
    df = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.randn(12),
        }
    )
    df.loc[2:10:2, "C"] = np.nan

    def op(obj):
        # Call the reducer under test as a method of ``obj``.
        return getattr(obj, op_name)()

    # single column
    grouped = df.drop(["B"], axis=1).groupby("A")
    per_group = {cat: op(group["C"]) for cat, group in grouped}
    exp = DataFrame({"C": per_group})
    exp.index.name = "A"
    tm.assert_frame_equal(op(grouped), exp)

    # multiple columns
    grouped = df.groupby(["A", "B"])
    expd = {}
    for (cat1, cat2), group in grouped:
        expd.setdefault(cat1, {})[cat2] = op(group["C"])
    exp = DataFrame(expd).T.stack(dropna=False)
    exp.index.names = ["A", "B"]
    exp.name = "C"

    result = op(grouped)["C"]
    # only sum and prod are compared for the two-key case
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_cython_agg_boolean():
    """``mean`` of a boolean column equals ``agg(np.mean)`` of it."""
    frame = DataFrame(
        {
            "a": np.random.randint(0, 5, 50),
            "b": np.random.randint(0, 2, 50).astype("bool"),
        }
    )
    bool_groups = frame.groupby("a")["b"]

    result = bool_groups.mean()
    expected = frame.groupby("a")["b"].agg(np.mean)

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg():
    """``mean`` over a string column raises ``DataError``."""
    msg = "No numeric types to aggregate"

    # Series groupby on the string column
    frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
    with pytest.raises(DataError, match=msg):
        frame.groupby("a")["b"].mean()

    # single-column frame grouped by an external key
    frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
    with pytest.raises(DataError, match=msg):
        frame[["b"]].groupby(frame["a"]).mean()
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg_with_dates():
    """``mean`` over a datetime column raises ``DataError``."""
    minutes = pd.date_range("now", periods=50, freq="T")
    frame = DataFrame(
        {
            "a": np.random.randint(0, 5, 50),
            "b": ["foo", "bar"] * 25,
            "dates": minutes,
        }
    )

    msg = "No numeric types to aggregate"
    with pytest.raises(DataError, match=msg):
        frame.groupby("b").dates.mean()
|
||||
|
||||
|
||||
def test_cython_agg_frame_columns():
    """Repeated column-wise group means must not raise.

    Regression test for #2113.
    """
    df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})

    # The same call is issued four times in a row; only success is checked.
    for _ in range(4):
        df.groupby(level=0, axis="columns").mean()
|
||||
|
||||
|
||||
def test_cython_agg_return_dict():
    """An aggregator may return a dict; it is stored as the group's value.

    Regression test for GH 16741.
    """
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )

    def counts_as_dict(values):
        return values.value_counts().to_dict()

    result = df.groupby("A")["B"].agg(counts_as_dict)

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_fail_agg():
    """``sum`` on a string Series equals ``agg(np.sum)`` on it."""
    business_days = bdate_range("1/1/2000", periods=50)
    letters = Series(["A", "B", "C", "D", "E"] * 10, index=business_days)

    by_month = letters.groupby(lambda x: x.month)

    summed = by_month.sum()
    expected = by_month.agg(np.sum)
    tm.assert_series_equal(summed, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("add", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """``_cython_agg_general(op)`` matches ``agg`` with the paired callable."""
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    cython_result = df.groupby(labels)._cython_agg_general(op)
    python_result = df.groupby(labels).agg(targop)

    tm.assert_frame_equal(cython_result, python_result)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """``_cython_agg_general`` agrees with ``agg`` when some bins are empty."""
    df = pd.DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    binned = df.groupby(pd.cut(df[0], bin_edges), observed=observed)
    result = binned._cython_agg_general(op)

    # a fresh groupby object is built for the reference computation
    binned = df.groupby(pd.cut(df[0], bin_edges), observed=observed)
    expected = binned.agg(lambda x: targop(x))

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_empty_buckets_nanops(observed):
    """``add`` and ``prod`` fill empty bins with 0 and 1 respectively."""
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = pd.DataFrame([11, 12, 13], columns=["a"])
    grps = range(0, 25, 5)
    intervals = pd.interval_range(0, 20, freq=5)

    # (cython op, expected values per bin, value found in the empty bins)
    cases = (
        ("add", [0, 0, 36, 0], 0),
        ("prod", [1, 1, 1716, 1], 1),
    )
    for how, values, empty_value in cases:
        grouped = df.groupby(pd.cut(df["a"], grps), observed=observed)
        result = grouped._cython_agg_general(how)

        expected = pd.DataFrame(
            {"a": values},
            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
        )
        if observed:
            # with observed set, the empty bins are dropped from the expectation
            expected = expected[expected.a != empty_value]

        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Aggregating one-row groups returns the value, or ``NaT``, unchanged."""
    # https://github.com/pandas-dev/pandas/issues/19526
    values = [data, NaT]
    df = DataFrame({"a": [0, 1], "b": values})

    # We will group by a and test the cython aggregations
    expected = DataFrame({"b": [data, NaT]}, index=Index([0, 1], name="a"))
    result = df.groupby("a").aggregate(op)

    tm.assert_frame_equal(expected, result)
|
@@ -0,0 +1,607 @@
|
||||
"""
|
||||
test all other .agg behavior
|
||||
"""
|
||||
|
||||
from collections import OrderedDict
|
||||
import datetime as dt
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
from pandas.core.groupby.groupby import SpecificationError
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
|
||||
def test_agg_api():
    """A custom function gives the same values alone or wrapped in a list."""
    # GH 6337
    # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
    # different api for agg when passed custom function with mixed frame
    df = DataFrame(
        {
            "data1": np.random.randn(5),
            "data2": np.random.randn(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = df.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    # The list form is relabelled with plain column names before comparing.
    expected = grouped.agg([peak_to_peak])
    expected.columns = ["data1", "data2"]

    result = grouped.agg(peak_to_peak)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_datetimes_mixed():
    """Grouping on string dates and on ``date`` objects gives as many groups."""
    raw_rows = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    def build_frame(rows):
        # Each row is laid out as [key, date, value].
        return DataFrame(
            {
                "key": [row[0] for row in rows],
                "date": [row[1] for row in rows],
                "value": [row[2] for row in rows],
            }
        )

    df1 = build_frame(raw_rows)

    # Same rows, with the date strings parsed into ``datetime.date`` objects.
    parsed_rows = [
        [
            row[0],
            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
            row[2],
        ]
        for row in raw_rows
    ]
    df2 = build_frame(parsed_rows)

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate(np.sum)

    # the weights for df2 are computed from df1's "value" column
    df2["weights"] = df1["value"] / df1["value"].sum()
    gb2 = df2.groupby("date").aggregate(np.sum)

    assert len(gb1) == len(gb2)
|
||||
|
||||
|
||||
def test_agg_period_index():
    """Grouping on a ``PeriodIndex`` level keeps the index type."""
    monthly = period_range("2012-1-1", freq="M", periods=3)
    df = DataFrame(np.random.randn(3, 2), index=monthly)
    summed = df.groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    df = DataFrame.from_dict(OrderedDict([("s1", s1), ("s2", s2)]))
    by_month = df.groupby(df.index.month)
    # iterating over the groups must not raise
    list(by_month)
|
||||
|
||||
|
||||
def test_agg_dict_parameter_cast_result_dtypes():
    """``first``/``last``/count agree across the method, string and dict forms.

    Regression test for GH 12821.
    """
    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="H"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # For each reducer: the rows of ``df`` that make up the expected result.
    cases = (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7]))
    for how, rows in cases:
        # test for `first` / `last` function
        exp = df.loc[rows].set_index("class")
        grouped = df.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    class_index = Index(list("ABCD"), name="class")
    exp = pd.Series([2, 2, 2, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.agg(len), exp)
    tm.assert_series_equal(grouped.time.size(), exp)

    exp = pd.Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.count(), exp)
|
||||
|
||||
|
||||
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column equals ``count`` of that column."""
    # similar to GH12821
    # xref #11444
    first_of_month = [dt.datetime(2015, month + 1, 1) for month in range(12)]
    keys = list("aaabbbbbbccd")
    df = pd.DataFrame({"X": keys, "Y": first_of_month})

    dates_by_key = df.groupby("X")["Y"]
    result = dates_by_key.agg(len)
    expected = df.groupby("X")["Y"].count()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_float64_no_int64():
    """Group means of integer columns are compared against float values (gh-11199)."""
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})

    group_means = [1, 2.5, 4, 5]
    # Same check for a one-column and a two-column selection.
    for selected in (["a"], ["a", "c"]):
        expected = DataFrame(
            {col: group_means for col in selected}, index=[1, 2, 4, 5]
        )
        expected.index.name = "b"

        result = df.groupby("b")[selected].mean()
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_api_consistency():
    """Check that list- and dict-based aggregation specs agree (GH 9052).

    Every spec is compared against a frame assembled by hand from the
    per-column ``mean``/``sum`` results of the same groupby.
    """
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    # list of reducers on a single column
    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of reducers on every column
    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of reducers on an explicit column selection
    result = grouped[["D", "C"]].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping column -> single reducer
    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping column -> list of reducers
    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # Without this comparison the case above is computed but never checked,
    # because ``result``/``expected`` are overwritten by the next case.
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict whose keys are new names rather than columns: warns, still aggregates
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
    expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]])
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
def test_agg_dict_renaming_deprecation():
    """Dict-based renaming in ``agg`` must emit a ``FutureWarning`` (15931)."""
    df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

    nested_spec = {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w:
        df.groupby("A").agg(nested_spec)
        first_message = str(w[0].message)
        assert "using a dict with renaming" in first_message
        assert "named aggregation" in first_message

    # Keys that are not column names on a multi-column selection.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})

    # Dict passed to a single-column (Series) groupby.
    with tm.assert_produces_warning(FutureWarning) as w:
        df.groupby("A").B.agg({"foo": "count"})
        first_message = str(w[0].message)
        assert "using a dict on a Series for aggregation" in first_message
        assert "named aggregation instead." in first_message
|
||||
|
||||
|
||||
def test_agg_compat():
    """Dict specs on a single-column groupby warn but still produce frames (GH 12334)."""
    group_a = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    group_b = ["one", "one", "two", "two", "two", "two", "one", "two"]
    df = DataFrame(
        {"A": group_a, "B": group_b, "C": np.random.randn(8) + 1.0, "D": np.arange(8)}
    )

    g = df.groupby(["A", "B"])

    # (aggregation spec, column labels expected on the sum/std frame)
    cases = [
        (
            {"C": ["sum", "std"]},
            MultiIndex.from_tuples([("C", "sum"), ("C", "std")]),
        ),
        ({"C": "sum", "D": "std"}, ["C", "D"]),
    ]
    for spec, labels in cases:
        expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1)
        expected.columns = labels

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = g["D"].agg(spec)
        tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
def test_agg_nested_dicts():
    """Exercise nested-dict aggregation specs.

    API change for disallowing these types of nested dicts.
    """
    group_a = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    group_b = ["one", "one", "two", "two", "two", "two", "one", "two"]
    df = DataFrame(
        {"A": group_a, "B": group_b, "C": np.random.randn(8) + 1.0, "D": np.arange(8)}
    )

    g = df.groupby(["A", "B"])

    # Outer keys that are new names rather than columns are rejected outright.
    msg = r"cannot perform renaming for r[1-2] with a nested dictionary"
    renaming_spec = {"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}
    with pytest.raises(SpecificationError, match=msg):
        g.aggregate(renaming_spec)

    # Outer keys that are columns only warn.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})
    pieces = [getattr(g[col], how)() for col in ("C", "D") for how in ("mean", "std")]
    expected = pd.concat(pieces, axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [(alias, how) for alias in ("ra", "rb") for how in ("mean", "std")]
    )
    tm.assert_frame_equal(result, expected, check_like=True)

    # same name as the original column
    # GH9052
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        renamed = g["D"].agg({"result1": np.sum, "result2": np.mean})
    expected = renamed.rename(columns={"result1": "D"})

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g["D"].agg({"D": np.sum, "result2": np.mean})
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
def test_agg_item_by_item_raise_typeerror():
    """A ``TypeError`` raised by the aggregation function must reach the caller."""
    values = np.random.randint(10, size=(20, 10))
    df = DataFrame(values)

    def always_raises(chunk):
        # Format the received data first, then fail unconditionally.
        pprint_thing("----------------------------------------")
        pprint_thing(chunk.to_string())
        raise TypeError("test")

    by_first_column = df.groupby(0)
    with pytest.raises(TypeError, match="test"):
        by_first_column.agg(always_raises)
|
||||
|
||||
|
||||
def test_series_agg_multikey():
    """``agg(np.sum)`` and ``sum()`` must agree for a Series grouped by two callables."""
    series = tm.makeTimeSeries()
    key_funcs = [lambda stamp: stamp.year, lambda stamp: stamp.month]
    by_year_month = series.groupby(key_funcs)

    tm.assert_series_equal(by_year_month.agg(np.sum), by_year_month.sum())
|
||||
|
||||
|
||||
def test_series_agg_multi_pure_python():
    """A Python-level aggregator that inspects ``values.base`` matches a trivial one."""
    columns = {
        "A": ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3,
        "B": [
            "one", "one", "one", "two", "one", "one",
            "one", "two", "two", "two", "one",
        ],
        "C": [
            "dull", "dull", "shiny", "dull", "dull", "shiny",
            "shiny", "dull", "shiny", "shiny", "shiny",
        ],
    }
    # Three random float columns, drawn in the order D, E, F.
    for random_col in ("D", "E", "F"):
        columns[random_col] = np.random.randn(11)
    data = DataFrame(columns)

    def bad(x):
        # Fails unless the values handed to the aggregator have a non-empty base.
        assert len(x.values.base) > 0
        return "foo"

    keys = ["A", "B"]
    result = data.groupby(keys).agg(bad)
    expected = data.groupby(keys).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_consistency():
    """``agg([func])`` and ``agg(func)`` must give consistent results.

    agg with ([]) and () not consistent -- GH 6715
    """

    def P1(a):
        # 1st percentile of the non-null values; any failure yields NaN.
        try:
            return np.percentile(a.dropna(), q=1)
        except Exception:
            return np.nan

    days = [dt.date(2013, 2, day) for day in (10, 10, 11, 11)]
    df = DataFrame({"col1": [1, 2, 3, 4], "col2": [10, 25, 26, 31], "date": days})

    g = df.groupby("date")

    # The list form adds a function-name level; keep only the column level.
    expected = g.agg([P1])
    expected.columns = expected.columns.levels[0]

    tm.assert_frame_equal(g.agg(P1), expected)
|
||||
|
||||
|
||||
def test_agg_callables():
    """Different spellings of a summing callable must aggregate identically (GH 7929)."""
    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class SumViaCall:
        """Callable object that delegates to the builtin ``sum``."""

        def __call__(self, values):
            return sum(values)

    # builtin, numpy, two lambdas, a partial and a callable instance
    candidates = [
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        SumViaCall(),
    ]

    expected = df.groupby("foo").agg(sum)
    for candidate in candidates:
        tm.assert_frame_equal(df.groupby("foo").agg(candidate), expected)
|
||||
|
||||
|
||||
def test_agg_over_numpy_arrays():
    """Summing a column whose cells are numpy arrays adds them element-wise (GH 3788)."""
    rows = [
        [1, np.array([10, 20, 30])],
        [1, np.array([40, 50, 60])],
        [2, np.array([20, 30, 40])],
    ]
    df = pd.DataFrame(rows, columns=["category", "arraydata"])
    result = df.groupby("category").agg(sum)

    expected = pd.DataFrame(
        [[np.array([50, 70, 90])], [np.array([20, 30, 40])]],
        index=pd.Index([1, 2], name="category"),
        columns=["arraydata"],
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_timezone_round_trip():
    """Timezone-aware timestamps must come back unchanged from groupby (GH 15426)."""
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    stamps = [ts + dt.timedelta(minutes=offset) for offset in range(10)]
    df = pd.DataFrame({"a": 1, "b": stamps})

    # Three spellings of "minimum of the group"; each must return ``ts``.
    minima = [
        df.groupby("a")["b"].agg(np.min).iloc[0],
        df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0],
        df.groupby("a")["b"].min().iloc[0],
    ]
    for smallest in minima:
        assert smallest == ts

    dates = []
    for i in range(1, 5):
        stamp_text = "2016-01-0{i:d} 12:00:00".format(i=i)
        dates.append(pd.Timestamp(stamp_text, tz="US/Pacific"))
    df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates})
    grouped = df.groupby("A")

    ts = df["B"].iloc[0]
    for picked in (grouped.nth(0), grouped.head(1), grouped.first()):
        assert ts == picked["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    first_rows = grouped.apply(lambda x: x.iloc[0])
    assert ts == first_rows.iloc[0, 0]

    ts = df["B"].iloc[2]
    assert ts == grouped.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    last_rows = grouped.apply(lambda x: x.iloc[-1])
    assert ts == last_rows.iloc[0, 0]
|
||||
|
||||
|
||||
def test_sum_uint64_overflow():
    """Group sums beyond the int64 range are compared exactly (see gh-14758).

    Convert to uint64 and don't overflow.
    """
    offset = 9223372036854775807  # every cell below is shifted by this amount
    df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) + offset

    # Column 0 holds offset + 1/3/5 (the group keys), column 1 offset + 2/4/6.
    index = pd.Index([offset + 1, offset + 3, offset + 5], dtype=np.uint64, name=0)
    expected = pd.DataFrame({1: [offset + 2, offset + 4, offset + 6]}, index=index)

    result = df.groupby(0).sum()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Aggregating a frame with ``tuple``/``list`` keeps each group as one container."""
    columns = {
        "A": [1, 1, 1, 3, 3, 3],
        "B": [1, 1, 1, 4, 4, 4],
        "C": [1, 1, 1, 3, 4, 4],
    }
    frame = pd.DataFrame(columns)

    result = frame.groupby(["A", "B"]).aggregate(structure)
    # The parametrized frames are built without index names; label them here.
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    """Aggregating one column with ``tuple``/``list`` gives one container per group.

    Issue #18079
    """
    columns = {
        "A": [1, 1, 1, 3, 3, 3],
        "B": [1, 1, 1, 4, 4, 4],
        "C": [1, 1, 1, 3, 4, 4],
    }
    frame = pd.DataFrame(columns)

    result = frame.groupby("A")["C"].aggregate(structure)
    # The parametrized series are built without an index name; label it here.
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_category_nansum(observed):
    """``np.nansum`` over a categorical key, with and without unobserved categories."""
    categories = ["a", "b", "c"]
    keys = pd.Categorical(["a", "a", "b"], categories=categories)
    df = pd.DataFrame({"A": keys, "B": [1, 2, 3]})

    result = df.groupby("A", observed=observed).B.agg(np.nansum)

    all_categories = pd.CategoricalIndex(
        ["a", "b", "c"], categories=categories, name="A"
    )
    expected = pd.Series([3, 3, 0], index=all_categories, name="B")
    if observed:
        # Only the zero-sum entry belongs to a category absent from the data.
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_list_like_func():
    """A lambda returning a list may be used as a per-column aggregator (GH 18473)."""
    labels = [str(x) for x in range(3)]
    df = pd.DataFrame({"A": list(labels), "B": list(labels)})

    grouped = df.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})

    # Every group holds exactly one row, so each cell is a one-element list.
    expected = pd.DataFrame({"A": list(labels), "B": [[label] for label in labels]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_lambda_with_timezone():
    """A lambda aggregator on a tz-aware column keeps the timezone (GH 23683)."""
    first_day = pd.Timestamp("2018-01-01", tz="UTC")
    second_day = pd.Timestamp("2018-01-02", tz="UTC")
    df = pd.DataFrame({"tag": [1, 1], "date": [first_day, second_day]})

    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})

    expected = pd.DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=pd.Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)
|
Reference in New Issue
Block a user