8th day of python challenges 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions

View File

@@ -0,0 +1,562 @@
"""
test .agg behavior / note that .apply is tested generally in test_groupby.py
"""
from collections import OrderedDict
import functools
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
from pandas.core.base import SpecificationError
from pandas.core.groupby.generic import _maybe_mangle_lambdas
from pandas.core.groupby.grouper import Grouping
import pandas.util.testing as tm
def test_agg_regression1(tsframe):
    """``agg(np.mean)`` must match ``mean()`` when grouping by year and month."""
    by_year_month = tsframe.groupby(
        [lambda stamp: stamp.year, lambda stamp: stamp.month]
    )
    tm.assert_frame_equal(by_year_month.agg(np.mean), by_year_month.mean())
def test_agg_must_agg(df):
    """Callables that do not reduce a group raise "Must produce aggregated value"."""
    c_by_a = df.groupby("A")["C"]
    expected_message = "Must produce aggregated value"

    # One callable returns a describe() result, the other an index slice.
    non_reducers = (lambda x: x.describe(), lambda x: x.index[:2])
    for non_reducer in non_reducers:
        with pytest.raises(Exception, match=expected_message):
            c_by_a.agg(non_reducer)
def test_agg_ser_multi_key(df):
    """Aggregating a Series grouped by two key Series matches the frame result.

    Column ``C`` is grouped by the ``A`` and ``B`` columns (passed as Series)
    and reduced with a Python-level ``sum`` callable. The outcome must equal
    selecting ``C`` from the frame grouped by the same two column names.
    """
    # The former unused ``ser = df.C`` local was dropped; it had no effect.
    results = df.C.groupby([df.A, df.B]).aggregate(lambda x: x.sum())
    expected = df.groupby(["A", "B"]).sum()["C"]
    tm.assert_series_equal(results, expected)
def test_groupby_aggregation_mixed_dtype():
    """Group means with keys that mix integers, strings and NaN (GH 6212)."""
    by1_keys = ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12]
    by2_keys = ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, np.nan]
    frame = DataFrame(
        {
            "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
            "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
            "by1": by1_keys,
            "by2": by2_keys,
        }
    )

    expected_index = MultiIndex.from_tuples(
        [
            (1, 95),
            (1, 99),
            (2, 95),
            (2, 99),
            ("big", "damp"),
            ("blue", "dry"),
            ("red", "red"),
            ("red", "wet"),
        ],
        names=["by1", "by2"],
    )
    expected = DataFrame(
        {
            "v1": [5, 5, 7, np.nan, 3, 3, 4, 1],
            "v2": [55, 55, 77, np.nan, 33, 33, 44, 11],
        },
        index=expected_index,
    )

    means = frame.groupby(["by1", "by2"])[["v1", "v2"]].mean()
    tm.assert_frame_equal(means, expected)
def test_agg_apply_corner(ts, tsframe):
    """Grouping by an all-NaN key gives empty results for sum, agg and apply."""
    # Series: every key is NaN, so there is nothing to group
    series_groups = ts.groupby(ts * np.nan)
    assert ts.dtype == np.float64

    # grouping by float64 values results in a Float64Index
    empty_series = Series(
        [], dtype=np.float64, index=pd.Index([], dtype=np.float64)
    )
    tm.assert_series_equal(series_groups.sum(), empty_series)
    tm.assert_series_equal(series_groups.agg(np.sum), empty_series)
    tm.assert_series_equal(
        series_groups.apply(np.sum), empty_series, check_index_type=False
    )

    # DataFrame: same situation, keyed on an all-NaN version of column "A"
    frame_groups = tsframe.groupby(tsframe["A"] * np.nan)
    empty_frame = DataFrame(
        columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64)
    )
    tm.assert_frame_equal(frame_groups.sum(), empty_frame, check_names=False)
    tm.assert_frame_equal(frame_groups.agg(np.sum), empty_frame, check_names=False)
    # apply is compared against the expected frame with no columns selected
    tm.assert_frame_equal(
        frame_groups.apply(np.sum), empty_frame.iloc[:, :0], check_names=False
    )
def test_agg_grouping_is_list_tuple(ts):
    """``agg(np.mean)`` equals ``mean()`` when the grouping is a list or a tuple."""
    frame = tm.makeTimeDataFrame()
    by_year = frame.groupby(lambda stamp: stamp.year)
    original_grouper = by_year.grouper.groupings[0].grouper

    # Swap the first grouping for one built from a list, then from a tuple.
    for container in (list, tuple):
        by_year.grouper.groupings[0] = Grouping(
            ts.index, container(original_grouper)
        )
        result = by_year.agg(np.mean)
        expected = by_year.mean()
        tm.assert_frame_equal(result, expected)
def test_agg_python_multiindex(mframe):
    """``agg(np.mean)`` equals ``mean()`` when grouping ``mframe`` by "A" and "B"."""
    by_a_b = mframe.groupby(["A", "B"])
    tm.assert_frame_equal(by_a_b.agg(np.mean), by_a_b.mean())
@pytest.mark.parametrize(
    "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
)
def test_aggregate_str_func(tsframe, groupbyfunc):
    """Aggregating by function name matches calling the same-named method."""
    by_func = tsframe.groupby(groupbyfunc)

    # a single column reduced by name
    tm.assert_series_equal(by_func["A"].agg("std"), by_func["A"].std())

    # the whole frame reduced by name
    tm.assert_frame_equal(by_func.aggregate("var"), by_func.var())

    # one function name per column, supplied as an ordered mapping
    how_by_column = OrderedDict(
        [("A", "var"), ("B", "std"), ("C", "mean"), ("D", "sem")]
    )
    result = by_func.agg(how_by_column)
    expected = DataFrame(
        OrderedDict(
            (column, getattr(by_func[column], how)())
            for column, how in how_by_column.items()
        )
    )
    tm.assert_frame_equal(result, expected)
def test_aggregate_item_by_item(df):
    """A size-returning callable is applied per column and per group."""
    by_a = df.groupby("A")
    sizes = by_a.agg(lambda ser: ser.size)
    n_columns = len(sizes.columns)

    # GH5782
    # odd comparisons can result here, so cast to float64 to keep them simple
    checks = (("foo", tm.assert_series_equal), ("bar", tm.assert_almost_equal))
    for label, check in checks:
        rows_in_group = (df.A == label).sum()
        expected = pd.Series(
            np.array([rows_in_group] * n_columns),
            index=list("BCD"),
            dtype=np.float64,
            name=label,
        )
        check(sizes.xs(label), expected)

    def aggfun(ser):
        return ser.size

    # An empty frame grouped by an external key still yields an empty DataFrame.
    empty_result = DataFrame().groupby(df.A).agg(aggfun)
    assert isinstance(empty_result, DataFrame)
    assert len(empty_result) == 0
def test_wrap_agg_out(three_group):
    """Columns whose aggregation raises ``TypeError`` are left out of the result.

    ``func`` raises ``TypeError`` for object-dtype input and sums everything
    else. The aggregated frame must equal the same aggregation run on the
    frame with column "C" removed up front.
    """
    grouped = three_group.groupby(["A", "B"])

    def func(ser):
        # Compare against the builtin ``object``: the ``np.object`` alias is
        # deprecated and absent from newer NumPy releases.
        if ser.dtype == object:
            raise TypeError
        return ser.sum()

    result = grouped.aggregate(func)
    exp_grouped = three_group.loc[:, three_group.columns != "C"]
    expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
    tm.assert_frame_equal(result, expected)
def test_agg_multiple_functions_maintain_order(df):
    """Result columns follow the order of the (name, function) pairs (GH #610)."""
    named_funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
    aggregated = df.groupby("A")["C"].agg(named_funcs)
    tm.assert_index_equal(aggregated.columns, Index(["mean", "max", "min"]))
def test_multiple_functions_tuples_and_non_tuples(df):
    """A bare function name behaves like a (name, name) tuple in a list (#1359)."""
    mixed = [("foo", "mean"), "std"]
    all_tuples = [("foo", "mean"), ("std", "std")]

    # Check the single-column selection first, then the whole frame; a fresh
    # groupby object is built for every aggregation.
    selections = (lambda: df.groupby("A")["C"], lambda: df.groupby("A"))
    for select in selections:
        result = select().agg(mixed)
        expected = select().agg(all_tuples)
        tm.assert_frame_equal(result, expected)
def test_more_flexible_frame_multi_function(df):
    """Dict specs may map columns to one function, a list, or a renaming dict."""
    by_a = df.groupby("A")

    # Expected: per-function results joined and reordered to (column, function).
    means = by_a.agg(OrderedDict([("C", np.mean), ("D", np.mean)]))
    stds = by_a.agg(OrderedDict([("C", np.std), ("D", np.std)]))
    expected = concat([means, stds], keys=["mean", "std"], axis=1)
    expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

    list_spec = OrderedDict([("C", [np.mean, np.std]), ("D", [np.mean, np.std])])
    result = by_a.aggregate(list_spec)
    tm.assert_frame_equal(result, expected)

    # be careful: a single function for one column, a list for the other
    result = by_a.aggregate(OrderedDict([("C", np.mean), ("D", [np.mean, np.std])]))
    expected = by_a.aggregate(
        OrderedDict([("C", np.mean), ("D", [np.mean, np.std])])
    )
    tm.assert_frame_equal(result, expected)

    def foo(x):
        return np.mean(x)

    def bar(x):
        return np.std(x, ddof=1)

    # this uses column selection & renaming, which emits a FutureWarning
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        renaming_spec = OrderedDict(
            [("C", np.mean), ("D", OrderedDict([("foo", np.mean), ("bar", np.std)]))]
        )
        result = by_a.aggregate(renaming_spec)

    named_spec = OrderedDict([("C", [np.mean]), ("D", [foo, bar])])
    expected = by_a.aggregate(named_spec)
    tm.assert_frame_equal(result, expected)
def test_multi_function_flexible_mix(df):
    """A name, a one-item list and a one-item dict for "D" all agree (GH #1268)."""
    by_a = df.groupby("A")

    def build_spec(d_spec):
        # "C" always uses the renaming dict; only the "D" entry varies.
        c_spec = OrderedDict([("foo", "mean"), ("bar", "std")])
        return OrderedDict([("C", c_spec), ("D", d_spec)])

    # Expected
    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = by_a.aggregate(build_spec({"sum": "sum"}))

    # Test 1 passes the bare name, Test 2 a single-element list
    for d_spec in ("sum", ["sum"]):
        spec = build_spec(d_spec)
        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = by_a.aggregate(spec)
        tm.assert_frame_equal(result, expected)
def test_groupby_agg_coercing_bools():
    """Boolean results of a lambda aggregation are compared as booleans (issue 14873)."""
    frame = pd.DataFrame(
        {"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]}
    )
    by_a = frame.groupby("a")
    group_index = Index([1, 2], name="a")

    all_nonzero = by_a["b"].aggregate(lambda x: (x != 0).all())
    tm.assert_series_equal(
        all_nonzero, Series([False, True], index=group_index, name="b")
    )

    all_missing = by_a["c"].aggregate(lambda x: x.isnull().all())
    tm.assert_series_equal(
        all_missing, Series([True, False], index=group_index, name="c")
    )
def test_order_aggregate_multiple_funcs():
    """The function level of the result columns keeps the requested order (GH 25692)."""
    how = ["sum", "max", "mean", "ohlc", "min"]
    frame = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})
    aggregated = frame.groupby("A").agg(how)
    function_level = aggregated.columns.levels[1]
    tm.assert_index_equal(
        function_level, pd.Index(["sum", "max", "mean", "ohlc", "min"])
    )
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
def test_uint64_type_handling(dtype, how):
    """Aggregating after casting "x" to ``dtype`` matches the uncast result (GH 26310).

    The result column is cast to int64 before the exact comparison.
    """
    frame = pd.DataFrame({"x": 6903052872240755750, "y": [1, 2]})
    expected = frame.groupby("y").agg({"x": how})

    frame.x = frame.x.astype(dtype)
    aggregated = frame.groupby("y").agg({"x": how})
    aggregated.x = aggregated.x.astype(np.int64)

    tm.assert_frame_equal(aggregated, expected, check_exact=True)
class TestNamedAggregationSeries:
    """Keyword-style (named) aggregation on a grouped Series."""

    def test_series_named_agg(self):
        """Each keyword becomes an output column holding that aggregation."""
        grouped = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
        expected = pd.DataFrame(
            {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=[0, 1]
        )
        tm.assert_frame_equal(grouped.agg(a="sum", b="min"), expected)

        swapped = grouped.agg(b="min", a="sum")
        # Python 3.5 and earlier sort the columns, so only reorder on 3.6+
        if compat.PY36:
            expected = expected[["b", "a"]]
        tm.assert_frame_equal(swapped, expected)

    def test_no_args_raises(self):
        """``agg()`` with no arguments raises, while an empty list is accepted."""
        grouped = pd.Series([1, 2]).groupby([0, 1])
        with pytest.raises(TypeError, match="Must provide"):
            grouped.agg()

        # but we do allow this
        tm.assert_frame_equal(grouped.agg([]), pd.DataFrame())

    def test_series_named_agg_duplicates_raises(self):
        """Using the same function under two keywords raises SpecificationError."""
        # This is a limitation of the named agg implementation reusing
        # aggregate_multiple_funcs. It could maybe be lifted in the future.
        grouped = pd.Series([1, 2, 3]).groupby([0, 0, 1])
        with pytest.raises(SpecificationError):
            grouped.agg(a="sum", b="sum")

    def test_mangled(self):
        """Two lambdas can be passed under different keywords."""
        grouped = pd.Series([1, 2, 3]).groupby([0, 0, 1])
        aggregated = grouped.agg(a=lambda x: 0, b=lambda x: 1)
        tm.assert_frame_equal(
            aggregated, pd.DataFrame({"a": [0, 0], "b": [1, 1]})
        )
class TestNamedAggregationDataFrame:
    """Keyword-style (named) aggregation on a grouped DataFrame."""

    def test_agg_relabel(self):
        """Keywords of (column, function) pairs name the output columns."""
        frame = pd.DataFrame(
            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
        )
        aggregated = frame.groupby("group").agg(
            a_max=("A", "max"), b_max=("B", "max")
        )
        expected = pd.DataFrame(
            {"a_max": [1, 3], "b_max": [6, 8]},
            index=pd.Index(["a", "b"], name="group"),
            columns=["a_max", "b_max"],
        )
        tm.assert_frame_equal(aggregated, expected)

        # order invariance
        p98 = functools.partial(np.percentile, q=98)
        aggregated = frame.groupby("group").agg(
            b_min=("B", "min"),
            a_min=("A", min),
            a_mean=("A", np.mean),
            a_max=("A", "max"),
            b_max=("B", "max"),
            a_98=("A", p98),
        )
        keyword_order = ["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"]
        expected = pd.DataFrame(
            {
                "b_min": [5, 7],
                "a_min": [0, 2],
                "a_mean": [0.5, 2.5],
                "a_max": [1, 3],
                "b_max": [6, 8],
                "a_98": [0.98, 2.98],
            },
            index=pd.Index(["a", "b"], name="group"),
            columns=keyword_order,
        )
        # Before Python 3.6 the expected columns are in alphabetical order.
        if not compat.PY36:
            expected = expected[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]]
        tm.assert_frame_equal(aggregated, expected)

    def test_agg_relabel_non_identifier(self):
        """Output names that are not identifiers can be passed via ``**``."""
        frame = pd.DataFrame(
            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
        )
        aggregated = frame.groupby("group").agg(**{"my col": ("A", "max")})
        expected = pd.DataFrame(
            {"my col": [1, 3]}, index=pd.Index(["a", "b"], name="group")
        )
        tm.assert_frame_equal(aggregated, expected)

    def test_duplicate_raises(self):
        """The same (column, function) pair under two keywords raises."""
        # TODO: we currently raise on multiple lambdas. We could *maybe*
        # update com.get_callable_name to append `_i` to each lambda.
        frame = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        with pytest.raises(SpecificationError, match="Function names"):
            frame.groupby("A").agg(a=("A", "min"), b=("A", "min"))

    def test_agg_relabel_with_level(self):
        """Named aggregation works when grouping by an index level."""
        frame = pd.DataFrame(
            {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
            index=pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]),
        )
        aggregated = frame.groupby(level=0).agg(
            aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
        )
        expected = pd.DataFrame(
            {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
        )
        tm.assert_frame_equal(aggregated, expected)

    def test_agg_relabel_other_raises(self):
        """Malformed keyword values and a bare ``agg()`` raise "Must provide"."""
        by_a = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}).groupby("A")
        expected_message = "Must provide"

        with pytest.raises(TypeError, match=expected_message):
            by_a.agg(foo=1)

        with pytest.raises(TypeError, match=expected_message):
            by_a.agg()

        with pytest.raises(TypeError, match=expected_message):
            by_a.agg(a=("B", "max"), b=(1, 2, 3))

    def test_missing_raises(self):
        """Referring to a column that does not exist raises KeyError."""
        frame = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        with pytest.raises(KeyError, match="Column 'C' does not exist"):
            frame.groupby("A").agg(c=("C", "sum"))

    def test_agg_namedtuple(self):
        """``pd.NamedAgg`` gives the same result as plain tuples."""
        frame = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        with_namedagg = frame.groupby("A").agg(
            b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
        )
        with_tuples = frame.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
        tm.assert_frame_equal(with_namedagg, with_tuples)

    def test_mangled(self):
        """Lambdas on different columns can be passed under different keywords."""
        frame = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
        aggregated = frame.groupby("A").agg(
            b=("B", lambda x: 0), c=("C", lambda x: 1)
        )
        expected = pd.DataFrame(
            {"b": [0, 0], "c": [1, 1]}, index=pd.Index([0, 1], name="A")
        )
        tm.assert_frame_equal(aggregated, expected)
class TestLambdaMangling:
    """Tests for ``_maybe_mangle_lambdas`` and aggregation with several lambdas."""

    def test_maybe_mangle_lambdas_passthrough(self):
        """Strings, a bare lambda and a one-lambda list keep their names."""
        assert _maybe_mangle_lambdas("mean") == "mean"
        assert _maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>"
        # don't mangle single lambda.
        assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>"

    def test_maybe_mangle_lambdas_listlike(self):
        """Lambdas in a list are renamed by position and return the same values."""
        aggfuncs = [lambda x: 1, lambda x: 2]
        result = _maybe_mangle_lambdas(aggfuncs)
        assert result[0].__name__ == "<lambda_0>"
        assert result[1].__name__ == "<lambda_1>"
        assert aggfuncs[0](None) == result[0](None)
        assert aggfuncs[1](None) == result[1](None)

    def test_maybe_mangle_lambdas(self):
        """Lambdas inside a dict-of-lists spec are renamed by position."""
        func = {"A": [lambda x: 0, lambda x: 1]}
        result = _maybe_mangle_lambdas(func)
        assert result["A"][0].__name__ == "<lambda_0>"
        assert result["A"][1].__name__ == "<lambda_1>"

    def test_maybe_mangle_lambdas_args(self):
        """Mangled lambdas accept the same positional and keyword arguments."""
        func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]}
        result = _maybe_mangle_lambdas(func)
        assert result["A"][0].__name__ == "<lambda_0>"
        assert result["A"][1].__name__ == "<lambda_1>"

        # The input lambdas still behave as written.
        assert func["A"][0](0, 1) == (0, 1, 1)
        assert func["A"][0](0, 1, 2) == (0, 1, 2)
        assert func["A"][0](0, 2, b=3) == (0, 2, 3)

        # The mangled callable must forward arguments identically; checking
        # only the input lambdas would say nothing about the mangling.
        mangled = result["A"][0]
        assert mangled(0, 1) == (0, 1, 1)
        assert mangled(0, 1, 2) == (0, 1, 2)
        assert mangled(0, 2, b=3) == (0, 2, 3)

    def test_maybe_mangle_lambdas_named(self):
        """A spec without lambdas comes back equal to the input."""
        func = OrderedDict(
            [("C", np.mean), ("D", OrderedDict([("foo", np.mean), ("bar", np.mean)]))]
        )
        result = _maybe_mangle_lambdas(func)
        assert result == func

    def test_basic(self):
        """Two lambdas on one column give ``<lambda_0>`` / ``<lambda_1>`` columns."""
        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})

        expected = pd.DataFrame(
            {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
            index=pd.Index([0, 1], name="A"),
        )
        tm.assert_frame_equal(result, expected)

    def test_mangle_series_groupby(self):
        """A list of lambdas on a grouped Series gives mangled column names."""
        gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
        result = gr.agg([lambda x: 0, lambda x: 1])
        expected = pd.DataFrame({"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
    def test_with_kwargs(self):
        """Extra args/kwargs passed alongside a list of lambdas (marked xfail)."""
        f1 = lambda x, y, b=1: x.sum() + y + b
        f2 = lambda x, y, b=2: x.sum() + y * b
        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
        expected = pd.DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
        tm.assert_frame_equal(result, expected)

        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
        expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
        tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,238 @@
"""
test cython .agg behavior
"""
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range
from pandas.core.groupby.groupby import DataError
import pandas.util.testing as tm
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Group-level reductions match applying the same method to each group."""
    frame = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.randn(12),
        }
    )
    frame.loc[2:10:2, "C"] = np.nan

    def reduce_with_op(obj):
        # Call the method named ``op_name`` on a group, Series or groupby.
        return getattr(obj, op_name)()

    # single column
    by_a = frame.drop(["B"], axis=1).groupby("A")
    per_group = {key: reduce_with_op(group["C"]) for key, group in by_a}
    expected = DataFrame({"C": per_group})
    expected.index.name = "A"
    tm.assert_frame_equal(reduce_with_op(by_a), expected)

    # multiple columns
    by_a_b = frame.groupby(["A", "B"])
    nested = {}
    for (key_a, key_b), group in by_a_b:
        nested.setdefault(key_a, {})[key_b] = reduce_with_op(group["C"])
    expected = DataFrame(nested).T.stack(dropna=False)
    expected.index.names = ["A", "B"]
    expected.name = "C"

    result = reduce_with_op(by_a_b)["C"]
    # the two-key result is only compared for sum and prod
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, expected)
def test_cython_agg_boolean():
    """``mean()`` of a boolean column equals ``agg(np.mean)`` per group."""
    keys = np.random.randint(0, 5, 50)
    flags = np.random.randint(0, 2, 50).astype("bool")
    frame = DataFrame({"a": keys, "b": flags})

    via_method = frame.groupby("a")["b"].mean()
    via_agg = frame.groupby("a")["b"].agg(np.mean)
    tm.assert_series_equal(via_method, via_agg)
def test_cython_agg_nothing_to_agg():
    """``mean()`` over a string-only selection raises DataError."""
    expected_message = "No numeric types to aggregate"

    def build_frame():
        # Random integer keys in "a" and an alternating string column "b".
        return DataFrame(
            {"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}
        )

    # selecting the string column from the groupby
    frame = build_frame()
    with pytest.raises(DataError, match=expected_message):
        frame.groupby("a")["b"].mean()

    # grouping a one-column string frame by an external key
    frame = build_frame()
    with pytest.raises(DataError, match=expected_message):
        frame[["b"]].groupby(frame["a"]).mean()
def test_cython_agg_nothing_to_agg_with_dates():
    """``mean()`` of a datetime column selected from a groupby raises DataError."""
    columns = {
        "a": np.random.randint(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="T"),
    }
    by_b = DataFrame(columns).groupby("b")
    with pytest.raises(DataError, match="No numeric types to aggregate"):
        by_b.dates.mean()
def test_cython_agg_frame_columns():
    """Repeated column-wise group means run without raising (#2113)."""
    frame = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})

    # The same aggregation is run four times in a row on fresh groupbys.
    for _ in range(4):
        frame.groupby(level=0, axis="columns").mean()
def test_cython_agg_return_dict():
    """An aggregation function may return a dict for each group (GH 16741)."""
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )

    def value_counts_dict(values):
        return values.value_counts().to_dict()

    counts = frame.groupby("A")["B"].agg(value_counts_dict)

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(counts, expected)
def test_cython_fail_agg():
    """``sum()`` of string values per month equals ``agg(np.sum)``."""
    business_days = bdate_range("1/1/2000", periods=50)
    letters = Series(["A", "B", "C", "D", "E"] * 10, index=business_days)
    by_month = letters.groupby(lambda stamp: stamp.month)

    tm.assert_series_equal(by_month.sum(), by_month.agg(np.sum))
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("add", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """``_cython_agg_general(op)`` matches ``agg`` with the paired callable."""
    frame = DataFrame(np.random.randn(1000))
    float_labels = np.random.randint(0, 50, size=1000).astype(float)

    via_cython = frame.groupby(float_labels)._cython_agg_general(op)
    via_agg = frame.groupby(float_labels).agg(targop)
    tm.assert_frame_equal(via_cython, via_agg)
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """``_cython_agg_general`` matches ``agg`` when grouping by ``pd.cut`` bins."""
    frame = pd.DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    binned = frame.groupby(pd.cut(frame[0], bin_edges), observed=observed)
    via_cython = binned._cython_agg_general(op)

    binned = frame.groupby(pd.cut(frame[0], bin_edges), observed=observed)
    via_agg = binned.agg(lambda x: targop(x))

    tm.assert_frame_equal(via_cython, via_agg)
def test_cython_agg_empty_buckets_nanops(observed):
    """``add`` and ``prod`` over ``pd.cut`` bins match hard-coded expectations."""
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    frame = pd.DataFrame([11, 12, 13], columns=["a"])
    bin_edges = range(0, 25, 5)
    intervals = pd.interval_range(0, 20, freq=5)

    # (operation, expected value per bin, value used for bins without data)
    cases = (
        ("add", [0, 0, 36, 0], 0),
        ("prod", [1, 1, 1716, 1], 1),
    )
    for op, per_bin, empty_value in cases:
        binned = frame.groupby(pd.cut(frame["a"], bin_edges), observed=observed)
        result = binned._cython_agg_general(op)

        expected = pd.DataFrame(
            {"a": per_bin},
            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
        )
        # with observed=True, rows holding the empty-bin value are dropped
        if observed:
            expected = expected[expected.a != empty_value]
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """One-row groups holding a value or NaT come back unchanged."""
    # https://github.com/pandas-dev/pandas/issues/19526
    frame = DataFrame({"a": [0, 1], "b": [data, NaT]})

    # We will group by a and test the cython aggregations
    expected = DataFrame({"b": [data, NaT]}, index=Index([0, 1], name="a"))
    aggregated = frame.groupby("a").aggregate(op)
    tm.assert_frame_equal(expected, aggregated)

View File

@@ -0,0 +1,607 @@
"""
test all other .agg behavior
"""
from collections import OrderedDict
import datetime as dt
from functools import partial
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
PeriodIndex,
Series,
date_range,
period_range,
)
from pandas.core.groupby.groupby import SpecificationError
import pandas.util.testing as tm
from pandas.io.formats.printing import pprint_thing
def test_agg_api():
# GH 6337
# http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
# different api for agg when passed custom function with mixed frame
df = DataFrame(
{
"data1": np.random.randn(5),
"data2": np.random.randn(5),
"key1": ["a", "a", "b", "b", "a"],
"key2": ["one", "two", "one", "two", "one"],
}
)
grouped = df.groupby("key1")
def peak_to_peak(arr):
return arr.max() - arr.min()
expected = grouped.agg([peak_to_peak])
expected.columns = ["data1", "data2"]
result = grouped.agg(peak_to_peak)
tm.assert_frame_equal(result, expected)
def test_agg_datetimes_mixed():
    """Grouping by string dates and by ``date`` objects gives equally many groups."""

    def frame_from(rows):
        # Each row is [key, date, value].
        return DataFrame(
            {
                "key": [row[0] for row in rows],
                "date": [row[1] for row in rows],
                "value": [row[2] for row in rows],
            }
        )

    rows = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]
    with_strings = frame_from(rows)

    # Same rows, with the date strings parsed into date objects (None kept).
    rows = [
        [
            key,
            (dt.datetime.strptime(day, "%Y-%m-%d").date() if day else None),
            value,
        ]
        for key, day, value in rows
    ]
    with_dates = frame_from(rows)

    with_strings["weights"] = with_strings["value"] / with_strings["value"].sum()
    by_string = with_strings.groupby("date").aggregate(np.sum)

    # The second frame's weights are computed from the first frame's values.
    with_dates["weights"] = with_strings["value"] / with_strings["value"].sum()
    by_date = with_dates.groupby("date").aggregate(np.sum)

    assert len(by_string) == len(by_date)
def test_agg_period_index():
    """Level-wise sums keep a PeriodIndex; grouping by its months is iterable."""
    monthly = period_range("2012-1-1", freq="M", periods=3)
    frame = DataFrame(np.random.randn(3, 2), index=monthly)
    summed = frame.groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    first = Series(np.random.rand(len(index)), index=index)
    second = Series(np.random.rand(len(index)), index=index)
    frame = DataFrame.from_dict(OrderedDict([("s1", first), ("s2", second)]))
    by_month = frame.groupby(frame.index.month)
    # iterating over every group must not raise
    list(by_month)
def test_agg_dict_parameter_cast_result_dtypes():
    """first/last/len/size/count on a datetime column with missing values (GH 12821)."""
    frame = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="H"),
        }
    )
    frame.loc[[0, 1, 2, 5], "time"] = None

    # `first` and `last` are checked through every spelling; the expected rows
    # are picked out of the frame by position.
    for how, rows in (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7])):
        expected = frame.loc[rows].set_index("class")
        grouped = frame.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), expected)
        tm.assert_frame_equal(grouped.agg(how), expected)
        tm.assert_frame_equal(grouped.agg({"time": how}), expected)
        tm.assert_series_equal(getattr(grouped.time, how)(), expected["time"])
        tm.assert_series_equal(grouped.time.agg(how), expected["time"])

    # count
    class_index = Index(list("ABCD"), name="class")
    group_sizes = pd.Series([2, 2, 2, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.agg(len), group_sizes)
    tm.assert_series_equal(grouped.time.size(), group_sizes)

    non_missing = pd.Series(
        [0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time"
    )
    tm.assert_series_equal(grouped.time.count(), non_missing)
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column equals ``count()``."""
    # similar to GH12821
    # xref #11444
    first_of_month = [dt.datetime(2015, month + 1, 1) for month in range(12)]
    labels = list("aaabbbbbbccd")
    frame = pd.DataFrame({"X": labels, "Y": first_of_month})

    lengths = frame.groupby("X")["Y"].agg(len)
    counts = frame.groupby("X")["Y"].count()
    tm.assert_series_equal(lengths, counts)
def test_aggregate_float64_no_int64():
    """Group means of integer columns are compared with float expectations."""
    # see gh-11199
    frame = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}
    )
    group_keys = [1, 2, 4, 5]
    group_means = [1, 2.5, 4, 5]

    # one selected column, then two
    for columns in (["a"], ["a", "c"]):
        expected = DataFrame(
            {column: list(group_means) for column in columns},
            index=list(group_keys),
        )
        expected.index.name = "b"
        result = frame.groupby("b")[columns].mean()
        tm.assert_frame_equal(result, expected)
def test_aggregate_api_consistency():
    """Aggregations given as lists and dicts agree with the direct reductions.

    GH 9052: make sure that the aggregates via dict are consistent. Every
    case builds ``expected`` by concatenating the individually computed
    sum/mean Series and compares with ``check_like=True``.
    """
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    # list of function names on a single column
    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of callables on the whole frame
    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of callables on an explicit column selection
    result = grouped[["D", "C"]].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping each column to one function name
    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict mapping each column to a list of function names
    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # This comparison was missing: result/expected were computed and then
    # overwritten by the next case without ever being checked.
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict whose keys are new names rather than columns (deprecated renaming)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
    expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]])
    tm.assert_frame_equal(result, expected, check_like=True)
def test_agg_dict_renaming_deprecation():
    """Renaming via dicts in ``agg`` emits FutureWarning with the expected text."""
    # 15931
    frame = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

    # nested renaming dicts on a grouped frame
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as caught:
        frame.groupby("A").agg(
            {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
        )
    first_message = str(caught[0].message)
    assert "using a dict with renaming" in first_message
    assert "named aggregation" in first_message

    # a dict whose key is not one of the selected columns
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        frame.groupby("A")[["B", "C"]].agg({"ma": "max"})

    # a dict passed to a grouped Series
    with tm.assert_produces_warning(FutureWarning) as caught:
        frame.groupby("A").B.agg({"foo": "count"})
    first_message = str(caught[0].message)
    assert "using a dict on a Series for aggregation" in first_message
    assert "named aggregation instead." in first_message
def test_agg_compat():
    """Deprecated dict renaming on a grouped Series still gives the old layout."""
    # GH 12334
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )
    by_a_b = frame.groupby(["A", "B"])

    # one new name mapped to a list of functions -> two-level columns
    expected = pd.concat([by_a_b["D"].sum(), by_a_b["D"].std()], axis=1)
    expected.columns = MultiIndex.from_tuples([("C", "sum"), ("C", "std")])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = by_a_b["D"].agg({"C": ["sum", "std"]})
    tm.assert_frame_equal(result, expected, check_like=True)

    # two new names mapped to one function each -> flat columns
    expected = pd.concat([by_a_b["D"].sum(), by_a_b["D"].std()], axis=1)
    expected.columns = ["C", "D"]
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = by_a_b["D"].agg({"C": "sum", "D": "std"})
    tm.assert_frame_equal(result, expected, check_like=True)
def test_agg_nested_dicts():
    """Nested renaming dicts either raise or warn, depending on their keys."""
    # API change for disallowing these types of nested dicts
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )
    by_a_b = frame.groupby(["A", "B"])

    # outer keys that are new names are rejected outright
    expected_message = r"cannot perform renaming for r[1-2] with a nested dictionary"
    with pytest.raises(SpecificationError, match=expected_message):
        by_a_b.aggregate(
            {"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}
        )

    # outer keys that are columns only warn
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = by_a_b.agg(
            {"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}
        )
    pieces = [
        by_a_b["C"].mean(),
        by_a_b["C"].std(),
        by_a_b["D"].mean(),
        by_a_b["D"].std(),
    ]
    expected = pd.concat(pieces, axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")]
    )
    tm.assert_frame_equal(result, expected, check_like=True)

    # same name as the original column
    # GH9052
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = by_a_b["D"].agg({"result1": np.sum, "result2": np.mean})
    expected = expected.rename(columns={"result1": "D"})

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = by_a_b["D"].agg({"D": np.sum, "result2": np.mean})
    tm.assert_frame_equal(result, expected, check_like=True)
def test_agg_item_by_item_raise_typeerror():
    """A TypeError raised inside the aggregation function reaches the caller."""
    frame = DataFrame(np.random.randint(10, size=(20, 10)))

    def raiseException(df):
        # Print a separator and the data it received, then fail.
        pprint_thing("----------------------------------------")
        pprint_thing(df.to_string())
        raise TypeError("test")

    by_first_column = frame.groupby(0)
    with pytest.raises(TypeError, match="test"):
        by_first_column.agg(raiseException)
def test_series_agg_multikey():
    """``agg(np.sum)`` equals ``sum()`` for a series grouped by year and month."""
    series = tm.makeTimeSeries()
    by_year_month = series.groupby(
        [lambda stamp: stamp.year, lambda stamp: stamp.month]
    )
    tm.assert_series_equal(by_year_month.agg(np.sum), by_year_month.sum())
def test_series_agg_multi_pure_python():
    """A Python function returning a constant string aggregates every column."""
    a_keys = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    b_keys = [
        "one", "one", "one", "two", "one", "one",
        "one", "two", "two", "two", "one",
    ]
    c_values = [
        "dull", "dull", "shiny", "dull", "dull", "shiny",
        "shiny", "dull", "shiny", "shiny", "shiny",
    ]
    frame = DataFrame(
        {
            "A": a_keys,
            "B": b_keys,
            "C": c_values,
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    def bad(x):
        # Checks that the values handed to the function have a non-empty base.
        assert len(x.values.base) > 0
        return "foo"

    result = frame.groupby(["A", "B"]).agg(bad)
    expected = frame.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
def test_agg_consistency():
    """``agg(P1)`` and ``agg([P1])`` give the same values (GH 6715)."""
    # agg with ([]) and () not consistent

    def P1(a):
        # 1st percentile of the non-missing values; NaN if anything fails
        try:
            return np.percentile(a.dropna(), q=1)
        except Exception:
            return np.nan

    feb_10 = dt.date(2013, 2, 10)
    feb_11 = dt.date(2013, 2, 11)
    frame = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [feb_10, dt.date(2013, 2, 10), feb_11, dt.date(2013, 2, 11)],
        }
    )
    by_date = frame.groupby("date")

    # The list form is flattened to its first column level before comparing.
    from_list = by_date.agg([P1])
    from_list.columns = from_list.columns.levels[0]

    from_callable = by_date.agg(P1)
    tm.assert_frame_equal(from_callable, from_list)
def test_agg_callables():
    """Several equivalent summing callables give the same ``agg`` result."""
    # GH 7929
    frame = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class SumCallable:
        # Instance that is callable and sums its argument.
        def __call__(self, x):
            return sum(x)

    candidates = (
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        SumCallable(),
    )

    expected = frame.groupby("foo").agg(sum)
    for func in candidates:
        tm.assert_frame_equal(frame.groupby("foo").agg(func), expected)
def test_agg_over_numpy_arrays():
    """Summing a column of ndarrays per group adds the arrays element-wise."""
    # GH 3788
    rows = [
        [1, np.array([10, 20, 30])],
        [1, np.array([40, 50, 60])],
        [2, np.array([20, 30, 40])],
    ]
    frame = pd.DataFrame(rows, columns=["category", "arraydata"])
    result = frame.groupby("category").agg(sum)

    expected = pd.DataFrame(
        [[np.array([50, 70, 90])], [np.array([20, 30, 40])]],
        index=pd.Index([1, 2], name="category"),
        columns=["arraydata"],
    )
    tm.assert_frame_equal(result, expected)
def test_agg_timezone_round_trip():
    """Timezone-aware timestamps survive groupby reductions unchanged.

    The first half reduces a single group with ``min`` three different ways;
    the second half checks ``nth``/``head``/``first``/``last`` and ``apply``
    on a two-group frame.
    """
    # GH 15426
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    df = pd.DataFrame(
        {"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]}
    )

    result1 = df.groupby("a")["b"].agg(np.min).iloc[0]
    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
    result3 = df.groupby("a")["b"].min().iloc[0]

    assert result1 == ts
    assert result2 == ts
    assert result3 == ts

    dates = [
        pd.Timestamp("2016-01-0{i:d} 12:00:00".format(i=i), tz="US/Pacific")
        for i in range(1, 5)
    ]
    df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates})
    grouped = df.groupby("A")

    ts = df["B"].iloc[0]
    assert ts == grouped.nth(0)["B"].iloc[0]
    assert ts == grouped.head(1)["B"].iloc[0]
    assert ts == grouped.first()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    # Select "B" by label, like the assertions above: the position of "B"
    # depends on whether the grouping column "A" is part of the frame that
    # ``apply`` returns, so a positional lookup can hit the group key.
    assert ts == grouped.apply(lambda x: x.iloc[0])["B"].iloc[0]

    ts = df["B"].iloc[2]
    assert ts == grouped.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[-1])["B"].iloc[0]
def test_sum_uint64_overflow():
# see gh-14758
# Convert to uint64 and don't overflow
df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
df = df + 9223372036854775807
index = pd.Index(
[9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
)
expected = pd.DataFrame(
{1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
index=index,
)
expected.index.name = 0
result = df.groupby(0).sum()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Aggregating a frame with ``tuple``/``list`` collects each group's values."""
    columns = {
        "A": [1, 1, 1, 3, 3, 3],
        "B": [1, 1, 1, 4, 4, 4],
        "C": [1, 1, 1, 3, 4, 4],
    }
    frame = pd.DataFrame(columns)

    result = frame.groupby(["A", "B"]).aggregate(structure)

    # The parametrized frame is keyed by bare tuples; name its index levels
    # after the grouping columns before comparing.
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    """Aggregating one column with ``tuple``/``list`` collects each group's values."""
    # Issue #18079
    columns = {
        "A": [1, 1, 1, 3, 3, 3],
        "B": [1, 1, 1, 4, 4, 4],
        "C": [1, 1, 1, 3, 4, 4],
    }
    frame = pd.DataFrame(columns)

    result = frame.groupby("A")["C"].aggregate(structure)

    # Name the parametrized index after the grouping column before comparing.
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)
def test_agg_category_nansum(observed):
categories = ["a", "b", "c"]
df = pd.DataFrame(
{"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
)
result = df.groupby("A", observed=observed).B.agg(np.nansum)
expected = pd.Series(
[3, 3, 0],
index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
name="B",
)
if observed:
expected = expected[expected != 0]
tm.assert_series_equal(result, expected)
def test_agg_list_like_func():
    """A dict-specified aggregator returning a list yields one list per group."""
    # GH 18473
    labels = [str(x) for x in range(3)]
    frame = pd.DataFrame({"A": list(labels), "B": list(labels)})

    by_a = frame.groupby("A", as_index=False, sort=False)
    result = by_a.agg({"B": lambda x: list(x)})

    wrapped = [[label] for label in labels]
    expected = pd.DataFrame({"A": list(labels), "B": wrapped})
    tm.assert_frame_equal(result, expected)
def test_agg_lambda_with_timezone():
    """A lambda aggregator on a UTC timestamp column keeps the first value."""
    # GH 23683
    first_day = pd.Timestamp("2018-01-01", tz="UTC")
    second_day = pd.Timestamp("2018-01-02", tz="UTC")
    frame = pd.DataFrame({"tag": [1, 1], "date": [first_day, second_day]})

    result = frame.groupby("tag").agg({"date": lambda e: e.head(1)})

    expected = pd.DataFrame(
        [first_day],
        index=pd.Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)