8th day of python challenges 111-117
This commit is contained in:
@@ -0,0 +1,562 @@
|
||||
"""
|
||||
test .agg behavior / note that .apply is tested generally in test_groupby.py
|
||||
"""
|
||||
from collections import OrderedDict
|
||||
import functools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
|
||||
from pandas.core.base import SpecificationError
|
||||
from pandas.core.groupby.generic import _maybe_mangle_lambdas
|
||||
from pandas.core.groupby.grouper import Grouping
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_agg_regression1(tsframe):
    """agg(np.mean) over a two-level function grouping matches .mean()."""
    by_year_month = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = by_year_month.agg(np.mean)
    expected = by_year_month.mean()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_must_agg(df):
    """SeriesGroupBy.agg must raise when the func does not reduce to a scalar."""
    grouped = df.groupby("A")["C"]

    msg = "Must produce aggregated value"
    # .describe() returns a Series per group -> not an aggregation
    with pytest.raises(Exception, match=msg):
        grouped.agg(lambda x: x.describe())
    # slicing the index returns array-like per group -> not an aggregation
    with pytest.raises(Exception, match=msg):
        grouped.agg(lambda x: x.index[:2])
|
||||
|
||||
|
||||
def test_agg_ser_multi_key(df):
    """Series.groupby with two key Series aggregates like a frame groupby."""
    # TODO(wesm): unused
    ser = df.C  # noqa

    f = lambda x: x.sum()
    results = df.C.groupby([df.A, df.B]).aggregate(f)
    expected = df.groupby(["A", "B"]).sum()["C"]
    tm.assert_series_equal(results, expected)
|
||||
|
||||
|
||||
def test_groupby_aggregation_mixed_dtype():
    """Grouping on mixed (int/str/NaN) key columns still computes group means.

    GH 6212: keys of heterogeneous dtype, with NaNs dropped from the result.
    """
    expected = DataFrame(
        {
            "v1": [5, 5, 7, np.nan, 3, 3, 4, 1],
            "v2": [55, 55, 77, np.nan, 33, 33, 44, 11],
        },
        index=MultiIndex.from_tuples(
            [
                (1, 95),
                (1, 99),
                (2, 95),
                (2, 99),
                ("big", "damp"),
                ("blue", "dry"),
                ("red", "red"),
                ("red", "wet"),
            ],
            names=["by1", "by2"],
        ),
    )

    df = DataFrame(
        {
            "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
            "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
            "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
            "by2": [
                "wet",
                "dry",
                99,
                95,
                np.nan,
                "damp",
                95,
                99,
                "red",
                99,
                np.nan,
                np.nan,
            ],
        }
    )

    grouped = df.groupby(["by1", "by2"])
    result = grouped[["v1", "v2"]].mean()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_apply_corner(ts, tsframe):
    """Aggregating when every group key is NaN yields an empty result."""
    # nothing to group on: every key is NaN
    grouped = ts.groupby(ts * np.nan)
    assert ts.dtype == np.float64

    # grouping by float64 values produces a Float64Index on the empty result
    exp = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64))
    tm.assert_series_equal(grouped.sum(), exp)
    tm.assert_series_equal(grouped.agg(np.sum), exp)
    tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False)

    # same scenario for a DataFrame
    grouped = tsframe.groupby(tsframe["A"] * np.nan)
    exp_df = DataFrame(
        columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64)
    )
    tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
    tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
    tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False)
|
||||
|
||||
|
||||
def test_agg_grouping_is_list_tuple(ts):
    """agg works when the underlying grouper is a plain list or tuple."""
    df = tm.makeTimeDataFrame()

    grouped = df.groupby(lambda x: x.year)
    grouper = grouped.grouper.groupings[0].grouper

    # swap in a list-backed Grouping and re-aggregate
    grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper))
    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    # same again with a tuple-backed Grouping
    grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper))
    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_python_multiindex(mframe):
    """agg(np.mean) over a two-column grouping matches the .mean() shortcut."""
    grouped = mframe.groupby(["A", "B"])

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
)
def test_aggregate_str_func(tsframe, groupbyfunc):
    """String function names in agg dispatch to the matching groupby method."""
    grouped = tsframe.groupby(groupbyfunc)

    # single series, by name
    result = grouped["A"].agg("std")
    expected = grouped["A"].std()
    tm.assert_series_equal(result, expected)

    # whole frame, by name
    result = grouped.aggregate("var")
    expected = grouped.var()
    tm.assert_frame_equal(result, expected)

    # per-column dict of names
    result = grouped.agg(
        OrderedDict([["A", "var"], ["B", "std"], ["C", "mean"], ["D", "sem"]])
    )
    expected = DataFrame(
        OrderedDict(
            [
                ["A", grouped["A"].var()],
                ["B", grouped["B"].std()],
                ["C", grouped["C"].mean()],
                ["D", grouped["D"].sem()],
            ]
        )
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_item_by_item(df):
    """Column-by-column aggregation returns one row per group key."""
    grouped = df.groupby("A")

    aggfun = lambda ser: ser.size
    result = grouped.agg(aggfun)
    foo = (df.A == "foo").sum()
    bar = (df.A == "bar").sum()
    K = len(result.columns)

    # GH5782
    # odd comparisons can result here, so cast to make easy
    exp = pd.Series(
        np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo"
    )
    tm.assert_series_equal(result.xs("foo"), exp)

    exp = pd.Series(
        np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar"
    )
    tm.assert_almost_equal(result.xs("bar"), exp)

    def aggfun(ser):
        return ser.size

    # aggregating an empty frame yields an empty DataFrame, not an error
    result = DataFrame().groupby(df.A).agg(aggfun)
    assert isinstance(result, DataFrame)
    assert len(result) == 0
|
||||
|
||||
|
||||
def test_wrap_agg_out(three_group):
    """Columns whose aggregation raises TypeError are dropped from the output."""
    grouped = three_group.groupby(["A", "B"])

    def func(ser):
        # object dtype (column C) is rejected; numeric columns are summed
        if ser.dtype == np.object:
            raise TypeError
        else:
            return ser.sum()

    result = grouped.aggregate(func)
    exp_grouped = three_group.loc[:, three_group.columns != "C"]
    expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_multiple_functions_maintain_order(df):
    """Output columns keep the order of the (name, func) pairs passed in.

    GH #610.
    """
    funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
    result = df.groupby("A")["C"].agg(funcs)
    exp_cols = Index(["mean", "max", "min"])

    tm.assert_index_equal(result.columns, exp_cols)
|
||||
|
||||
|
||||
def test_multiple_functions_tuples_and_non_tuples(df):
    """Mixing (name, func) tuples with bare names behaves like all tuples.

    GH #1359: a bare "std" is treated as ("std", "std").
    """
    funcs = [("foo", "mean"), "std"]
    ex_funcs = [("foo", "mean"), ("std", "std")]

    result = df.groupby("A")["C"].agg(funcs)
    expected = df.groupby("A")["C"].agg(ex_funcs)
    tm.assert_frame_equal(result, expected)

    result = df.groupby("A").agg(funcs)
    expected = df.groupby("A").agg(ex_funcs)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_more_flexible_frame_multi_function(df):
    """Dict-of-lists agg produces MultiIndex columns matching manual concat."""
    grouped = df.groupby("A")

    exmean = grouped.agg(OrderedDict([["C", np.mean], ["D", np.mean]]))
    exstd = grouped.agg(OrderedDict([["C", np.std], ["D", np.std]]))

    expected = concat([exmean, exstd], keys=["mean", "std"], axis=1)
    expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

    d = OrderedDict([["C", [np.mean, np.std]], ["D", [np.mean, np.std]]])
    result = grouped.aggregate(d)

    tm.assert_frame_equal(result, expected)

    # be careful: scalar func mixed with a list of funcs per column
    result = grouped.aggregate(OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]]))
    expected = grouped.aggregate(
        OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]])
    )
    tm.assert_frame_equal(result, expected)

    def foo(x):
        return np.mean(x)

    def bar(x):
        return np.std(x, ddof=1)

    # nested-dict renaming is deprecated: expect a FutureWarning
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        d = OrderedDict(
            [["C", np.mean], ["D", OrderedDict([["foo", np.mean], ["bar", np.std]])]]
        )
        result = grouped.aggregate(d)

    d = OrderedDict([["C", [np.mean]], ["D", [foo, bar]]])
    expected = grouped.aggregate(d)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_function_flexible_mix(df):
    """Scalar, list, and dict specs for a column all agree (GH #1268)."""
    grouped = df.groupby("A")

    # Expected: nested-dict renaming spec (deprecated -> FutureWarning)
    d = OrderedDict(
        [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", {"sum": "sum"}]]
    )
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = grouped.aggregate(d)

    # Test 1: bare string for column D
    d = OrderedDict(
        [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", "sum"]]
    )
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped.aggregate(d)
    tm.assert_frame_equal(result, expected)

    # Test 2: single-element list for column D
    d = OrderedDict(
        [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", ["sum"]]]
    )
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped.aggregate(d)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_agg_coercing_bools():
    """Boolean results from a lambda agg stay bool, not coerced to int.

    Issue 14873.
    """
    dat = pd.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]})
    gp = dat.groupby("a")

    index = Index([1, 2], name="a")

    result = gp["b"].aggregate(lambda x: (x != 0).all())
    expected = Series([False, True], index=index, name="b")
    tm.assert_series_equal(result, expected)

    result = gp["c"].aggregate(lambda x: x.isnull().all())
    expected = Series([True, False], index=index, name="c")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_order_aggregate_multiple_funcs():
    """Column level order follows the order the functions were given.

    GH 25692.
    """
    df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})

    res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"])
    result = res.columns.levels[1]

    expected = pd.Index(["sum", "max", "mean", "ohlc", "min"])

    tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
def test_uint64_type_handling(dtype, how):
    """Large uint64/int64 values survive cython aggregations intact (GH 26310)."""
    df = pd.DataFrame({"x": 6903052872240755750, "y": [1, 2]})
    expected = df.groupby("y").agg({"x": how})
    df.x = df.x.astype(dtype)
    result = df.groupby("y").agg({"x": how})
    # cast back so the exact integer comparison is dtype-independent
    result.x = result.x.astype(np.int64)
    tm.assert_frame_equal(result, expected, check_exact=True)
|
||||
|
||||
|
||||
class TestNamedAggregationSeries:
    """Keyword-based named aggregation on SeriesGroupBy (gr.agg(out=func))."""

    def test_series_named_agg(self):
        ser = pd.Series([1, 2, 3, 4])
        gr = ser.groupby([0, 0, 1, 1])
        result = gr.agg(a="sum", b="min")
        expected = pd.DataFrame(
            {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=[0, 1]
        )
        tm.assert_frame_equal(result, expected)

        result = gr.agg(b="min", a="sum")
        # kwargs preserve order only on Python 3.6+; sort for 3.5 and earlier
        if compat.PY36:
            expected = expected[["b", "a"]]
        tm.assert_frame_equal(result, expected)

    def test_no_args_raises(self):
        gr = pd.Series([1, 2]).groupby([0, 1])
        with pytest.raises(TypeError, match="Must provide"):
            gr.agg()

        # an empty list of funcs is allowed and yields an empty frame
        result = gr.agg([])
        expected = pd.DataFrame()
        tm.assert_frame_equal(result, expected)

    def test_series_named_agg_duplicates_raises(self):
        # This is a limitation of the named agg implementation reusing
        # aggregate_multiple_funcs. It could maybe be lifted in the future.
        gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
        with pytest.raises(SpecificationError):
            gr.agg(a="sum", b="sum")

    def test_mangled(self):
        gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
        result = gr.agg(a=lambda x: 0, b=lambda x: 1)
        expected = pd.DataFrame({"a": [0, 0], "b": [1, 1]})
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestNamedAggregationDataFrame:
    """Named aggregation on DataFrameGroupBy: agg(out=(column, func))."""

    def test_agg_relabel(self):
        df = pd.DataFrame(
            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
        )
        result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max"))
        expected = pd.DataFrame(
            {"a_max": [1, 3], "b_max": [6, 8]},
            index=pd.Index(["a", "b"], name="group"),
            columns=["a_max", "b_max"],
        )
        tm.assert_frame_equal(result, expected)

        # order invariance: output columns follow keyword order
        p98 = functools.partial(np.percentile, q=98)
        result = df.groupby("group").agg(
            b_min=("B", "min"),
            a_min=("A", min),
            a_mean=("A", np.mean),
            a_max=("A", "max"),
            b_max=("B", "max"),
            a_98=("A", p98),
        )
        expected = pd.DataFrame(
            {
                "b_min": [5, 7],
                "a_min": [0, 2],
                "a_mean": [0.5, 2.5],
                "a_max": [1, 3],
                "b_max": [6, 8],
                "a_98": [0.98, 2.98],
            },
            index=pd.Index(["a", "b"], name="group"),
            columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"],
        )
        if not compat.PY36:
            # pre-3.6 kwargs are unordered; fall back to sorted columns
            expected = expected[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]]
        tm.assert_frame_equal(result, expected)

    def test_agg_relabel_non_identifier(self):
        df = pd.DataFrame(
            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
        )

        # output names that are not valid identifiers go through **kwargs
        result = df.groupby("group").agg(**{"my col": ("A", "max")})
        expected = pd.DataFrame(
            {"my col": [1, 3]}, index=pd.Index(["a", "b"], name="group")
        )
        tm.assert_frame_equal(result, expected)

    def test_duplicate_raises(self):
        # TODO: we currently raise on multiple lambdas. We could *maybe*
        # update com.get_callable_name to append `_i` to each lambda.
        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        with pytest.raises(SpecificationError, match="Function names"):
            df.groupby("A").agg(a=("A", "min"), b=("A", "min"))

    def test_agg_relabel_with_level(self):
        df = pd.DataFrame(
            {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
            index=pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]),
        )
        result = df.groupby(level=0).agg(
            aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
        )
        expected = pd.DataFrame(
            {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_relabel_other_raises(self):
        df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]})
        grouped = df.groupby("A")
        match = "Must provide"
        with pytest.raises(TypeError, match=match):
            grouped.agg(foo=1)

        with pytest.raises(TypeError, match=match):
            grouped.agg()

        with pytest.raises(TypeError, match=match):
            grouped.agg(a=("B", "max"), b=(1, 2, 3))

    def test_missing_raises(self):
        df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        with pytest.raises(KeyError, match="Column 'C' does not exist"):
            df.groupby("A").agg(c=("C", "sum"))

    def test_agg_namedtuple(self):
        df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        result = df.groupby("A").agg(
            b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
        )
        expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
        tm.assert_frame_equal(result, expected)

    def test_mangled(self):
        df = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
        result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
        expected = pd.DataFrame(
            {"b": [0, 0], "c": [1, 1]}, index=pd.Index([0, 1], name="A")
        )
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestLambdaMangling:
    """Multiple lambdas in one agg spec get mangled names <lambda_0>, <lambda_1>."""

    def test_maybe_mangle_lambdas_passthrough(self):
        assert _maybe_mangle_lambdas("mean") == "mean"
        assert _maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>"
        # a single lambda in a list is NOT mangled
        assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>"

    def test_maybe_mangle_lambdas_listlike(self):
        aggfuncs = [lambda x: 1, lambda x: 2]
        result = _maybe_mangle_lambdas(aggfuncs)
        assert result[0].__name__ == "<lambda_0>"
        assert result[1].__name__ == "<lambda_1>"
        # mangling must not change what the functions compute
        assert aggfuncs[0](None) == result[0](None)
        assert aggfuncs[1](None) == result[1](None)

    def test_maybe_mangle_lambdas(self):
        func = {"A": [lambda x: 0, lambda x: 1]}
        result = _maybe_mangle_lambdas(func)
        assert result["A"][0].__name__ == "<lambda_0>"
        assert result["A"][1].__name__ == "<lambda_1>"

    def test_maybe_mangle_lambdas_args(self):
        func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]}
        result = _maybe_mangle_lambdas(func)
        assert result["A"][0].__name__ == "<lambda_0>"
        assert result["A"][1].__name__ == "<lambda_1>"

        # positional and keyword args still pass through unchanged
        assert func["A"][0](0, 1) == (0, 1, 1)
        assert func["A"][0](0, 1, 2) == (0, 1, 2)
        assert func["A"][0](0, 2, b=3) == (0, 2, 3)

    def test_maybe_mangle_lambdas_named(self):
        # no lambdas present -> spec is returned unchanged
        func = OrderedDict(
            [("C", np.mean), ("D", OrderedDict([("foo", np.mean), ("bar", np.mean)]))]
        )
        result = _maybe_mangle_lambdas(func)
        assert result == func

    def test_basic(self):
        df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})

        expected = pd.DataFrame(
            {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
            index=pd.Index([0, 1], name="A"),
        )
        tm.assert_frame_equal(result, expected)

    def test_mangle_series_groupby(self):
        gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
        result = gr.agg([lambda x: 0, lambda x: 1])
        expected = pd.DataFrame({"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
    def test_with_kwargs(self):
        f1 = lambda x, y, b=1: x.sum() + y + b
        f2 = lambda x, y, b=2: x.sum() + y * b
        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
        expected = pd.DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
        tm.assert_frame_equal(result, expected)

        result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
        expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
        tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
test cython .agg behavior
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range
|
||||
from pandas.core.groupby.groupby import DataError
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Cython-backed aggregations agree with a manual per-group computation."""
    data = {
        "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
        "B": ["A", "B"] * 6,
        "C": np.random.randn(12),
    }
    df = DataFrame(data)
    # sprinkle NaNs so the ops have to skip missing values
    df.loc[2:10:2, "C"] = np.nan

    op = lambda x: getattr(x, op_name)()

    # single grouping column
    grouped = df.drop(["B"], axis=1).groupby("A")
    exp = {cat: op(group["C"]) for cat, group in grouped}
    exp = DataFrame({"C": exp})
    exp.index.name = "A"
    result = op(grouped)
    tm.assert_frame_equal(result, exp)

    # two grouping columns
    grouped = df.groupby(["A", "B"])
    expd = {}
    for (cat1, cat2), group in grouped:
        expd.setdefault(cat1, {})[cat2] = op(group["C"])
    exp = DataFrame(expd).T.stack(dropna=False)
    exp.index.names = ["A", "B"]
    exp.name = "C"

    result = op(grouped)["C"]
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_cython_agg_boolean():
    """mean() on a bool column matches agg(np.mean) on the same grouping."""
    frame = DataFrame(
        {
            "a": np.random.randint(0, 5, 50),
            "b": np.random.randint(0, 2, 50).astype("bool"),
        }
    )
    result = frame.groupby("a")["b"].mean()
    expected = frame.groupby("a")["b"].agg(np.mean)

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg():
    """Aggregating a purely non-numeric selection raises DataError."""
    frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
    msg = "No numeric types to aggregate"

    with pytest.raises(DataError, match=msg):
        frame.groupby("a")["b"].mean()

    frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
    with pytest.raises(DataError, match=msg):
        frame[["b"]].groupby(frame["a"]).mean()
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg_with_dates():
    """mean() over a datetime column raises DataError (not numeric)."""
    frame = DataFrame(
        {
            "a": np.random.randint(0, 5, 50),
            "b": ["foo", "bar"] * 25,
            "dates": pd.date_range("now", periods=50, freq="T"),
        }
    )
    msg = "No numeric types to aggregate"
    with pytest.raises(DataError, match=msg):
        frame.groupby("b").dates.mean()
|
||||
|
||||
|
||||
def test_cython_agg_frame_columns():
    """Repeated axis=columns groupby means must not error (#2113).

    Called several times on purpose — the regression appeared on re-use.
    """
    df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})

    for _ in range(4):
        df.groupby(level=0, axis="columns").mean()
|
||||
|
||||
|
||||
def test_cython_agg_return_dict():
    """A lambda agg may return a dict per group (GH 16741)."""
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )

    ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict())
    expected = Series(
        [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(ts, expected)
|
||||
|
||||
|
||||
def test_cython_fail_agg():
    """When the cython path can't handle a dtype (strings), sum() still
    matches agg(np.sum) via the python fallback."""
    dr = bdate_range("1/1/2000", periods=50)
    ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)

    grouped = ts.groupby(lambda x: x.month)
    summed = grouped.sum()
    expected = grouped.agg(np.sum)
    tm.assert_series_equal(summed, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("add", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """_cython_agg_general(op) matches agg() with the equivalent numpy func."""
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    result = df.groupby(labels)._cython_agg_general(op)
    expected = df.groupby(labels).agg(targop)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """Cython aggregation handles categorical bins that contain no rows."""
    df = pd.DataFrame([11, 12, 13])
    grps = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    g = df.groupby(pd.cut(df[0], grps), observed=observed)
    result = g._cython_agg_general(op)

    g = df.groupby(pd.cut(df[0], grps), observed=observed)
    expected = g.agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_empty_buckets_nanops(observed):
    """Hard-coded expectations for add/prod over empty categorical bins.

    GH-18869: nanops can't be called on empty groups, so the expected
    values (identity elements 0 and 1) are written out explicitly.
    """
    df = pd.DataFrame([11, 12, 13], columns=["a"])
    grps = range(0, 25, 5)

    # add / sum: empty bins yield the additive identity 0
    result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
        "add"
    )
    intervals = pd.interval_range(0, 20, freq=5)
    expected = pd.DataFrame(
        {"a": [0, 0, 36, 0]},
        index=pd.CategoricalIndex(intervals, name="a", ordered=True),
    )
    if observed:
        expected = expected[expected.a != 0]

    tm.assert_frame_equal(result, expected)

    # prod: empty bins yield the multiplicative identity 1
    result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
        "prod"
    )
    expected = pd.DataFrame(
        {"a": [1, 1, 1716, 1]},
        index=pd.CategoricalIndex(intervals, name="a", ordered=True),
    )
    if observed:
        expected = expected[expected.a != 1]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Cython aggregations keep NaT intact for datetime/timedelta columns.

    https://github.com/pandas-dev/pandas/issues/19526
    """
    df = DataFrame({"a": [0, 1], "b": [data, NaT]})
    index = Index([0, 1], name="a")

    # each group has a single row, so every op returns its value unchanged
    expected = DataFrame({"b": [data, NaT]}, index=index)

    result = df.groupby("a").aggregate(op)
    tm.assert_frame_equal(expected, result)
|
||||
@@ -0,0 +1,607 @@
|
||||
"""
|
||||
test all other .agg behavior
|
||||
"""
|
||||
|
||||
from collections import OrderedDict
|
||||
import datetime as dt
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
from pandas.core.groupby.groupby import SpecificationError
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
|
||||
def test_agg_api():
    """agg(func) on a mixed frame equals agg([func]) with flattened columns.

    GH 6337 /
    http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
    """
    df = DataFrame(
        {
            "data1": np.random.randn(5),
            "data2": np.random.randn(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = df.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    expected = grouped.agg([peak_to_peak])
    expected.columns = ["data1", "data2"]
    result = grouped.agg(peak_to_peak)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_datetimes_mixed():
    """Grouping on date strings and on date objects give the same group count."""
    data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    # frame with dates kept as strings
    df1 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    # same rows, but dates parsed to datetime.date (None stays None)
    data = [
        [
            row[0],
            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
            row[2],
        ]
        for row in data
    ]

    df2 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate(np.sum)

    df2["weights"] = df1["value"] / df1["value"].sum()
    gb2 = df2.groupby("date").aggregate(np.sum)

    assert len(gb1) == len(gb2)
|
||||
|
||||
|
||||
def test_agg_period_index():
    """Aggregation preserves PeriodIndex; iterating period groups works."""
    prng = period_range("2012-1-1", freq="M", periods=3)
    df = DataFrame(np.random.randn(3, 2), index=prng)
    rs = df.groupby(level=0).sum()
    assert isinstance(rs.index, PeriodIndex)

    # GH 3579: iterating the groups must not raise
    index = period_range(start="1999-01", periods=5, freq="M")
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    series = [("s1", s1), ("s2", s2)]
    df = DataFrame.from_dict(OrderedDict(series))
    grouped = df.groupby(df.index.month)
    list(grouped)
|
||||
|
||||
|
||||
def test_agg_dict_parameter_cast_result_dtypes():
    """first/last/len via agg keep datetime dtype with NaT present (GH 12821)."""
    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="H"),
        }
    )
    # knock out some times so first/last have NaT to skip
    df.loc[[0, 1, 2, 5], "time"] = None

    # `first`: earliest non-null time per class
    exp = df.loc[[0, 3, 4, 6]].set_index("class")
    grouped = df.groupby("class")
    tm.assert_frame_equal(grouped.first(), exp)
    tm.assert_frame_equal(grouped.agg("first"), exp)
    tm.assert_frame_equal(grouped.agg({"time": "first"}), exp)
    tm.assert_series_equal(grouped.time.first(), exp["time"])
    tm.assert_series_equal(grouped.time.agg("first"), exp["time"])

    # `last`: latest non-null time per class
    exp = df.loc[[0, 3, 4, 7]].set_index("class")
    grouped = df.groupby("class")
    tm.assert_frame_equal(grouped.last(), exp)
    tm.assert_frame_equal(grouped.agg("last"), exp)
    tm.assert_frame_equal(grouped.agg({"time": "last"}), exp)
    tm.assert_series_equal(grouped.time.last(), exp["time"])
    tm.assert_series_equal(grouped.time.agg("last"), exp["time"])

    # len/size count all rows, count skips NaT
    exp = pd.Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.agg(len), exp)
    tm.assert_series_equal(grouped.time.size(), exp)

    exp = pd.Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.count(), exp)
|
||||
|
||||
|
||||
def test_agg_cast_results_dtypes():
    """Aggregating datetime values with ``len`` must match ``count`` (GH 12821, xref #11444)."""
    # one timestamp per month of 2015, keyed by a run of repeated letters
    dates = [dt.datetime(2015, month, 1) for month in range(1, 13)]
    keys = list("aaabbbbbbccd")
    frame = pd.DataFrame({"X": keys, "Y": dates})

    grouped = frame.groupby("X")["Y"]
    tm.assert_series_equal(grouped.agg(len), grouped.count())
|
||||
|
||||
|
||||
def test_aggregate_float64_no_int64():
    """Grouped means of int columns must come back as float64 (see gh-11199)."""
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})

    # single selected column
    expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"
    tm.assert_frame_equal(df.groupby("b")[["a"]].mean(), expected)

    # two selected columns
    expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"
    tm.assert_frame_equal(df.groupby("b")[["a", "c"]].mean(), expected)
|
||||
|
||||
|
||||
def test_aggregate_api_consistency():
    """Scalar, list, and dict forms of .agg must produce consistent results (GH 9052)."""
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    # reference results computed with the plain reduction methods
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    # list of names on a single selected column -> flat columns
    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of callables on the whole frame -> (column, func) MultiIndex
    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # column selection order must be preserved in the result
    result = grouped[["D", "C"]].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict of column -> scalar func
    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict of column -> list of funcs
    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])

    # renaming via a dict of new-name -> func is deprecated (FutureWarning)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
    expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]])
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
def test_agg_dict_renaming_deprecation():
    """Dict-of-dicts renaming in .agg must emit a FutureWarning (GH 15931)."""
    # 15931
    df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

    # nested dict form on a DataFrame groupby
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w:
        df.groupby("A").agg(
            {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
        )
        assert "using a dict with renaming" in str(w[0].message)
        assert "named aggregation" in str(w[0].message)

    # dict with a key that is not a selected column also warns
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})

    # dict form on a SeriesGroupBy warns with its own message
    with tm.assert_produces_warning(FutureWarning) as w:
        df.groupby("A").B.agg({"foo": "count"})
        assert "using a dict on a Series for aggregation" in str(w[0].message)
        assert "named aggregation instead." in str(w[0].message)
|
||||
|
||||
|
||||
def test_agg_compat():
    """Renaming dicts on a SeriesGroupBy warn but still aggregate (GH 12334)."""
    # GH 12334
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    # new-name -> list of funcs produces a (name, func) MultiIndex
    expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1)
    expected.columns = MultiIndex.from_tuples([("C", "sum"), ("C", "std")])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g["D"].agg({"C": ["sum", "std"]})
    tm.assert_frame_equal(result, expected, check_like=True)

    # new-name -> scalar func produces flat columns
    expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1)
    expected.columns = ["C", "D"]

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g["D"].agg({"C": "sum", "D": "std"})
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
def test_agg_nested_dicts():
    """Nested rename-dicts are disallowed on frames but warn on column dicts."""
    # API change for disallowing these types of nested dicts
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    # top-level rename keys wrapping column dicts raise outright
    msg = r"cannot perform renaming for r[1-2] with a nested dictionary"
    with pytest.raises(SpecificationError, match=msg):
        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})

    # column -> rename-dict still works, but with a deprecation warning
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})
    expected = pd.concat(
        [g["C"].mean(), g["C"].std(), g["D"].mean(), g["D"].std()], axis=1
    )
    expected.columns = pd.MultiIndex.from_tuples(
        [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")]
    )
    tm.assert_frame_equal(result, expected, check_like=True)

    # same name as the original column
    # GH9052
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = g["D"].agg({"result1": np.sum, "result2": np.mean})
    expected = expected.rename(columns={"result1": "D"})

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = g["D"].agg({"D": np.sum, "result2": np.mean})
    tm.assert_frame_equal(result, expected, check_like=True)
|
||||
|
||||
|
||||
def test_agg_item_by_item_raise_typeerror():
    """A TypeError raised inside the per-column agg path must propagate unchanged."""
    df = DataFrame(np.random.randint(10, size=(20, 10)))

    def raiseException(df):
        # dump the offending chunk before failing, to aid debugging
        pprint_thing("----------------------------------------")
        pprint_thing(df.to_string())
        raise TypeError("test")

    with pytest.raises(TypeError, match="test"):
        df.groupby(0).agg(raiseException)
|
||||
|
||||
|
||||
def test_series_agg_multikey():
    """Series .agg(np.sum) under a two-key groupby must equal .sum()."""
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

    result = grouped.agg(np.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_series_agg_multi_pure_python():
    """A pure-python aggregator returning a scalar works under a two-key groupby."""
    # fmt: kept as flat literals; values are what matter, not layout
    data = DataFrame(
        {
            "A": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
            "B": ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
            "C": ["dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny"],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    def bad(x):
        # each group should be a view into a parent ndarray, not a copy
        # NOTE(review): relies on numpy's .base semantics — confirm on upgrade
        assert len(x.values.base) > 0
        return "foo"

    result = data.groupby(["A", "B"]).agg(bad)
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_consistency():
    """agg(f) and agg([f]) must agree apart from the extra column level (GH 6715)."""
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        # 1st percentile; swallow failures (e.g. empty after dropna) as NaN
        try:
            return np.percentile(a.dropna(), q=1)
        except Exception:
            return np.nan

    df = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 11),
                dt.date(2013, 2, 11),
            ],
        }
    )

    g = df.groupby("date")

    # list form adds a function level; collapse it for comparison
    expected = g.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = g.agg(P1)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_callables():
    """All callable flavors (builtin, ufunc, lambda, partial, callable object) agg alike (GH 7929)."""
    # GH 7929
    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class fn_class:
        # callable object wrapping plain sum
        def __call__(self, x):
            return sum(x)

    equiv_callables = [
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        fn_class(),
    ]

    expected = df.groupby("foo").agg(sum)
    for ecall in equiv_callables:
        result = df.groupby("foo").agg(ecall)
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_over_numpy_arrays():
    """Aggregating a column of ndarray objects must sum element-wise (GH 3788)."""
    # GH 3788
    df = pd.DataFrame(
        [
            [1, np.array([10, 20, 30])],
            [1, np.array([40, 50, 60])],
            [2, np.array([20, 30, 40])],
        ],
        columns=["category", "arraydata"],
    )
    result = df.groupby("category").agg(sum)

    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
    expected_index = pd.Index([1, 2], name="category")
    expected_column = ["arraydata"]
    expected = pd.DataFrame(
        expected_data, index=expected_index, columns=expected_column
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_timezone_round_trip():
    """min/first/last/nth/apply must round-trip tz-aware Timestamps intact (GH 15426)."""
    # GH 15426
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    df = pd.DataFrame(
        {"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]}
    )

    # three equivalent ways of taking the group minimum
    result1 = df.groupby("a")["b"].agg(np.min).iloc[0]
    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
    result3 = df.groupby("a")["b"].min().iloc[0]

    assert result1 == ts
    assert result2 == ts
    assert result3 == ts

    dates = [
        pd.Timestamp("2016-01-0{i:d} 12:00:00".format(i=i), tz="US/Pacific")
        for i in range(1, 5)
    ]
    df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates})
    grouped = df.groupby("A")

    ts = df["B"].iloc[0]
    assert ts == grouped.nth(0)["B"].iloc[0]
    assert ts == grouped.head(1)["B"].iloc[0]
    assert ts == grouped.first()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0]

    ts = df["B"].iloc[2]
    assert ts == grouped.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0]
|
||||
|
||||
|
||||
def test_sum_uint64_overflow():
    """Group sums past int64 max must be carried out in uint64, not overflow (gh-14758)."""
    # see gh-14758
    # Convert to uint64 and don't overflow
    df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    # shift every value just past the int64 maximum
    df = df + 9223372036854775807

    index = pd.Index(
        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
    )
    expected = pd.DataFrame(
        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
        index=index,
    )

    expected.index.name = 0
    result = df.groupby(0).sum()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Frame aggregation into tuples/lists keeps values container-wrapped per group."""
    df = pd.DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby(["A", "B"]).aggregate(structure)
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    """Series aggregation into tuples/lists keeps per-group containers (issue #18079)."""
    # Issue #18079
    frame = pd.DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )
    expected.index.name = "A"

    observed = frame.groupby("A")["C"].aggregate(structure)
    tm.assert_series_equal(observed, expected)
|
||||
|
||||
|
||||
def test_agg_category_nansum(observed):
    """np.nansum over a categorical grouper honors the observed flag."""
    categories = ["a", "b", "c"]
    df = pd.DataFrame(
        {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
    )
    result = df.groupby("A", observed=observed).B.agg(np.nansum)
    # unobserved category "c" sums to 0 when observed=False
    expected = pd.Series(
        [3, 3, 0],
        index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
        name="B",
    )
    if observed:
        # observed=True drops the never-seen category entirely
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_list_like_func():
    """agg with a list-returning callable leaves values list-wrapped (GH 18473)."""
    keys = [str(x) for x in range(3)]
    df = pd.DataFrame({"A": keys, "B": [str(x) for x in range(3)]})

    grouped = df.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})

    # every group holds exactly one value, so each cell is a singleton list
    expected = pd.DataFrame({"A": keys, "B": [[k] for k in keys]})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_lambda_with_timezone():
    """A lambda aggregation must preserve the tz-aware dtype (GH 23683)."""
    stamps = [
        pd.Timestamp("2018-01-01", tz="UTC"),
        pd.Timestamp("2018-01-02", tz="UTC"),
    ]
    df = pd.DataFrame({"tag": [1, 1], "date": stamps})

    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})

    expected = pd.DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=pd.Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,104 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, MultiIndex
|
||||
from pandas.util import testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def mframe():
    """Random 10x3 frame over a two-level ('first', 'second') MultiIndex."""
    index = MultiIndex(
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
|
||||
|
||||
|
||||
@pytest.fixture
def df():
    """Canonical groupby frame: two string key columns plus two float columns."""
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def ts():
    """Random business-daily time series."""
    return tm.makeTimeSeries()
|
||||
|
||||
|
||||
@pytest.fixture
def tsd():
    """Dict of named random time series (columns A-D)."""
    return tm.getTimeSeriesData()
|
||||
|
||||
|
||||
@pytest.fixture
def tsframe(tsd):
    """DataFrame built from the tsd fixture's time-series dict."""
    return DataFrame(tsd)
|
||||
|
||||
|
||||
@pytest.fixture
def df_mixed_floats():
    """Like the df fixture but with column D downcast to float32."""
    return DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.array(np.random.randn(8), dtype="float32"),
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture
def three_group():
    """Frame with three string key columns (A, B, C) and three float columns."""
    # fmt: literals flattened; values identical to the original column lists
    return DataFrame(
        {
            "A": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
            "B": ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
            "C": ["dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny"],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )
|
||||
@@ -0,0 +1,659 @@
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series, bdate_range
|
||||
from pandas.util import testing as tm
|
||||
|
||||
|
||||
def test_apply_issues():
    """apply must return per-group idxmax correctly and not auto-coerce date strings."""
    # GH 5788

    s = """2011.05.16,00:00,1.40893
2011.05.16,01:00,1.40760
2011.05.16,02:00,1.40750
2011.05.16,03:00,1.40649
2011.05.17,02:00,1.40893
2011.05.17,03:00,1.40760
2011.05.17,04:00,1.40750
2011.05.17,05:00,1.40649
2011.05.18,02:00,1.40893
2011.05.18,03:00,1.40760
2011.05.18,04:00,1.40750
2011.05.18,05:00,1.40649"""

    df = pd.read_csv(
        StringIO(s),
        header=None,
        names=["date", "time", "value"],
        parse_dates=[["date", "time"]],
    )
    df = df.set_index("date_time")

    expected = df.groupby(df.index.date).idxmax()
    result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
    tm.assert_frame_equal(result, expected)

    # GH 5789
    # don't auto coerce dates
    df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"])
    exp_idx = pd.Index(
        ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date"
    )
    expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
    result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()])
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_apply_trivial():
    """apply of a constant frame over an axis=1 dtype groupby concatenates per group."""
    # GH 20066
    # trivial apply: ignore input and return a constant dataframe.
    df = pd.DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    # one copy of the constant frame per dtype group
    expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"])
    result = df.groupby([str(x) for x in df.dtypes], axis=1).apply(
        lambda x: df.iloc[1:]
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
    reason="GH#20066; function passed into apply "
    "returns a DataFrame with the same index "
    "as the one to create GroupBy object."
)
def test_apply_trivial_fail():
    """Known failure: constant frame sharing the groupby's index breaks apply."""
    # GH 20066
    # trivial apply fails if the constant dataframe has the same index
    # with the one used to create GroupBy object.
    df = pd.DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    expected = pd.concat([df, df], axis=1, keys=["float64", "object"])
    result = df.groupby([str(x) for x in df.dtypes], axis=1).apply(lambda x: df)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_fast_apply():
    """The Cython fast-apply path must run without error and report no mutation."""
    # make sure that fast apply is correctly called
    # rather than raising any kind of error
    # otherwise the python path will be called
    # which slows things down
    N = 1000
    labels = np.random.randint(0, 2000, size=N)
    labels2 = np.random.randint(0, 3, size=N)
    df = DataFrame(
        {
            "key": labels,
            "key2": labels2,
            "value1": np.random.randn(N),
            "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
        }
    )

    def f(g):
        # constant result: safe for the fast path
        return 1

    g = df.groupby(["key", "key2"])

    # reach into the internal grouper to drive fast_apply directly
    grouper = g.grouper

    splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
    group_keys = grouper._get_group_keys()

    values, mutated = splitter.fast_apply(f, group_keys)

    assert not mutated
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "df, group_names",
    [
        (DataFrame({"a": [1, 1, 1, 2, 3], "b": ["a", "a", "a", "b", "c"]}), [1, 2, 3]),
        (DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}), [0, 1]),
        (DataFrame({"a": [1]}), [1]),
        (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], "b": range(8)}), [1, 2]),
        (DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), [1, 2, 3]),
        (
            DataFrame(
                {
                    "a": list("aaabbbcccc"),
                    "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4],
                    "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8],
                }
            ),
            ["a", "b", "c"],
        ),
        (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), [1, 2]),
    ],
    ids=[
        "GH2936",
        "GH7739 & GH10519",
        "GH10519",
        "GH2656",
        "GH12155",
        "GH20084",
        "GH21417",
    ],
)
def test_group_apply_once_per_group(df, group_names):
    """Each group function must be evaluated exactly once per group."""
    # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417

    # This test should ensure that a function is only evaluated
    # once per group. Previously the function has been evaluated twice
    # on the first group to check if the Cython index slider is safe to use
    # This test ensures that the side effect (append to list) is only triggered
    # once per group

    names = []
    # cannot parameterize over the functions since they need external
    # `names` to detect side effects

    def f_copy(group):
        # this takes the fast apply path
        names.append(group.name)
        return group.copy()

    def f_nocopy(group):
        # this takes the slow apply path
        names.append(group.name)
        return group

    def f_scalar(group):
        # GH7739, GH2656
        names.append(group.name)
        return 0

    def f_none(group):
        # GH10519, GH12155, GH21417
        names.append(group.name)
        return None

    def f_constant_df(group):
        # GH2936, GH20084
        names.append(group.name)
        return DataFrame({"a": [1], "b": [1]})

    for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]:
        del names[:]

        df.groupby("a").apply(func)
        assert names == group_names
|
||||
|
||||
|
||||
def test_apply_with_mixed_dtype():
    """apply on mixed-dtype frames must not mangle dtypes (GH 3480, GH 3610)."""
    # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
    df = DataFrame(
        {
            "foo1": np.random.randn(6),
            "foo2": ["one", "two", "two", "three", "one", "two"],
        }
    )
    result = df.apply(lambda x: x, axis=1).dtypes
    expected = df.dtypes
    tm.assert_series_equal(result, expected)

    # GH 3610 incorrect dtype conversion with as_index=False
    df = DataFrame({"c1": [1, 2, 6, 6, 8]})
    df["c2"] = df.c1 / 2.0
    result1 = df.groupby("c2").mean().reset_index().c2
    result2 = df.groupby("c2", as_index=False).mean().c2
    tm.assert_series_equal(result1, result2)
|
||||
|
||||
|
||||
def test_groupby_as_index_apply(df):
    """head/apply index shapes under as_index=True vs False (GH #4648, #3417)."""
    # GH #4648 and #3417
    # NOTE: the incoming `df` fixture is intentionally shadowed here
    df = DataFrame(
        {
            "item_id": ["b", "b", "a", "c", "a", "b"],
            "user_id": [1, 2, 1, 1, 3, 1],
            "time": range(6),
        }
    )

    g_as = df.groupby("user_id", as_index=True)
    g_not_as = df.groupby("user_id", as_index=False)

    # head() keeps the original index in both modes
    res_as = g_as.head(2).index
    res_not_as = g_not_as.head(2).index
    exp = Index([0, 1, 2, 4])
    tm.assert_index_equal(res_as, exp)
    tm.assert_index_equal(res_not_as, exp)

    res_as_apply = g_as.apply(lambda x: x.head(2)).index
    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index

    # apply doesn't maintain the original ordering
    # changed in GH5610 as the as_index=False returns a MI here
    exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)])
    tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
    exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None])

    tm.assert_index_equal(res_as_apply, exp_as_apply)
    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)

    # identity apply with as_index=False keeps the original flat index
    ind = Index(list("abcde"))
    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
    res = df.groupby(0, as_index=False).apply(lambda x: x).index
    tm.assert_index_equal(res, ind)
|
||||
|
||||
|
||||
def test_apply_concat_preserve_names(three_group):
    """Index names survive apply-concat when consistent, drop to None when mixed."""
    grouped = three_group.groupby(["A", "B"])

    def desc(group):
        # consistent index name across groups
        result = group.describe()
        result.index.name = "stat"
        return result

    def desc2(group):
        result = group.describe()
        result.index.name = "stat"
        result = result[: len(group)]
        # weirdo
        return result

    def desc3(group):
        result = group.describe()

        # names are different
        result.index.name = "stat_{:d}".format(len(group))

        result = result[: len(group)]
        # weirdo
        return result

    result = grouped.apply(desc)
    assert result.index.names == ("A", "B", "stat")

    result2 = grouped.apply(desc2)
    assert result2.index.names == ("A", "B", "stat")

    # conflicting names collapse to None at the innermost level
    result3 = grouped.apply(desc3)
    assert result3.index.names == ("A", "B", None)
|
||||
|
||||
|
||||
def test_apply_series_to_frame():
    """A Series-group function returning a DataFrame yields a concatenated DataFrame."""
    def f(piece):
        # suppress log-of-negative warnings; NaN results are acceptable here
        with np.errstate(invalid="ignore"):
            logged = np.log(piece)
        return DataFrame(
            {"value": piece, "demeaned": piece - piece.mean(), "logged": logged}
        )

    dr = bdate_range("1/1/2000", periods=100)
    ts = Series(np.random.randn(100), index=dr)

    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(f)

    assert isinstance(result, DataFrame)
    tm.assert_index_equal(result.index, ts.index)
|
||||
|
||||
|
||||
def test_apply_series_yield_constant(df):
    """Scalar-returning apply on a selected column keeps the group-key index names."""
    result = df.groupby(["A", "B"])["C"].apply(len)
    assert result.index.names[:2] == ("A", "B")
|
||||
|
||||
|
||||
def test_apply_frame_yield_constant(df):
    """Scalar-returning frame apply yields an unnamed Series (GH 13568)."""
    # GH13568
    result = df.groupby(["A", "B"]).apply(len)
    assert isinstance(result, Series)
    assert result.name is None

    result = df.groupby(["A", "B"])[["C", "D"]].apply(len)
    assert isinstance(result, Series)
    assert result.name is None
|
||||
|
||||
|
||||
def test_apply_frame_to_series(df):
    """apply(len) agrees with count() in both index and values."""
    grouped = df.groupby(["A", "B"])
    result = grouped.apply(len)
    expected = grouped.count()["C"]
    tm.assert_index_equal(result.index, expected.index)
    tm.assert_numpy_array_equal(result.values, expected.values)
|
||||
|
||||
|
||||
def test_apply_frame_concat_series():
    """Frame apply returning per-group Series concatenates like the Series path."""
    def trans(group):
        # top-2 smallest B-wise sums within the group
        return group.groupby("B")["C"].sum().sort_values()[:2]

    def trans2(group):
        # same computation driven from the selected-column side
        grouped = group.groupby(df.reindex(group.index)["B"])
        return grouped.sum().sort_values()[:2]

    df = DataFrame(
        {
            "A": np.random.randint(0, 5, 1000),
            "B": np.random.randint(0, 5, 1000),
            "C": np.random.randn(1000),
        }
    )

    result = df.groupby("A").apply(trans)
    exp = df.groupby("A")["C"].apply(trans2)
    tm.assert_series_equal(result, exp, check_names=False)
    assert result.name == "C"
|
||||
|
||||
|
||||
def test_apply_transform(ts):
    """A shape-preserving apply equals the corresponding transform."""
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(lambda x: x * 2)
    expected = grouped.transform(lambda x: x * 2)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_apply_multikey_corner(tsframe):
    """Two-key apply results must be retrievable per group via .loc[key]."""
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])

    def f(group):
        # last five rows after sorting by column A
        return group.sort_values("A")[-5:]

    result = grouped.apply(f)
    for key, group in grouped:
        tm.assert_frame_equal(result.loc[key], f(group))
|
||||
|
||||
|
||||
def test_apply_chunk_view():
    """Slicing group chunks inside apply must not corrupt the result frame."""
    # Low level tinkering could be unsafe, make sure not
    frame = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

    # first two rows of every key; group_keys=False keeps the flat index
    expected = frame.take([0, 1, 3, 4, 6, 7])
    result = frame.groupby("key", group_keys=False).apply(lambda x: x[:2])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_apply_no_name_column_conflict():
    """A column literally named 'name' must not clash with group.name (#2605)."""
    df = DataFrame(
        {
            "name": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
            "name2": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
            "value": range(9, -1, -1),
        }
    )

    # it works! #2605
    grouped = df.groupby(["name", "name2"])
    grouped.apply(lambda x: x.sort_values("value", inplace=True))
|
||||
|
||||
|
||||
def test_apply_typecast_fail():
    """Adding a derived column inside apply must not trigger bad dtype casts."""
    df = DataFrame(
        {
            "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
            "c": np.tile(["a", "b", "c"], 2),
            "v": np.arange(1.0, 7.0),
        }
    )

    def f(group):
        # min-max normalize v within the group
        v = group["v"]
        group["v2"] = (v - v.min()) / (v.max() - v.min())
        return group

    result = df.groupby("d").apply(f)

    expected = df.copy()
    expected["v2"] = np.tile([0.0, 0.5, 1], 2)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_apply_multiindex_fail():
    """Same normalization as test_apply_typecast_fail, but over a MultiIndex."""
    index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
    df = DataFrame(
        {
            "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
            "c": np.tile(["a", "b", "c"], 2),
            "v": np.arange(1.0, 7.0),
        },
        index=index,
    )

    def f(group):
        # min-max normalize v within the group
        v = group["v"]
        group["v2"] = (v - v.min()) / (v.max() - v.min())
        return group

    result = df.groupby("d").apply(f)

    expected = df.copy()
    expected["v2"] = np.tile([0.0, 0.5, 1], 2)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_apply_corner(tsframe):
    """A shape-preserving apply over year groups reproduces the scaled frame."""
    result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
    expected = tsframe * 2
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_apply_without_copy():
    """apply must tolerate functions that return the group itself, not a copy (GH 5545)."""
    # GH 5545
    # returning a non-copy in an applied function fails

    data = DataFrame(
        {
            "id_field": [100, 100, 200, 300],
            "category": ["a", "b", "c", "c"],
            "value": [1, 2, 3, 4],
        }
    )

    def filt1(x):
        # returns a copy for singleton groups
        if x.shape[0] == 1:
            return x.copy()
        else:
            return x[x.category == "c"]

    def filt2(x):
        # returns the group itself (no copy) for singleton groups
        if x.shape[0] == 1:
            return x
        else:
            return x[x.category == "c"]

    expected = data.groupby("id_field").apply(filt1)
    result = data.groupby("id_field").apply(filt2)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_apply_corner_cases():
    # GH #535: can't use sliding iterator
    n = 1000
    keys = np.random.randint(0, 100, size=n)
    df = DataFrame(
        {
            "key": keys,
            "value1": np.random.randn(n),
            "value2": ["foo", "bar", "baz", "qux"] * (n // 4),
        }
    )

    def add_derived(g):
        # attach a derived column inside the applied function
        g["value3"] = g["value1"] * 2
        return g

    result = df.groupby("key").apply(add_derived)
    assert "value3" in result
def test_apply_numeric_coercion_when_datetime():
    """Group-by/apply must not over-eagerly coerce dtypes to numeric when
    datetime columns are present (GH 15670, GH 15421, GH 14423)."""
    # GH 15670: 'Str' column must survive unchanged after Date conversion
    df = pd.DataFrame(
        {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]}
    )
    expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
    df.Date = pd.to_datetime(df.Date)
    result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
    tm.assert_series_equal(result["Str"], expected["Str"])

    # GH 15421: selecting a single string column in the presence of a
    # Timestamp column
    df = pd.DataFrame(
        {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
    )

    def first_b(g):
        return g.iloc[0][["B"]]

    result = df.groupby("A").apply(first_b)["B"]
    expected = df.B
    expected.index = df.A
    tm.assert_series_equal(result, expected)

    # GH 14423: same applied function on a frame with and without the
    # datetime conversion must agree
    def predictions(tool):
        out = pd.Series(index=["p1", "p2", "useTime"], dtype=object)
        if "step1" in list(tool.State):
            out["p1"] = str(tool[tool.State == "step1"].Machine.values[0])
        if "step2" in list(tool.State):
            out["p2"] = str(tool[tool.State == "step2"].Machine.values[0])
            out["useTime"] = str(tool[tool.State == "step2"].oTime.values[0])
        return out

    df1 = pd.DataFrame(
        {
            "Key": ["B", "B", "A", "A"],
            "State": ["step1", "step2", "step1", "step2"],
            "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"],
            "Machine": ["23", "36L", "36R", "36R"],
        }
    )
    df2 = df1.copy()
    df2.oTime = pd.to_datetime(df2.oTime)
    expected = df1.groupby("Key").apply(predictions).p1
    result = df2.groupby("Key").apply(predictions).p1
    tm.assert_series_equal(expected, result)
def test_time_field_bug():
    # GH 11324: non-key time-based columns that are *not* returned by the
    # applied function used to raise an exception.
    df = pd.DataFrame({"a": 1, "b": [datetime.now() for _ in range(10)]})

    def no_date(batch):
        return pd.Series({"c": 2})

    def with_date(batch):
        return pd.Series({"b": datetime(2015, 1, 1), "c": 2})

    grouped = df.groupby(by=["a"])

    result_no = grouped.apply(no_date)
    expected_no = pd.DataFrame({"c": 2}, index=[1])
    expected_no.index.name = "a"

    result_conv = grouped.apply(with_date)
    expected_conv = pd.DataFrame({"b": datetime(2015, 1, 1), "c": 2}, index=[1])
    expected_conv.index.name = "a"

    tm.assert_frame_equal(result_no, expected_no)
    tm.assert_frame_equal(result_conv, expected_conv)
def test_gb_apply_list_of_unequal_len_arrays():
    # GH1738: the kernel function returns ndarrays of unequal length;
    # pandas sniffs the first result, sees an array (not a list), assumes
    # the rest are of equal length and tries a vstack -- must not die.
    df = DataFrame(
        {
            "group1": list("aaabbb") * 2,
            "group2": list("ccddde") * 2,
            "weight": [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
            "value": [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3],
        }
    ).set_index(["group1", "group2"])
    grouped = df.groupby(level=["group1", "group2"], sort=True)

    def noddy(value, weight):
        # produces an array of length 3 * len(group), so unequal across groups
        return np.array(value * weight).repeat(3)

    # just don't die
    grouped.apply(lambda x: noddy(x.value, x.weight))
def test_groupby_apply_all_none():
    # GH 9684: no error when the applied function returns None for every group
    test_df = DataFrame({"groups": [0, 0, 1, 1], "random_vars": [8, 7, 4, 5]})

    def always_none(x):
        return None

    result = test_df.groupby("groups").apply(always_none)
    tm.assert_frame_equal(result, DataFrame())
def test_groupby_apply_none_first():
    # GH 12824: apply may return None for the first group encountered
    tdf1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
    tdf2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})

    def first_and_last(x):
        # drop groups that are too small, otherwise keep the two end rows
        if x.shape[0] < 2:
            return None
        return x.iloc[[0, -1]]

    result1 = tdf1.groupby("groups").apply(first_and_last)
    result2 = tdf2.groupby("groups").apply(first_and_last)
    index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
    index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
    expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
    expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
    tm.assert_frame_equal(result1, expected1)
    tm.assert_frame_equal(result2, expected2)
def test_groupby_apply_return_empty_chunk():
    # GH 22221: apply with a filter that leaves some groups empty
    df = pd.DataFrame(dict(value=[0, 1], group=["filled", "empty"]))
    result = df.groupby("group").apply(lambda g: g[g.value != 1]["value"])
    expected_index = MultiIndex.from_product(
        [["empty", "filled"], [0]], names=["group", None]
    ).drop("empty")
    expected = pd.Series([0], name="value", index=expected_index)
    tm.assert_series_equal(result, expected)
def test_apply_with_mixed_types():
    # gh-20949: transform/apply on frames mixing object and numeric columns
    df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
    g = df.groupby("A")

    expected = pd.DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})
    tm.assert_frame_equal(g.transform(lambda x: x / x.sum()), expected)
    tm.assert_frame_equal(g.apply(lambda x: x / x.sum()), expected)
@@ -0,0 +1,152 @@
|
||||
import numpy as np
|
||||
from numpy import nan
|
||||
import pytest
|
||||
|
||||
from pandas._libs import groupby, lib, reduction
|
||||
|
||||
from pandas.core.dtypes.common import ensure_int64
|
||||
|
||||
from pandas import Index, isna
|
||||
from pandas.core.groupby.ops import generate_bins_generic
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_almost_equal
|
||||
|
||||
|
||||
def test_series_grouper():
    """SeriesGrouper aggregates labelled slices; -1 labels are excluded."""
    from pandas import Series

    obj = Series(np.random.randn(10))
    dummy = obj[:0]

    # the leading -1 labels do not belong to any group
    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

    grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy)
    result, counts = grouper.get_result()

    expected = np.array([obj[3:6].mean(), obj[6:].mean()])
    assert_almost_equal(result, expected)
    assert_almost_equal(counts, np.array([3, 4], dtype=np.int64))
def test_series_bin_grouper():
    """SeriesBinGrouper aggregates contiguous bins delimited by edges."""
    from pandas import Series

    obj = Series(np.random.randn(10))
    dummy = obj[:0]

    bins = np.array([3, 6])

    grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy)
    result, counts = grouper.get_result()

    expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()])
    assert_almost_equal(result, expected)
    assert_almost_equal(counts, np.array([3, 3, 4], dtype=np.int64))
class TestBinGroupers:
    def setup_method(self, method):
        self.obj = np.random.randn(10, 1)
        self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64)
        self.bins = np.array([3, 6], dtype=np.int64)

    def test_generate_bins(self):
        values = np.arange(1, 7, dtype=np.int64)
        binner = np.array([0, 3, 6, 9], dtype=np.int64)

        # both the dt64 and the generic implementation must agree
        for func in [lib.generate_bins_dt64, generate_bins_generic]:
            assert (func(values, binner, closed="left") == np.array([2, 5, 6])).all()
            assert (func(values, binner, closed="right") == np.array([3, 6, 6])).all()

        for func in [lib.generate_bins_dt64, generate_bins_generic]:
            values = np.arange(1, 7, dtype=np.int64)
            binner = np.array([0, 3, 6], dtype=np.int64)
            assert (func(values, binner, closed="right") == np.array([3, 6])).all()

        msg = "Invalid length for values or for binner"
        with pytest.raises(ValueError, match=msg):
            generate_bins_generic(values, [], "right")
        with pytest.raises(ValueError, match=msg):
            generate_bins_generic(values[:0], binner, "right")

        with pytest.raises(ValueError, match="Values falls before first bin"):
            generate_bins_generic(values, [4], "right")
        with pytest.raises(ValueError, match="Values falls after last bin"):
            generate_bins_generic(values, [-3, -1], "right")
def test_group_ohlc():
    """group_ohlc_* kernels match a naive open/high/low/close computation."""

    def _check(dtype):
        obj = np.array(np.random.randn(20), dtype=dtype)

        bins = np.array([6, 12, 20])
        out = np.zeros((3, 4), dtype)
        counts = np.zeros(len(out), dtype=np.int64)
        labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))

        func = getattr(groupby, "group_ohlc_{dtype}".format(dtype=dtype))
        func(out, counts, obj[:, None], labels)

        def _ohlc(group):
            # reference implementation: open, high, low, close
            if isna(group).all():
                return np.repeat(nan, 4)
            return [group[0], group.max(), group.min(), group[-1]]

        expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])

        assert_almost_equal(out, expected)
        tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))

        # an all-NaN group produces an all-NaN OHLC row
        obj[:6] = nan
        func(out, counts, obj[:, None], labels)
        expected[0] = nan
        assert_almost_equal(out, expected)

    for dtype in ("float32", "float64"):
        _check(dtype)
class TestMoments:
    """Placeholder grouping for moment tests (currently empty)."""
class TestReducer:
    def test_int_index(self):
        from pandas.core.series import Series

        arr = np.random.randn(100, 4)

        # column-wise reduction
        result = reduction.reduce(arr, np.sum, labels=Index(np.arange(4)))
        assert_almost_equal(result, arr.sum(0))

        # row-wise reduction
        result = reduction.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100)))
        assert_almost_equal(result, arr.sum(1))

        # with an explicit dummy Series
        dummy = Series(0.0, index=np.arange(100))
        result = reduction.reduce(arr, np.sum, dummy=dummy, labels=Index(np.arange(4)))
        assert_almost_equal(result, arr.sum(0))

        dummy = Series(0.0, index=np.arange(4))
        result = reduction.reduce(
            arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
        )
        assert_almost_equal(result, arr.sum(1))

        # same call again: result must be reproducible
        result = reduction.reduce(
            arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
        )
        assert_almost_equal(result, arr.sum(1))
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,225 @@
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestCounting:
    """Tests for GroupBy.cumcount, GroupBy.ngroup and count edge cases."""

    def test_cumcount(self):
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
        g = df.groupby("A")
        expected = Series([0, 1, 2, 0, 3])
        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, g.A.cumcount())

    def test_cumcount_empty(self):
        ge = DataFrame().groupby(level=0)
        se = Series().groupby(level=0)
        # edge case: result is int64, although this is usually considered float
        e = Series(dtype="int64")
        assert_series_equal(e, ge.cumcount())
        assert_series_equal(e, se.cumcount())

    def test_cumcount_dupe_index(self):
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby("A")
        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, g.A.cumcount())

    def test_cumcount_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
        g = df.groupby("A")
        expected = Series([0, 1, 2, 0, 3], index=mi)
        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, g.A.cumcount())

    def test_cumcount_groupby_not_col(self):
        df = DataFrame(
            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
        )
        g = df.groupby([0, 0, 0, 1, 0])
        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
        assert_series_equal(expected, g.cumcount())
        assert_series_equal(expected, g.A.cumcount())

    def test_ngroup(self):
        df = DataFrame({"A": list("aaaba")})
        g = df.groupby("A")
        expected = Series([0, 0, 0, 1, 0])
        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, g.A.ngroup())

    def test_ngroup_distinct(self):
        df = DataFrame({"A": list("abcde")})
        g = df.groupby("A")
        expected = Series(range(5), dtype="int64")
        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, g.A.ngroup())

    def test_ngroup_one_group(self):
        df = DataFrame({"A": [0] * 5})
        g = df.groupby("A")
        expected = Series([0] * 5)
        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, g.A.ngroup())

    def test_ngroup_empty(self):
        ge = DataFrame().groupby(level=0)
        se = Series().groupby(level=0)
        # edge case: result is int64, although this is usually considered float
        e = Series(dtype="int64")
        assert_series_equal(e, ge.ngroup())
        assert_series_equal(e, se.ngroup())

    def test_ngroup_series_matches_frame(self):
        df = DataFrame({"A": list("aaaba")})
        s = Series(list("aaaba"))
        assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())

    def test_ngroup_dupe_index(self):
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby("A")
        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, g.A.ngroup())

    def test_ngroup_mi(self):
        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
        df = DataFrame({"A": list("aaaba")}, index=mi)
        g = df.groupby("A")
        expected = Series([0, 0, 0, 1, 0], index=mi)
        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, g.A.ngroup())

    def test_ngroup_groupby_not_col(self):
        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
        g = df.groupby([0, 0, 0, 1, 0])
        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
        assert_series_equal(expected, g.ngroup())
        assert_series_equal(expected, g.A.ngroup())

    def test_ngroup_descending(self):
        df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
        g = df.groupby(["A"])

        ascending = Series([0, 0, 1, 0, 1])
        descending = Series([1, 1, 0, 1, 0])

        # descending numbering mirrors the ascending one
        assert_series_equal(descending, (g.ngroups - 1) - ascending)
        assert_series_equal(ascending, g.ngroup(ascending=True))
        assert_series_equal(descending, g.ngroup(ascending=False))

    def test_ngroup_matches_cumcount(self):
        # verify one manually-worked-out case works
        df = DataFrame(
            [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
            columns=["A", "X"],
        )
        g = df.groupby(["A", "X"])
        assert_series_equal(g.ngroup(), Series([0, 1, 2, 0, 3]))
        assert_series_equal(g.cumcount(), Series([0, 0, 0, 1, 0]))

    def test_ngroup_cumcount_pair(self):
        # brute-force comparison for all small series
        for p in product(range(3), repeat=4):
            df = DataFrame({"a": p})
            g = df.groupby(["a"])

            order = sorted(set(p))
            ngroupd = [order.index(val) for val in p]
            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]

            assert_series_equal(g.ngroup(), Series(ngroupd))
            assert_series_equal(g.cumcount(), Series(cumcounted))

    def test_ngroup_respects_groupby_order(self):
        np.random.seed(0)
        df = DataFrame({"a": np.random.choice(list("abcdef"), 100)})
        for sort_flag in (False, True):
            g = df.groupby(["a"], sort=sort_flag)
            df["group_id"] = -1
            df["group_index"] = -1

            # walk the groups in iteration order and record both numberings
            for i, (_, group) in enumerate(g):
                df.loc[group.index, "group_id"] = i
                for j, ind in enumerate(group.index):
                    df.loc[ind, "group_index"] = j

            assert_series_equal(Series(df["group_id"].values), g.ngroup())
            assert_series_equal(Series(df["group_index"].values), g.cumcount())

    @pytest.mark.parametrize(
        "datetimelike",
        [
            [
                Timestamp("2016-05-{i:02d} 20:09:25+00:00".format(i=i))
                for i in range(1, 4)
            ],
            [Timestamp("2016-05-{i:02d} 20:09:25".format(i=i)) for i in range(1, 4)],
            [Timedelta(x, unit="h") for x in range(1, 4)],
            [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
        ],
    )
    def test_count_with_datetimelike(self, datetimelike):
        # GH #13393: DataframeGroupBy.count() used to fail when counting a
        # datetimelike column
        df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
        res = df.groupby("x").count()
        expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
        expected.index.name = "x"
        assert_frame_equal(expected, res)

    def test_count_with_only_nans_in_first_group(self):
        # GH21956: count with an all-NaN first grouping level
        df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
        result = df.groupby(["A", "B"]).C.count()
        mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
        expected = Series([], index=mi, dtype=np.int64, name="C")
        assert_series_equal(result, expected, check_index_type=False)
@@ -0,0 +1,597 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Series, Timestamp
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_filter_series():
    """Series.groupby(...).filter keeps whole groups by predicate."""
    s = pd.Series([1, 3, 20, 5, 22, 24, 7])
    expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = pd.Series([20, 22, 24], index=[2, 4, 5])
    grouped = s.groupby(s.apply(lambda x: x % 2))
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
    # dropna=False keeps the original index, filling removed rows with NaN
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(s.index),
    )
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(s.index),
    )
def test_filter_single_column_df():
    """DataFrame.groupby(...).filter on a single-column frame."""
    df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7])
    expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5])
    grouped = df.groupby(df[0].apply(lambda x: x % 2))
    tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
    tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
    # dropna=False keeps the original index, filling removed rows with NaN
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(df.index),
    )
    tm.assert_frame_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(df.index),
    )
def test_filter_multi_column_df():
    """The filter predicate may combine several columns."""
    df = pd.DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
    grouped = df.groupby(df["A"].apply(lambda x: x % 2))
    expected = pd.DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
    tm.assert_frame_equal(
        grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected
    )
def test_filter_mixed_df():
    """filter works on frames mixing numeric and object columns."""
    df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouped = df.groupby(df["A"].apply(lambda x: x % 2))
    expected = pd.DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected)
def test_filter_out_all_groups():
    """A predicate that rejects every group yields an empty result."""
    s = pd.Series([1, 3, 20, 5, 22, 24, 7])
    grouped = s.groupby(s.apply(lambda x: x % 2))
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])

    df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouped = df.groupby(df["A"].apply(lambda x: x % 2))
    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]])
def test_filter_out_no_groups():
    """A predicate that accepts every group returns the input unchanged."""
    s = pd.Series([1, 3, 20, 5, 22, 24, 7])
    grouped = s.groupby(s.apply(lambda x: x % 2))
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 0), s)

    df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    grouped = df.groupby(df["A"].apply(lambda x: x % 2))
    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].mean() > 0), df)
def test_filter_out_all_groups_in_df():
    # GH12768: all groups filtered out, with and without dropna
    df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    res = df.groupby("a").filter(lambda x: x["b"].sum() > 5, dropna=False)
    expected = pd.DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
    tm.assert_frame_equal(expected, res)

    df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    res = df.groupby("a").filter(lambda x: x["b"].sum() > 5, dropna=True)
    expected = pd.DataFrame({"a": [], "b": []}, dtype="int64")
    tm.assert_frame_equal(expected, res)
def test_filter_condition_raises():
    """An exception inside the predicate surfaces as the standard TypeError."""

    def raise_if_sum_is_zero(x):
        if x.sum() == 0:
            raise ValueError
        return x.sum() > 0

    s = pd.Series([-1, 0, 1, 2])
    grouped = s.groupby(s.apply(lambda x: x % 2))
    with pytest.raises(TypeError, match="the filter must return a boolean result"):
        grouped.filter(raise_if_sum_is_zero)
def test_filter_with_axis_in_groupby():
    # GH 11041: filter when grouping over columns (axis=1)
    index = pd.MultiIndex.from_product([range(10), [0, 1]])
    data = pd.DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64")
    result = data.groupby(level=0, axis=1).filter(lambda x: x.iloc[0, 0] > 10)
    tm.assert_frame_equal(result, data.iloc[:, 12:20])
def test_filter_bad_shapes():
    """Predicates that do not reduce to a scalar bool must raise TypeError."""
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby("B")
    g_s = s.groupby(s)

    df_msg = "filter function returned a DataFrame, but expected a scalar bool"
    ser_msg = "the filter must return a boolean result"

    # identity: returns a frame/series, not a scalar bool
    f = lambda x: x
    with pytest.raises(TypeError, match=df_msg):
        g_df.filter(f)
    with pytest.raises(TypeError, match=ser_msg):
        g_s.filter(f)

    # elementwise comparison: a boolean frame/series is still not a scalar
    f = lambda x: x == 1
    with pytest.raises(TypeError, match=df_msg):
        g_df.filter(f)
    with pytest.raises(TypeError, match=ser_msg):
        g_s.filter(f)

    # outer product: the frame case fails even earlier, on the multiply
    f = lambda x: np.outer(x, x)
    with pytest.raises(TypeError, match="can't multiply sequence by non-int of type 'str'"):
        g_df.filter(f)
    with pytest.raises(TypeError, match=ser_msg):
        g_s.filter(f)
def test_filter_nan_is_false():
    """NaN returned by the filter predicate counts as False."""
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    always_nan = lambda x: np.nan
    tm.assert_frame_equal(df.groupby(df["B"]).filter(always_nan), df.loc[[]])
    tm.assert_series_equal(s.groupby(s).filter(always_nan), s[[]])
def test_filter_against_workaround():
    """filter() must agree with the transform-then-boolean-mask workaround."""
    np.random.seed(0)

    # Series of ints
    s = Series(np.random.randint(0, 100, 1000))
    grouped = s.groupby(s.apply(lambda x: np.round(x, -1)))
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype("bool")]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Series of floats
    s = 100 * Series(np.random.random(1000))
    grouped = s.groupby(s.apply(lambda x: np.round(x, -1)))
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype("bool")]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # DataFrame of ints, floats, strings
    from string import ascii_lowercase

    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame(
        {
            "ints": Series(np.random.randint(0, 100, N)),
            "floats": N / 10 * Series(np.random.random(N)),
            "letters": Series(random_letters),
        }
    )

    # group by ints; filter on floats
    grouped = df.groupby("ints")
    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)

    # group by rounded floats; filter on string-column length
    grouped = df.groupby(df.floats.apply(lambda x: np.round(x, -1)))
    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(new_way, old_way)

    # group by strings; filter on ints
    grouped = df.groupby("letters")
    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)
def test_filter_using_len():
    # BUG GH4447: len() of the group as a filter predicate
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    grouped = df.groupby("B")

    actual = grouped.filter(lambda x: len(x) > 2)
    expected = DataFrame(
        {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
        index=np.arange(2, 6),
    )
    tm.assert_frame_equal(actual, expected)

    tm.assert_frame_equal(grouped.filter(lambda x: len(x) > 4), df.loc[[]])

    # Series have always worked properly, but test anyway
    s = df["B"]
    grouped = s.groupby(s)
    actual = grouped.filter(lambda x: len(x) > 2)
    expected = Series(4 * ["b"], index=np.arange(2, 6), name="B")
    tm.assert_series_equal(actual, expected)

    tm.assert_series_equal(grouped.filter(lambda x: len(x) > 4), s[[]])
def test_filter_maintains_ordering():
    # GH #4621: filtered rows keep their original relative order regardless
    # of the index shape (sequential, decreasing, shuffled)
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
    )

    def check(frame):
        # positions 1, 2, 4, 7 belong to tags occurring more than once
        ser = frame["pid"]
        actual = frame.groupby("tag").filter(lambda x: len(x) > 1)
        tm.assert_frame_equal(actual, frame.iloc[[1, 2, 4, 7]])
        actual = ser.groupby(frame["tag"]).filter(lambda x: len(x) > 1)
        tm.assert_series_equal(actual, ser.iloc[[1, 2, 4, 7]])

    # sequential index
    check(df)

    # sequentially decreasing index
    df.index = np.arange(len(df) - 1, -1, -1)
    check(df)

    # shuffled index
    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
    df.index = df.index[SHUFFLED]
    check(df)
def test_filter_multiple_timestamp():
    # GH 10114: grouping by a column pair that includes a Timestamp column
    df = DataFrame(
        {
            "A": np.arange(5, dtype="int64"),
            "B": ["foo", "bar", "foo", "bar", "bar"],
            "C": Timestamp("20130101"),
        }
    )
    grouped = df.groupby(["B", "C"])

    result = grouped["A"].filter(lambda x: True)
    tm.assert_series_equal(df["A"], result)

    result = grouped["A"].transform(len)
    tm.assert_series_equal(result, Series([2, 3, 2, 3, 3], name="A"))

    tm.assert_frame_equal(df, grouped.filter(lambda x: True))

    tm.assert_frame_equal(grouped.transform("sum"), DataFrame({"A": [2, 8, 2, 8, 8]}))

    tm.assert_frame_equal(grouped.transform(len), DataFrame({"A": [2, 3, 2, 3, 3]}))
def test_filter_and_transform_with_non_unique_int_index():
    """Filter/transform must align correctly on a non-unique integer index (GH4620)."""
    index = [1, 1, 1, 2, 1, 1, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    keep_positions = [1, 2, 4, 7]  # rows belonging to tags 45 and 62 (size > 1)
    drop_positions = [0, 3, 5, 6]

    def big_enough(x):
        return len(x) > 1

    # Filter DataFrame: dropna=True keeps only rows from the large groups.
    result = grouped_df.filter(big_enough)
    tm.assert_frame_equal(result, df.iloc[keep_positions])

    # dropna=False keeps the original shape and NaN-fills the dropped rows.
    result = grouped_df.filter(big_enough, dropna=False)
    expected = df.copy()
    expected.iloc[drop_positions] = np.nan
    tm.assert_frame_equal(result, expected)

    # Filter Series: same semantics on a SeriesGroupBy.
    result = grouped_ser.filter(big_enough)
    tm.assert_series_equal(result, ser.take(keep_positions))

    result = grouped_ser.filter(big_enough, dropna=False)
    nan = np.nan
    # Built by hand because the duplicated index makes this confusing!
    expected = Series([nan, 1, 1, nan, 2, nan, nan, 3], index, name="pid")
    tm.assert_series_equal(result, expected)

    # Transform Series: per-group size broadcast to every row.
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(grouped_ser.transform(len), expected)

    # Transform a column selected from the DataFrameGroupBy.
    tm.assert_series_equal(grouped_df.pid.transform(len), expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_multiple_non_unique_int_index():
    """Filter/transform on an index with several repeated int labels (GH4620)."""
    index = [1, 1, 1, 2, 0, 0, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    keep_positions = [1, 2, 4, 7]  # rows belonging to tags 45 and 62 (size > 1)
    drop_positions = [0, 3, 5, 6]

    def big_enough(x):
        return len(x) > 1

    # Filter DataFrame: dropna=True keeps only rows from the large groups.
    result = grouped_df.filter(big_enough)
    tm.assert_frame_equal(result, df.iloc[keep_positions])

    # dropna=False keeps the original shape and NaN-fills the dropped rows.
    result = grouped_df.filter(big_enough, dropna=False)
    expected = df.copy()
    expected.iloc[drop_positions] = np.nan
    tm.assert_frame_equal(result, expected)

    # Filter Series: same semantics on a SeriesGroupBy.
    result = grouped_ser.filter(big_enough)
    tm.assert_series_equal(result, ser.take(keep_positions))

    result = grouped_ser.filter(big_enough, dropna=False)
    nan = np.nan
    # Built by hand because the duplicated index makes this confusing!
    expected = Series([nan, 1, 1, nan, 2, nan, nan, 3], index, name="pid")
    tm.assert_series_equal(result, expected)

    # Transform Series: per-group size broadcast to every row.
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(grouped_ser.transform(len), expected)

    # Transform a column selected from the DataFrameGroupBy.
    tm.assert_series_equal(grouped_df.pid.transform(len), expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_float_index():
    """Filter/transform must align correctly on a non-unique float index (GH4620)."""
    index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    keep_positions = [1, 2, 4, 7]  # rows belonging to tags 45 and 62 (size > 1)
    drop_positions = [0, 3, 5, 6]

    def big_enough(x):
        return len(x) > 1

    # Filter DataFrame: dropna=True keeps only rows from the large groups.
    result = grouped_df.filter(big_enough)
    tm.assert_frame_equal(result, df.iloc[keep_positions])

    # dropna=False keeps the original shape and NaN-fills the dropped rows.
    result = grouped_df.filter(big_enough, dropna=False)
    expected = df.copy()
    expected.iloc[drop_positions] = np.nan
    tm.assert_frame_equal(result, expected)

    # Filter Series: same semantics on a SeriesGroupBy.
    result = grouped_ser.filter(big_enough)
    tm.assert_series_equal(result, ser.take(keep_positions))

    result = grouped_ser.filter(big_enough, dropna=False)
    nan = np.nan
    # Built by hand because the duplicated index makes this confusing!
    expected = Series([nan, 1, 1, nan, 2, nan, nan, 3], index, name="pid")
    tm.assert_series_equal(result, expected)

    # Transform Series: per-group size broadcast to every row.
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(grouped_ser.transform(len), expected)

    # Transform a column selected from the DataFrameGroupBy.
    tm.assert_series_equal(grouped_df.pid.transform(len), expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_timestamp_index():
    """Filter/transform must align correctly on a non-unique Timestamp index (GH4620)."""
    t0 = Timestamp("2013-09-30 00:05:00")
    t1 = Timestamp("2013-10-30 00:05:00")
    t2 = Timestamp("2013-11-30 00:05:00")
    index = [t1, t1, t1, t2, t1, t1, t0, t1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    keep_positions = [1, 2, 4, 7]  # rows belonging to tags 45 and 62 (size > 1)
    drop_positions = [0, 3, 5, 6]

    def big_enough(x):
        return len(x) > 1

    # Filter DataFrame: dropna=True keeps only rows from the large groups.
    result = grouped_df.filter(big_enough)
    tm.assert_frame_equal(result, df.iloc[keep_positions])

    # dropna=False keeps the original shape and NaN-fills the dropped rows.
    result = grouped_df.filter(big_enough, dropna=False)
    expected = df.copy()
    expected.iloc[drop_positions] = np.nan
    tm.assert_frame_equal(result, expected)

    # Filter Series: same semantics on a SeriesGroupBy.
    result = grouped_ser.filter(big_enough)
    tm.assert_series_equal(result, ser.take(keep_positions))

    result = grouped_ser.filter(big_enough, dropna=False)
    nan = np.nan
    # Built by hand because the duplicated index makes this confusing!
    expected = Series([nan, 1, 1, nan, 2, nan, nan, 3], index, name="pid")
    tm.assert_series_equal(result, expected)

    # Transform Series: per-group size broadcast to every row.
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(grouped_ser.transform(len), expected)

    # Transform a column selected from the DataFrameGroupBy.
    tm.assert_series_equal(grouped_df.pid.transform(len), expected)
|
||||
|
||||
|
||||
def test_filter_and_transform_with_non_unique_string_index():
    """Filter/transform must align correctly on a non-unique string index (GH4620)."""
    index = list("bbbcbbab")
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    keep_positions = [1, 2, 4, 7]  # rows belonging to tags 45 and 62 (size > 1)
    drop_positions = [0, 3, 5, 6]

    def big_enough(x):
        return len(x) > 1

    # Filter DataFrame: dropna=True keeps only rows from the large groups.
    result = grouped_df.filter(big_enough)
    tm.assert_frame_equal(result, df.iloc[keep_positions])

    # dropna=False keeps the original shape and NaN-fills the dropped rows.
    result = grouped_df.filter(big_enough, dropna=False)
    expected = df.copy()
    expected.iloc[drop_positions] = np.nan
    tm.assert_frame_equal(result, expected)

    # Filter Series: same semantics on a SeriesGroupBy.
    result = grouped_ser.filter(big_enough)
    tm.assert_series_equal(result, ser.take(keep_positions))

    result = grouped_ser.filter(big_enough, dropna=False)
    nan = np.nan
    # Built by hand because the duplicated index makes this confusing!
    expected = Series([nan, 1, 1, nan, 2, nan, nan, 3], index, name="pid")
    tm.assert_series_equal(result, expected)

    # Transform Series: per-group size broadcast to every row.
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(grouped_ser.transform(len), expected)

    # Transform a column selected from the DataFrameGroupBy.
    tm.assert_series_equal(grouped_df.pid.transform(len), expected)
|
||||
|
||||
|
||||
def test_filter_has_access_to_grouped_cols():
    """The filter callable must be handed the grouping column itself."""
    df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
    grouped = df.groupby("A")
    # Regression: the callable used to receive a frame without column A.
    result = grouped.filter(lambda x: x["A"].sum() == 2)
    tm.assert_frame_equal(result, df.iloc[[0, 1]])
|
||||
|
||||
|
||||
def test_filter_enforces_scalarness():
    """filter() must reject callables that return a per-row (non-scalar) result."""
    rows = [
        ["best", "a", "x"],
        ["worst", "b", "y"],
        ["best", "c", "x"],
        ["best", "d", "y"],
        ["worst", "d", "y"],
        ["worst", "d", "y"],
        ["best", "d", "z"],
    ]
    df = pd.DataFrame(rows, columns=["a", "b", "c"])
    # The callable returns a boolean Series rather than a scalar -> TypeError.
    with pytest.raises(TypeError, match="filter function returned a.*"):
        df.groupby("c").filter(lambda g: g["a"] == "best")
|
||||
|
||||
|
||||
def test_filter_non_bool_raises():
    """filter() must reject callables whose scalar result is not a boolean."""
    rows = [
        ["best", "a", 1],
        ["worst", "b", 1],
        ["best", "c", 1],
        ["best", "d", 1],
        ["worst", "d", 1],
        ["worst", "d", 1],
        ["best", "d", 1],
    ]
    df = pd.DataFrame(rows, columns=["a", "b", "c"])
    # mean() yields a float, which is not a valid filter predicate result.
    with pytest.raises(TypeError, match="filter function returned a.*"):
        df.groupby("a").filter(lambda g: g.c.mean())
|
||||
|
||||
|
||||
def test_filter_dropna_with_empty_groups():
    """dropna semantics when the predicate drops every group (GH 10780).

    Values are drawn from [0, 1), so ``mean() > 1`` is False for every group
    and the filter removes everything.
    """
    data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
    # renamed local (was misspelled "groupped")
    grouped = data.groupby(level=0)

    # dropna=False: shape preserved, all rows NaN-masked.
    result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
    expected_false = pd.Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
    tm.assert_series_equal(result_false, expected_false)

    # dropna=True: everything is dropped, leaving an empty Series.
    result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
    expected_true = pd.Series(index=pd.Index([], dtype=int))
    tm.assert_series_equal(result_true, expected_true)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,905 @@
|
||||
""" test where we are determining what we are grouping, or getting groups """
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
from pandas.core.groupby.grouper import Grouping
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_almost_equal,
|
||||
assert_frame_equal,
|
||||
assert_series_equal,
|
||||
)
|
||||
|
||||
# selection
|
||||
# --------------------------------
|
||||
|
||||
|
||||
class TestSelection:
    """Column selection on GroupBy objects: ``g[...]`` and attribute access."""

    def test_select_bad_cols(self):
        """Selecting missing columns raises a KeyError naming only the bad ones."""
        df = DataFrame([[1, 2]], columns=["A", "B"])
        gb = df.groupby("A")
        with pytest.raises(KeyError, match="\"Columns not found: 'C'\""):
            gb[["C"]]

        with pytest.raises(KeyError, match="^[^A]+$"):
            # A should not be referenced as a bad column...
            # will have to rethink regex if you change message!
            gb[["A", "C"]]

    def test_groupby_duplicated_column_errormsg(self):
        """Grouping by a duplicated column label raises; other columns still work (GH7511)."""
        df = DataFrame(
            columns=["A", "B", "A", "C"], data=[range(4), range(2, 6), range(0, 8, 2)]
        )

        msg = "Grouper for 'A' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            df.groupby("A")
        with pytest.raises(ValueError, match=msg):
            df.groupby(["A", "B"])

        # Grouping by the non-duplicated column is fine and keeps flat columns.
        counts = df.groupby("B").count()
        assert counts.columns.nlevels == 1
        assert counts.columns.size == 3

    def test_column_select_via_attr(self, df):
        """``g.C`` must behave exactly like ``g["C"]``."""
        result = df.groupby("A").C.sum()
        expected = df.groupby("A")["C"].sum()
        assert_series_equal(result, expected)

        # A column named like a GroupBy method: .mean() must still aggregate.
        df["mean"] = 1.5
        result = df.groupby("A").mean()
        expected = df.groupby("A").agg(np.mean)
        assert_frame_equal(result, expected)

    def test_getitem_list_of_columns(self):
        """List, tuple, and Index selections of columns all agree."""
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
                "E": np.random.randn(8),
            }
        )

        expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean()

        result = df.groupby("A")[["C", "D"]].mean()
        result2 = df.groupby("A")["C", "D"].mean()
        result3 = df.groupby("A")[df.columns[2:4]].mean()

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(result3, expected)

    def test_getitem_numeric_column_names(self):
        """Same selections work when column labels are integers (GH #13731)."""
        df = DataFrame(
            {
                0: list("abcd") * 2,
                2: np.random.randn(8),
                4: np.random.randn(8),
                6: np.random.randn(8),
            }
        )

        expected = df.loc[:, [0, 2, 4]].groupby(0).mean()

        result = df.groupby(0)[df.columns[1:3]].mean()
        result2 = df.groupby(0)[2, 4].mean()
        result3 = df.groupby(0)[[2, 4]].mean()

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(result3, expected)
|
||||
|
||||
|
||||
# grouping
|
||||
# --------------------------------
|
||||
|
||||
|
||||
class TestGrouping:
|
||||
def test_grouper_index_types(self):
|
||||
# related GH5375
|
||||
# groupby misbehaving when using a Floatlike index
|
||||
df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))
|
||||
for index in [
|
||||
tm.makeFloatIndex,
|
||||
tm.makeStringIndex,
|
||||
tm.makeUnicodeIndex,
|
||||
tm.makeIntIndex,
|
||||
tm.makeDateIndex,
|
||||
tm.makePeriodIndex,
|
||||
]:
|
||||
|
||||
df.index = index(len(df))
|
||||
df.groupby(list("abcde")).apply(lambda x: x)
|
||||
|
||||
df.index = list(reversed(df.index.tolist()))
|
||||
df.groupby(list("abcde")).apply(lambda x: x)
|
||||
|
||||
def test_grouper_multilevel_freq(self):
|
||||
|
||||
# GH 7885
|
||||
# with level and freq specified in a pd.Grouper
|
||||
from datetime import date, timedelta
|
||||
|
||||
d0 = date.today() - timedelta(days=14)
|
||||
dates = date_range(d0, date.today())
|
||||
date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"])
|
||||
df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)
|
||||
|
||||
# Check string level
|
||||
expected = (
|
||||
df.reset_index()
|
||||
.groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")])
|
||||
.sum()
|
||||
)
|
||||
# reset index changes columns dtype to object
|
||||
expected.columns = pd.Index([0], dtype="int64")
|
||||
|
||||
result = df.groupby(
|
||||
[pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")]
|
||||
).sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Check integer level
|
||||
result = df.groupby(
|
||||
[pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")]
|
||||
).sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_grouper_creation_bug(self):
|
||||
|
||||
# GH 8795
|
||||
df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
|
||||
g = df.groupby("A")
|
||||
expected = g.sum()
|
||||
|
||||
g = df.groupby(pd.Grouper(key="A"))
|
||||
result = g.sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = g.apply(lambda x: x.sum())
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
g = df.groupby(pd.Grouper(key="A", axis=0))
|
||||
result = g.sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# GH14334
|
||||
# pd.Grouper(key=...) may be passed in a list
|
||||
df = DataFrame(
|
||||
{"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
|
||||
)
|
||||
# Group by single column
|
||||
expected = df.groupby("A").sum()
|
||||
g = df.groupby([pd.Grouper(key="A")])
|
||||
result = g.sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Group by two columns
|
||||
# using a combination of strings and Grouper objects
|
||||
expected = df.groupby(["A", "B"]).sum()
|
||||
|
||||
# Group with two Grouper objects
|
||||
g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
|
||||
result = g.sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Group with a string and a Grouper object
|
||||
g = df.groupby(["A", pd.Grouper(key="B")])
|
||||
result = g.sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Group with a Grouper object and a string
|
||||
g = df.groupby([pd.Grouper(key="A"), "B"])
|
||||
result = g.sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# GH8866
|
||||
s = Series(
|
||||
np.arange(8, dtype="int64"),
|
||||
index=pd.MultiIndex.from_product(
|
||||
[list("ab"), range(2), date_range("20130101", periods=2)],
|
||||
names=["one", "two", "three"],
|
||||
),
|
||||
)
|
||||
result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
|
||||
expected = Series(
|
||||
[28], index=Index([Timestamp("2013-01-31")], freq="M", name="three")
|
||||
)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# just specifying a level breaks
|
||||
result = s.groupby(pd.Grouper(level="one")).sum()
|
||||
expected = s.groupby(level="one").sum()
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_grouper_column_and_index(self):
|
||||
# GH 14327
|
||||
|
||||
# Grouping a multi-index frame by a column and an index level should
|
||||
# be equivalent to resetting the index and grouping by two columns
|
||||
idx = pd.MultiIndex.from_tuples(
|
||||
[("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)]
|
||||
)
|
||||
idx.names = ["outer", "inner"]
|
||||
df_multi = pd.DataFrame(
|
||||
{"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
|
||||
index=idx,
|
||||
)
|
||||
result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean()
|
||||
expected = df_multi.reset_index().groupby(["B", "inner"]).mean()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Test the reverse grouping order
|
||||
result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean()
|
||||
expected = df_multi.reset_index().groupby(["inner", "B"]).mean()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Grouping a single-index frame by a column and the index should
|
||||
# be equivalent to resetting the index and grouping by two columns
|
||||
df_single = df_multi.reset_index("outer")
|
||||
result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean()
|
||||
expected = df_single.reset_index().groupby(["B", "inner"]).mean()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Test the reverse grouping order
|
||||
result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean()
|
||||
expected = df_single.reset_index().groupby(["inner", "B"]).mean()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_groupby_levels_and_columns(self):
|
||||
# GH9344, GH9049
|
||||
idx_names = ["x", "y"]
|
||||
idx = pd.MultiIndex.from_tuples(
|
||||
[(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names
|
||||
)
|
||||
df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)
|
||||
|
||||
by_levels = df.groupby(level=idx_names).mean()
|
||||
# reset_index changes columns dtype to object
|
||||
by_columns = df.reset_index().groupby(idx_names).mean()
|
||||
|
||||
tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)
|
||||
|
||||
by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
|
||||
tm.assert_frame_equal(by_levels, by_columns)
|
||||
|
||||
def test_groupby_categorical_index_and_columns(self, observed):
|
||||
# GH18432, adapted for GH25871
|
||||
columns = ["A", "B", "A", "B"]
|
||||
categories = ["B", "A"]
|
||||
data = np.array(
|
||||
[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int
|
||||
)
|
||||
cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
|
||||
df = DataFrame(data=data, columns=cat_columns)
|
||||
result = df.groupby(axis=1, level=0, observed=observed).sum()
|
||||
expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
|
||||
expected_columns = CategoricalIndex(
|
||||
categories, categories=categories, ordered=True
|
||||
)
|
||||
expected = DataFrame(data=expected_data, columns=expected_columns)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# test transposed version
|
||||
df = DataFrame(data.T, index=cat_columns)
|
||||
result = df.groupby(axis=0, level=0, observed=observed).sum()
|
||||
expected = DataFrame(data=expected_data.T, index=expected_columns)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_grouper_getting_correct_binner(self):
|
||||
|
||||
# GH 10063
|
||||
# using a non-time-based grouper and a time-based grouper
|
||||
# and specifying levels
|
||||
df = DataFrame(
|
||||
{"A": 1},
|
||||
index=pd.MultiIndex.from_product(
|
||||
[list("ab"), date_range("20130101", periods=80)], names=["one", "two"]
|
||||
),
|
||||
)
|
||||
result = df.groupby(
|
||||
[pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")]
|
||||
).sum()
|
||||
expected = DataFrame(
|
||||
{"A": [31, 28, 21, 31, 28, 21]},
|
||||
index=MultiIndex.from_product(
|
||||
[list("ab"), date_range("20130101", freq="M", periods=3)],
|
||||
names=["one", "two"],
|
||||
),
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_grouper_iter(self, df):
|
||||
assert sorted(df.groupby("A").grouper) == ["bar", "foo"]
|
||||
|
||||
def test_empty_groups(self, df):
|
||||
# see gh-1048
|
||||
with pytest.raises(ValueError, match="No group keys passed!"):
|
||||
df.groupby([])
|
||||
|
||||
def test_groupby_grouper(self, df):
|
||||
grouped = df.groupby("A")
|
||||
|
||||
result = df.groupby(grouped.grouper).mean()
|
||||
expected = grouped.mean()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_groupby_dict_mapping(self):
|
||||
# GH #679
|
||||
from pandas import Series
|
||||
|
||||
s = Series({"T1": 5})
|
||||
result = s.groupby({"T1": "T2"}).agg(sum)
|
||||
expected = s.groupby(["T2"]).agg(sum)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
|
||||
mapping = {"a": 0, "b": 0, "c": 1, "d": 1}
|
||||
|
||||
result = s.groupby(mapping).mean()
|
||||
result2 = s.groupby(mapping).agg(np.mean)
|
||||
expected = s.groupby([0, 0, 1, 1]).mean()
|
||||
expected2 = s.groupby([0, 0, 1, 1]).mean()
|
||||
assert_series_equal(result, expected)
|
||||
assert_series_equal(result, result2)
|
||||
assert_series_equal(result, expected2)
|
||||
|
||||
def test_groupby_grouper_f_sanity_checked(self):
|
||||
dates = date_range("01-Jan-2013", periods=12, freq="MS")
|
||||
ts = Series(np.random.randn(12), index=dates)
|
||||
|
||||
# GH3035
|
||||
# index.map is used to apply grouper to the index
|
||||
# if it fails on the elements, map tries it on the entire index as
|
||||
# a sequence. That can yield invalid results that cause trouble
|
||||
# down the line.
|
||||
# the surprise comes from using key[0:6] rather then str(key)[0:6]
|
||||
# when the elements are Timestamp.
|
||||
# the result is Index[0:6], very confusing.
|
||||
|
||||
msg = r"Grouper result violates len\(labels\) == len\(data\)"
|
||||
with pytest.raises(AssertionError, match=msg):
|
||||
ts.groupby(lambda key: key[0:6])
|
||||
|
||||
def test_grouping_error_on_multidim_input(self, df):
|
||||
msg = "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
Grouping(df.index, df[["A", "A"]])
|
||||
|
||||
def test_multiindex_passthru(self):
|
||||
|
||||
# GH 7997
|
||||
# regression from 0.14.1
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
|
||||
df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])
|
||||
|
||||
result = df.groupby(axis=1, level=[0, 1]).first()
|
||||
assert_frame_equal(result, df)
|
||||
|
||||
def test_multiindex_negative_level(self, mframe):
|
||||
# GH 13901
|
||||
result = mframe.groupby(level=-1).sum()
|
||||
expected = mframe.groupby(level="second").sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = mframe.groupby(level=-2).sum()
|
||||
expected = mframe.groupby(level="first").sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = mframe.groupby(level=[-2, -1]).sum()
|
||||
expected = mframe
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = mframe.groupby(level=[-1, "first"]).sum()
|
||||
expected = mframe.groupby(level=["second", "first"]).sum()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_multifunc_select_col_integer_cols(self, df):
|
||||
df.columns = np.arange(len(df.columns))
|
||||
|
||||
# it works!
|
||||
df.groupby(1, as_index=False)[2].agg({"Q": np.mean})
|
||||
|
||||
def test_multiindex_columns_empty_level(self):
|
||||
lst = [["count", "values"], ["to filter", ""]]
|
||||
midx = MultiIndex.from_tuples(lst)
|
||||
|
||||
df = DataFrame([[1, "A"]], columns=midx)
|
||||
|
||||
grouped = df.groupby("to filter").groups
|
||||
assert grouped["A"] == [0]
|
||||
|
||||
grouped = df.groupby([("to filter", "")]).groups
|
||||
assert grouped["A"] == [0]
|
||||
|
||||
df = DataFrame([[1, "A"], [2, "B"]], columns=midx)
|
||||
|
||||
expected = df.groupby("to filter").groups
|
||||
result = df.groupby([("to filter", "")]).groups
|
||||
assert result == expected
|
||||
|
||||
df = DataFrame([[1, "A"], [2, "A"]], columns=midx)
|
||||
|
||||
expected = df.groupby("to filter").groups
|
||||
result = df.groupby([("to filter", "")]).groups
|
||||
tm.assert_dict_equal(result, expected)
|
||||
|
||||
def test_groupby_multiindex_tuple(self):
|
||||
# GH 17979
|
||||
df = pd.DataFrame(
|
||||
[[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
|
||||
columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
|
||||
)
|
||||
expected = df.groupby([("b", 1)]).groups
|
||||
result = df.groupby(("b", 1)).groups
|
||||
tm.assert_dict_equal(expected, result)
|
||||
|
||||
df2 = pd.DataFrame(
|
||||
df.values,
|
||||
columns=pd.MultiIndex.from_arrays(
|
||||
[["a", "b", "b", "c"], ["d", "d", "e", "e"]]
|
||||
),
|
||||
)
|
||||
expected = df2.groupby([("b", "d")]).groups
|
||||
result = df.groupby(("b", 1)).groups
|
||||
tm.assert_dict_equal(expected, result)
|
||||
|
||||
df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
|
||||
expected = df3.groupby([("b", "d")]).groups
|
||||
result = df.groupby(("b", 1)).groups
|
||||
tm.assert_dict_equal(expected, result)
|
||||
|
||||
@pytest.mark.parametrize("sort", [True, False])
|
||||
def test_groupby_level(self, sort, mframe, df):
|
||||
# GH 17537
|
||||
frame = mframe
|
||||
deleveled = frame.reset_index()
|
||||
|
||||
result0 = frame.groupby(level=0, sort=sort).sum()
|
||||
result1 = frame.groupby(level=1, sort=sort).sum()
|
||||
|
||||
expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
|
||||
expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()
|
||||
|
||||
expected0.index.name = "first"
|
||||
expected1.index.name = "second"
|
||||
|
||||
assert result0.index.name == "first"
|
||||
assert result1.index.name == "second"
|
||||
|
||||
assert_frame_equal(result0, expected0)
|
||||
assert_frame_equal(result1, expected1)
|
||||
assert result0.index.name == frame.index.names[0]
|
||||
assert result1.index.name == frame.index.names[1]
|
||||
|
||||
# groupby level name
|
||||
result0 = frame.groupby(level="first", sort=sort).sum()
|
||||
result1 = frame.groupby(level="second", sort=sort).sum()
|
||||
assert_frame_equal(result0, expected0)
|
||||
assert_frame_equal(result1, expected1)
|
||||
|
||||
# axis=1
|
||||
|
||||
result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
|
||||
result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
|
||||
assert_frame_equal(result0, expected0.T)
|
||||
assert_frame_equal(result1, expected1.T)
|
||||
|
||||
# raise exception for non-MultiIndex
|
||||
msg = "level > 0 or level < -1 only valid with MultiIndex"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.groupby(level=1)
|
||||
|
||||
def test_groupby_level_index_names(self):
|
||||
# GH4014 this used to raise ValueError since 'exp'>1 (in py2)
|
||||
df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index(
|
||||
"exp"
|
||||
)
|
||||
df.groupby(level="exp")
|
||||
msg = "level name foo is not the name of the index"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.groupby(level="foo")
|
||||
|
||||
@pytest.mark.parametrize("sort", [True, False])
|
||||
def test_groupby_level_with_nas(self, sort):
|
||||
# GH 17537
|
||||
index = MultiIndex(
|
||||
levels=[[1, 0], [0, 1, 2, 3]],
|
||||
codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
|
||||
)
|
||||
|
||||
# factorizing doesn't confuse things
|
||||
s = Series(np.arange(8.0), index=index)
|
||||
result = s.groupby(level=0, sort=sort).sum()
|
||||
expected = Series([6.0, 22.0], index=[0, 1])
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
index = MultiIndex(
|
||||
levels=[[1, 0], [0, 1, 2, 3]],
|
||||
codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
|
||||
)
|
||||
|
||||
# factorizing doesn't confuse things
|
||||
s = Series(np.arange(8.0), index=index)
|
||||
result = s.groupby(level=0, sort=sort).sum()
|
||||
expected = Series([6.0, 18.0], index=[0.0, 1.0])
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_args(self, mframe):
|
||||
# PR8618 and issue 8015
|
||||
frame = mframe
|
||||
|
||||
msg = "You have to supply one of 'by' and 'level'"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
frame.groupby()
|
||||
|
||||
msg = "You have to supply one of 'by' and 'level'"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
frame.groupby(by=None, level=None)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sort,labels",
|
||||
[
|
||||
[True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
|
||||
[False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
|
||||
],
|
||||
)
|
||||
def test_level_preserve_order(self, sort, labels, mframe):
|
||||
# GH 17537
|
||||
grouped = mframe.groupby(level=0, sort=sort)
|
||||
exp_labels = np.array(labels, np.intp)
|
||||
assert_almost_equal(grouped.grouper.labels[0], exp_labels)
|
||||
|
||||
def test_grouping_labels(self, mframe):
|
||||
grouped = mframe.groupby(mframe.index.get_level_values(0))
|
||||
exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
|
||||
assert_almost_equal(grouped.grouper.labels[0], exp_labels)
|
||||
|
||||
def test_list_grouper_with_nat(self):
|
||||
# GH 14715
|
||||
df = pd.DataFrame({"date": pd.date_range("1/1/2011", periods=365, freq="D")})
|
||||
df.iloc[-1] = pd.NaT
|
||||
grouper = pd.Grouper(key="date", freq="AS")
|
||||
|
||||
# Grouper in a list grouping
|
||||
result = df.groupby([grouper])
|
||||
expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))}
|
||||
tm.assert_dict_equal(result.groups, expected)
|
||||
|
||||
# Test case without a list
|
||||
result = df.groupby(grouper)
|
||||
expected = {pd.Timestamp("2011-01-01"): 365}
|
||||
tm.assert_dict_equal(result.groups, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"func,expected",
|
||||
[
|
||||
("transform", pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))),
|
||||
("agg", pd.Series(name=2, index=pd.Float64Index([], name=1))),
|
||||
("apply", pd.Series(name=2, index=pd.Float64Index([], name=1))),
|
||||
],
|
||||
)
|
||||
def test_evaluate_with_empty_groups(self, func, expected):
|
||||
# 26208
|
||||
# test transform'ing empty groups
|
||||
# (not testing other agg fns, because they return
|
||||
# different index objects.
|
||||
df = pd.DataFrame({1: [], 2: []})
|
||||
g = df.groupby(1)
|
||||
result = getattr(g[2], func)(lambda x: x)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_empty(self):
    """Grouping an empty Series by an empty key list is well-formed (GH 27190)."""
    ser = pd.Series([], name="name")
    grouped = ser.groupby([])

    result = grouped.mean()
    tm.assert_series_equal(result, ser)

    # group properties: one grouping, with empty int codes/labels
    assert len(grouped.grouper.groupings) == 1
    tm.assert_numpy_array_equal(
        grouped.grouper.group_info[0], np.array([], dtype=np.dtype("int64"))
    )
    tm.assert_numpy_array_equal(
        grouped.grouper.group_info[1], np.array([], dtype=np.dtype("int"))
    )
    assert grouped.grouper.group_info[2] == 0

    # the Series name propagates to the grouper
    assert ser.groupby(ser).grouper.names == ["name"]
|
||||
|
||||
|
||||
# get_group
|
||||
# --------------------------------
|
||||
|
||||
|
||||
class TestGetGroup:
|
||||
def test_get_group(self):
    """get_group accepts datelike keys in several equivalent forms (GH 5267)."""
    frame = DataFrame(
        {
            "DATE": pd.to_datetime(
                [
                    "10-Oct-2013",
                    "10-Oct-2013",
                    "10-Oct-2013",
                    "11-Oct-2013",
                    "11-Oct-2013",
                    "11-Oct-2013",
                ]
            ),
            "label": ["foo", "foo", "bar", "foo", "foo", "bar"],
            "VAL": [1, 2, 3, 4, 5, 6],
        }
    )

    # single datelike key: raw key, datetime, and string are all accepted
    grouped = frame.groupby("DATE")
    key = list(grouped.groups)[0]
    by_key = grouped.get_group(key)
    by_datetime = grouped.get_group(Timestamp(key).to_pydatetime())
    by_string = grouped.get_group(str(Timestamp(key)))
    assert_frame_equal(by_key, by_datetime)
    assert_frame_equal(by_key, by_string)

    # multiple keys: the datelike element of the tuple may vary in form too
    grouped = frame.groupby(["DATE", "label"])
    key = list(grouped.groups)[0]
    by_key = grouped.get_group(key)
    by_datetime = grouped.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
    by_string = grouped.get_group((str(Timestamp(key[0])), key[1]))
    assert_frame_equal(by_key, by_datetime)
    assert_frame_equal(by_key, by_string)

    # with multiple grouping keys a same-length tuple is required
    msg = "must supply a tuple to get_group with multiple grouping keys"
    with pytest.raises(ValueError, match=msg):
        grouped.get_group("foo")
    with pytest.raises(ValueError, match=msg):
        grouped.get_group(("foo"))
    msg = (
        "must supply a same-length tuple to get_group with multiple"
        " grouping keys"
    )
    with pytest.raises(ValueError, match=msg):
        grouped.get_group(("foo", "bar", "baz"))
|
||||
|
||||
def test_get_group_empty_bins(self, observed):
    """get_group with an Interval key works; an empty bin raises KeyError."""
    frame = pd.DataFrame([3, 1, 7, 6])
    edges = [0, 5, 10, 15]
    grouped = frame.groupby(pd.cut(frame[0], edges), observed=observed)

    # TODO: should prob allow a str of Interval work as well
    # IOW '(0, 5]'
    result = grouped.get_group(pd.Interval(0, 5))
    expected = DataFrame([3, 1], index=[0, 1])
    assert_frame_equal(result, expected)

    msg = r"Interval\(10, 15, closed='right'\)"
    with pytest.raises(KeyError, match=msg):
        grouped.get_group(pd.Interval(10, 15))
|
||||
|
||||
def test_get_group_grouped_by_tuple(self):
    """get_group accepts tuple keys when the key column holds tuples (GH 8121)."""
    frame = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T
    grouped = frame.groupby("ids")
    expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2])
    assert_frame_equal(grouped.get_group((1,)), expected)

    # tuples of timestamps match against their string form as well
    stamps = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"])
    frame = DataFrame({"ids": [(ts,) for ts in stamps]})
    grouped = frame.groupby("ids")
    expected = DataFrame({"ids": [(stamps[0],), (stamps[0],)]}, index=[0, 2])
    assert_frame_equal(grouped.get_group(("2010-01-01",)), expected)
|
||||
|
||||
def test_groupby_with_empty(self):
|
||||
index = pd.DatetimeIndex(())
|
||||
data = ()
|
||||
series = pd.Series(data, index)
|
||||
grouper = pd.Grouper(freq="D")
|
||||
grouped = series.groupby(grouper)
|
||||
assert next(iter(grouped), None) is None
|
||||
|
||||
def test_groupby_with_single_column(self):
    """When the only column is the group key, aggregations yield empty frames (GH 13530)."""
    frame = pd.DataFrame({"a": list("abssbab")})
    tm.assert_frame_equal(frame.groupby("a").get_group("a"), frame.iloc[[0, 5]])

    expected = pd.DataFrame(index=pd.Index(["a", "b", "s"], name="a"))
    tm.assert_frame_equal(frame.groupby("a").count(), expected)
    tm.assert_frame_equal(frame.groupby("a").sum(), expected)
    tm.assert_frame_equal(frame.groupby("a").nth(1), expected)
|
||||
|
||||
def test_gb_key_len_equal_axis_len(self):
|
||||
# GH16843
|
||||
# test ensures that index and column keys are recognized correctly
|
||||
# when number of keys equals axis length of groupby
|
||||
df = pd.DataFrame(
|
||||
[["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]],
|
||||
columns=["first", "second", "third", "one"],
|
||||
)
|
||||
df = df.set_index(["first", "second"])
|
||||
df = df.groupby(["first", "second", "third"]).size()
|
||||
assert df.loc[("foo", "bar", "B")] == 2
|
||||
assert df.loc[("foo", "baz", "C")] == 1
|
||||
|
||||
|
||||
# groups & iteration
|
||||
# --------------------------------
|
||||
|
||||
|
||||
class TestIteration:
|
||||
def test_groups(self, df):
    """.groups is cached and maps each key to the labels of matching rows."""
    grouped = df.groupby(["A"])
    assert grouped.groups is grouped.groups  # caching works

    for key, labels in grouped.groups.items():
        assert (df.loc[labels]["A"] == key).all()

    grouped = df.groupby(["A", "B"])
    assert grouped.groups is grouped.groups  # caching works

    for key, labels in grouped.groups.items():
        assert (df.loc[labels]["A"] == key[0]).all()
        assert (df.loc[labels]["B"] == key[1]).all()
|
||||
|
||||
def test_grouping_is_iterable(self, tsframe):
    """Smoke test: a Grouping object supports plain iteration.

    This code path isn't used anywhere else; kept for coverage.
    """
    grouped = tsframe.groupby([lambda ts: ts.weekday(), lambda ts: ts.year])
    for _ in grouped.grouper.groupings[0]:
        pass
|
||||
|
||||
def test_multi_iter(self):
    """Iterating a two-key groupby yields ((k1, k2), subseries) pairs in sorted order."""
    ser = Series(np.arange(6))
    key1 = np.array(["a", "a", "a", "b", "b", "b"])
    key2 = np.array(["1", "2", "1", "2", "1", "2"])

    grouped = ser.groupby([key1, key2])

    expected = [
        ("a", "1", ser[[0, 2]]),
        ("a", "2", ser[[1]]),
        ("b", "1", ser[[4]]),
        ("b", "2", ser[[3, 5]]),
    ]
    for ((one, two), piece), (e1, e2, e3) in zip(grouped, expected):
        assert e1 == one
        assert e2 == two
        assert_series_equal(piece, e3)
|
||||
|
||||
def test_multi_iter_frame(self, three_group):
|
||||
k1 = np.array(["b", "b", "b", "a", "a", "a"])
|
||||
k2 = np.array(["1", "2", "1", "2", "1", "2"])
|
||||
df = DataFrame(
|
||||
{"v1": np.random.randn(6), "v2": np.random.randn(6), "k1": k1, "k2": k2},
|
||||
index=["one", "two", "three", "four", "five", "six"],
|
||||
)
|
||||
|
||||
grouped = df.groupby(["k1", "k2"])
|
||||
|
||||
# things get sorted!
|
||||
iterated = list(grouped)
|
||||
idx = df.index
|
||||
expected = [
|
||||
("a", "1", df.loc[idx[[4]]]),
|
||||
("a", "2", df.loc[idx[[3, 5]]]),
|
||||
("b", "1", df.loc[idx[[0, 2]]]),
|
||||
("b", "2", df.loc[idx[[1]]]),
|
||||
]
|
||||
for i, ((one, two), three) in enumerate(iterated):
|
||||
e1, e2, e3 = expected[i]
|
||||
assert e1 == one
|
||||
assert e2 == two
|
||||
assert_frame_equal(three, e3)
|
||||
|
||||
# don't iterate through groups with no data
|
||||
df["k1"] = np.array(["b", "b", "b", "a", "a", "a"])
|
||||
df["k2"] = np.array(["1", "1", "1", "2", "2", "2"])
|
||||
grouped = df.groupby(["k1", "k2"])
|
||||
groups = {key: gp for key, gp in grouped}
|
||||
assert len(groups) == 2
|
||||
|
||||
# axis = 1
|
||||
three_levels = three_group.groupby(["A", "B", "C"]).mean()
|
||||
grouped = three_levels.T.groupby(axis=1, level=(1, 2))
|
||||
for key, group in grouped:
|
||||
pass
|
||||
|
||||
def test_dictify(self, df):
    """Smoke test: every flavor of groupby iterator materializes into a dict."""
    groupbys = [
        df.groupby("A"),
        df.groupby(["A", "B"]),
        df["C"].groupby(df["A"]),
        df["C"].groupby([df["A"], df["B"]]),
        df.groupby("A")["C"],
        df.groupby(["A", "B"])["C"],
    ]
    for gb in groupbys:
        dict(iter(gb))
|
||||
|
||||
def test_groupby_with_small_elem(self):
|
||||
# GH 8542
|
||||
# length=2
|
||||
df = pd.DataFrame(
|
||||
{"event": ["start", "start"], "change": [1234, 5678]},
|
||||
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]),
|
||||
)
|
||||
grouped = df.groupby([pd.Grouper(freq="M"), "event"])
|
||||
assert len(grouped.groups) == 2
|
||||
assert grouped.ngroups == 2
|
||||
assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups
|
||||
assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups
|
||||
|
||||
res = grouped.get_group((pd.Timestamp("2014-09-30"), "start"))
|
||||
tm.assert_frame_equal(res, df.iloc[[0], :])
|
||||
res = grouped.get_group((pd.Timestamp("2013-10-31"), "start"))
|
||||
tm.assert_frame_equal(res, df.iloc[[1], :])
|
||||
|
||||
df = pd.DataFrame(
|
||||
{"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
|
||||
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]),
|
||||
)
|
||||
grouped = df.groupby([pd.Grouper(freq="M"), "event"])
|
||||
assert len(grouped.groups) == 2
|
||||
assert grouped.ngroups == 2
|
||||
assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups
|
||||
assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups
|
||||
|
||||
res = grouped.get_group((pd.Timestamp("2014-09-30"), "start"))
|
||||
tm.assert_frame_equal(res, df.iloc[[0, 2], :])
|
||||
res = grouped.get_group((pd.Timestamp("2013-10-31"), "start"))
|
||||
tm.assert_frame_equal(res, df.iloc[[1], :])
|
||||
|
||||
# length=3
|
||||
df = pd.DataFrame(
|
||||
{"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
|
||||
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]),
|
||||
)
|
||||
grouped = df.groupby([pd.Grouper(freq="M"), "event"])
|
||||
assert len(grouped.groups) == 3
|
||||
assert grouped.ngroups == 3
|
||||
assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups
|
||||
assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups
|
||||
assert (pd.Timestamp("2014-08-31"), "start") in grouped.groups
|
||||
|
||||
res = grouped.get_group((pd.Timestamp("2014-09-30"), "start"))
|
||||
tm.assert_frame_equal(res, df.iloc[[0], :])
|
||||
res = grouped.get_group((pd.Timestamp("2013-10-31"), "start"))
|
||||
tm.assert_frame_equal(res, df.iloc[[1], :])
|
||||
res = grouped.get_group((pd.Timestamp("2014-08-31"), "start"))
|
||||
tm.assert_frame_equal(res, df.iloc[[2], :])
|
||||
|
||||
def test_grouping_string_repr(self):
    """repr of a Grouping keyed by a tuple column shows the whole tuple (GH 13394)."""
    columns = MultiIndex.from_arrays([list("AAB"), list("aba")])
    frame = DataFrame([[1, 2, 3]], columns=columns)
    grouped = frame.groupby(frame[("A", "a")])

    grouping = grouped.grouper.groupings[0]
    assert repr(grouping) == "Grouping(('A', 'a'))"
|
||||
@@ -0,0 +1,82 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
@pytest.fixture(params=[["inner"], ["inner", "outer"]])
def frame(request):
    """DataFrame whose 'inner'/'outer' keys sit in the index (per param) or in columns."""
    index_levels = request.param
    result = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    if index_levels:
        result = result.set_index(index_levels)

    return result
|
||||
|
||||
|
||||
@pytest.fixture()
def series():
    """Series 'A' indexed by the MultiIndex ('outer', 'inner', 'B')."""
    raw = pd.DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 3, 1, 2, 3],
            "A": np.arange(6),
            "B": ["one", "one", "two", "two", "one", "one"],
        }
    )
    return raw.set_index(["outer", "inner", "B"])["A"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key_strs,groupers",
    [
        ("inner", pd.Grouper(level="inner")),  # Index name
        (["inner"], [pd.Grouper(level="inner")]),  # List of index name
        (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index
        (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column
    ],
)
def test_grouper_index_level_as_string(frame, key_strs, groupers):
    """Index-level names passed as strings behave like explicit Groupers."""
    by_strings = frame.groupby(key_strs).mean()
    by_groupers = frame.groupby(groupers).mean()
    assert_frame_equal(by_strings, by_groupers)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "levels",
    [
        "inner",
        "outer",
        "B",
        ["inner"],
        ["outer"],
        ["B"],
        ["inner", "outer"],
        ["outer", "inner"],
        ["inner", "outer", "B"],
        ["B", "outer", "inner"],
    ],
)
def test_grouper_index_level_as_string_series(series, levels):
    """On a Series, string level names match the equivalent explicit Groupers."""
    # build the explicit-Grouper reference result
    if isinstance(levels, list):
        explicit = [pd.Grouper(level=lv) for lv in levels]
    else:
        explicit = pd.Grouper(level=levels)
    expected = series.groupby(explicit).mean()

    result = series.groupby(levels).mean()
    assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,513 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
def test_first_last_nth(df):
|
||||
# tests for first / last / nth
|
||||
grouped = df.groupby("A")
|
||||
first = grouped.first()
|
||||
expected = df.loc[[1, 0], ["B", "C", "D"]]
|
||||
expected.index = Index(["bar", "foo"], name="A")
|
||||
expected = expected.sort_index()
|
||||
assert_frame_equal(first, expected)
|
||||
|
||||
nth = grouped.nth(0)
|
||||
assert_frame_equal(nth, expected)
|
||||
|
||||
last = grouped.last()
|
||||
expected = df.loc[[5, 7], ["B", "C", "D"]]
|
||||
expected.index = Index(["bar", "foo"], name="A")
|
||||
assert_frame_equal(last, expected)
|
||||
|
||||
nth = grouped.nth(-1)
|
||||
assert_frame_equal(nth, expected)
|
||||
|
||||
nth = grouped.nth(1)
|
||||
expected = df.loc[[2, 3], ["B", "C", "D"]].copy()
|
||||
expected.index = Index(["foo", "bar"], name="A")
|
||||
expected = expected.sort_index()
|
||||
assert_frame_equal(nth, expected)
|
||||
|
||||
# it works!
|
||||
grouped["B"].first()
|
||||
grouped["B"].last()
|
||||
grouped["B"].nth(0)
|
||||
|
||||
df.loc[df["A"] == "foo", "B"] = np.nan
|
||||
assert isna(grouped["B"].first()["foo"])
|
||||
assert isna(grouped["B"].last()["foo"])
|
||||
assert isna(grouped["B"].nth(0)["foo"])
|
||||
|
||||
# v0.14.0 whatsnew
|
||||
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
|
||||
g = df.groupby("A")
|
||||
result = g.first()
|
||||
expected = df.iloc[[1, 2]].set_index("A")
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
expected = df.iloc[[1, 2]].set_index("A")
|
||||
result = g.nth(0, dropna="any")
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_first_last_nth_dtypes(df_mixed_floats):
|
||||
|
||||
df = df_mixed_floats.copy()
|
||||
df["E"] = True
|
||||
df["F"] = 1
|
||||
|
||||
# tests for first / last / nth
|
||||
grouped = df.groupby("A")
|
||||
first = grouped.first()
|
||||
expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
|
||||
expected.index = Index(["bar", "foo"], name="A")
|
||||
expected = expected.sort_index()
|
||||
assert_frame_equal(first, expected)
|
||||
|
||||
last = grouped.last()
|
||||
expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
|
||||
expected.index = Index(["bar", "foo"], name="A")
|
||||
expected = expected.sort_index()
|
||||
assert_frame_equal(last, expected)
|
||||
|
||||
nth = grouped.nth(1)
|
||||
expected = df.loc[[3, 2], ["B", "C", "D", "E", "F"]]
|
||||
expected.index = Index(["bar", "foo"], name="A")
|
||||
expected = expected.sort_index()
|
||||
assert_frame_equal(nth, expected)
|
||||
|
||||
# GH 2763, first/last shifting dtypes
|
||||
idx = list(range(10))
|
||||
idx.append(9)
|
||||
s = Series(data=range(11), index=idx, name="IntCol")
|
||||
assert s.dtype == "int64"
|
||||
f = s.groupby(level=0).first()
|
||||
assert f.dtype == "int64"
|
||||
|
||||
|
||||
def test_nth():
|
||||
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
|
||||
g = df.groupby("A")
|
||||
|
||||
assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A"))
|
||||
assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A"))
|
||||
assert_frame_equal(g.nth(2), df.loc[[]].set_index("A"))
|
||||
assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A"))
|
||||
assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A"))
|
||||
assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A"))
|
||||
assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]])
|
||||
assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]])
|
||||
assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A"))
|
||||
|
||||
exp = df.set_index("A")
|
||||
assert_frame_equal(g.nth(0, dropna="any"), exp.iloc[[1, 2]])
|
||||
assert_frame_equal(g.nth(-1, dropna="any"), exp.iloc[[1, 2]])
|
||||
|
||||
exp["B"] = np.nan
|
||||
assert_frame_equal(g.nth(7, dropna="any"), exp.iloc[[1, 2]])
|
||||
assert_frame_equal(g.nth(2, dropna="any"), exp.iloc[[1, 2]])
|
||||
|
||||
# out of bounds, regression from 0.13.1
|
||||
# GH 6621
|
||||
df = DataFrame(
|
||||
{
|
||||
"color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
|
||||
"food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
|
||||
"two": {
|
||||
0: 1.5456590000000001,
|
||||
1: -0.070345000000000005,
|
||||
2: -2.4004539999999999,
|
||||
3: 0.46206000000000003,
|
||||
4: 0.52350799999999997,
|
||||
},
|
||||
"one": {
|
||||
0: 0.56573799999999996,
|
||||
1: -0.9742360000000001,
|
||||
2: 1.033801,
|
||||
3: -0.78543499999999999,
|
||||
4: 0.70422799999999997,
|
||||
},
|
||||
}
|
||||
).set_index(["color", "food"])
|
||||
|
||||
result = df.groupby(level=0, as_index=False).nth(2)
|
||||
expected = df.iloc[[-1]]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.groupby(level=0, as_index=False).nth(3)
|
||||
expected = df.loc[[]]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# GH 7559
|
||||
# from the vbench
|
||||
df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype="int64")
|
||||
s = df[1]
|
||||
g = df[0]
|
||||
expected = s.groupby(g).first()
|
||||
expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
|
||||
assert_series_equal(expected2, expected, check_names=False)
|
||||
assert expected.name == 1
|
||||
assert expected2.name == 1
|
||||
|
||||
# validate first
|
||||
v = s[g == 1].iloc[0]
|
||||
assert expected.iloc[0] == v
|
||||
assert expected2.iloc[0] == v
|
||||
|
||||
# this is NOT the same as .first (as sorted is default!)
|
||||
# as it keeps the order in the series (and not the group order)
|
||||
# related GH 7287
|
||||
expected = s.groupby(g, sort=False).first()
|
||||
result = s.groupby(g, sort=False).nth(0, dropna="all")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
with pytest.raises(ValueError, match="For a DataFrame groupby"):
|
||||
s.groupby(g, sort=False).nth(0, dropna=True)
|
||||
|
||||
# doc example
|
||||
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
|
||||
g = df.groupby("A")
|
||||
result = g.B.nth(0, dropna="all")
|
||||
expected = g.B.first()
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# test multiple nth values
|
||||
df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
|
||||
g = df.groupby("A")
|
||||
|
||||
assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A"))
|
||||
assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A"))
|
||||
assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A"))
|
||||
assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A"))
|
||||
assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A"))
|
||||
assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A"))
|
||||
assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A"))
|
||||
assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A"))
|
||||
|
||||
business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B")
|
||||
df = DataFrame(1, index=business_dates, columns=["a", "b"])
|
||||
# get the first, fourth and last two business days for each month
|
||||
key = [df.index.year, df.index.month]
|
||||
result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
|
||||
expected_dates = pd.to_datetime(
|
||||
[
|
||||
"2014/4/1",
|
||||
"2014/4/4",
|
||||
"2014/4/29",
|
||||
"2014/4/30",
|
||||
"2014/5/1",
|
||||
"2014/5/6",
|
||||
"2014/5/29",
|
||||
"2014/5/30",
|
||||
"2014/6/2",
|
||||
"2014/6/5",
|
||||
"2014/6/27",
|
||||
"2014/6/30",
|
||||
]
|
||||
)
|
||||
expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth_multi_index(three_group):
    """nth(0) on a MultiIndex groupby matches .first() (PR 9090 / GH 8979)."""
    grouped = three_group.groupby(["A", "B"])
    assert_frame_equal(grouped.nth(0), grouped.first())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, expected_first, expected_last",
|
||||
[
|
||||
(
|
||||
{
|
||||
"id": ["A"],
|
||||
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
|
||||
"foo": [1],
|
||||
},
|
||||
{
|
||||
"id": ["A"],
|
||||
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
|
||||
"foo": [1],
|
||||
},
|
||||
{
|
||||
"id": ["A"],
|
||||
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
|
||||
"foo": [1],
|
||||
},
|
||||
),
|
||||
(
|
||||
{
|
||||
"id": ["A", "B", "A"],
|
||||
"time": [
|
||||
Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
|
||||
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
|
||||
Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
|
||||
],
|
||||
"foo": [1, 2, 3],
|
||||
},
|
||||
{
|
||||
"id": ["A", "B"],
|
||||
"time": [
|
||||
Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
|
||||
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
|
||||
],
|
||||
"foo": [1, 2],
|
||||
},
|
||||
{
|
||||
"id": ["A", "B"],
|
||||
"time": [
|
||||
Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
|
||||
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
|
||||
],
|
||||
"foo": [3, 2],
|
||||
},
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_first_last_tz(data, expected_first, expected_last):
|
||||
# GH15884
|
||||
# Test that the timezone is retained when calling first
|
||||
# or last on groupby with as_index=False
|
||||
|
||||
df = DataFrame(data)
|
||||
|
||||
result = df.groupby("id", as_index=False).first()
|
||||
expected = DataFrame(expected_first)
|
||||
cols = ["id", "time", "foo"]
|
||||
assert_frame_equal(result[cols], expected[cols])
|
||||
|
||||
result = df.groupby("id", as_index=False)["time"].first()
|
||||
assert_frame_equal(result, expected[["id", "time"]])
|
||||
|
||||
result = df.groupby("id", as_index=False).last()
|
||||
expected = DataFrame(expected_last)
|
||||
cols = ["id", "time", "foo"]
|
||||
assert_frame_equal(result[cols], expected[cols])
|
||||
|
||||
result = df.groupby("id", as_index=False)["time"].last()
|
||||
assert_frame_equal(result, expected[["id", "time"]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"method, ts, alpha",
|
||||
[
|
||||
["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
|
||||
["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
|
||||
],
|
||||
)
|
||||
def test_first_last_tz_multi_column(method, ts, alpha):
|
||||
# GH 21603
|
||||
category_string = pd.Series(list("abc")).astype("category")
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"group": [1, 1, 2],
|
||||
"category_string": category_string,
|
||||
"datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
|
||||
}
|
||||
)
|
||||
result = getattr(df.groupby("group"), method)()
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"category_string": pd.Categorical(
|
||||
[alpha, "c"], dtype=category_string.dtype
|
||||
),
|
||||
"datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
|
||||
},
|
||||
index=pd.Index([1, 2], name="group"),
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth_multi_index_as_expected():
|
||||
# PR 9090, related to issue 8979
|
||||
# test nth on MultiIndex
|
||||
three_group = DataFrame(
|
||||
{
|
||||
"A": [
|
||||
"foo",
|
||||
"foo",
|
||||
"foo",
|
||||
"foo",
|
||||
"bar",
|
||||
"bar",
|
||||
"bar",
|
||||
"bar",
|
||||
"foo",
|
||||
"foo",
|
||||
"foo",
|
||||
],
|
||||
"B": [
|
||||
"one",
|
||||
"one",
|
||||
"one",
|
||||
"two",
|
||||
"one",
|
||||
"one",
|
||||
"one",
|
||||
"two",
|
||||
"two",
|
||||
"two",
|
||||
"one",
|
||||
],
|
||||
"C": [
|
||||
"dull",
|
||||
"dull",
|
||||
"shiny",
|
||||
"dull",
|
||||
"dull",
|
||||
"shiny",
|
||||
"shiny",
|
||||
"dull",
|
||||
"shiny",
|
||||
"shiny",
|
||||
"shiny",
|
||||
],
|
||||
}
|
||||
)
|
||||
grouped = three_group.groupby(["A", "B"])
|
||||
result = grouped.nth(0)
|
||||
expected = DataFrame(
|
||||
{"C": ["dull", "dull", "dull", "dull"]},
|
||||
index=MultiIndex.from_arrays(
|
||||
[["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]],
|
||||
names=["A", "B"],
|
||||
),
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_head_tail():
    """head/tail per group for as_index=True/False, including 0 and negative n.

    Fix: the ``as_index=True`` empty-expected frame previously borrowed its
    dtyped columns from ``empty_not_as``; each expected frame now derives its
    dtypes from ``df`` directly so the two checks are independent.
    """
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    g_as = df.groupby("A", as_index=True)
    g_not_as = df.groupby("A", as_index=False)

    # as_index=False, much easier
    assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
    assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))

    # an empty selection must still carry the original column dtypes
    empty_not_as = DataFrame(
        columns=df.columns, index=pd.Index([], dtype=df.index.dtype)
    )
    empty_not_as["A"] = empty_not_as["A"].astype(df.A.dtype)
    empty_not_as["B"] = empty_not_as["B"].astype(df.B.dtype)
    assert_frame_equal(empty_not_as, g_not_as.head(0))
    assert_frame_equal(empty_not_as, g_not_as.tail(0))
    assert_frame_equal(empty_not_as, g_not_as.head(-1))
    assert_frame_equal(empty_not_as, g_not_as.tail(-1))

    assert_frame_equal(df, g_not_as.head(7))  # contains all
    assert_frame_equal(df, g_not_as.tail(7))

    # as_index=True, (used to be different)
    df_as = df

    assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
    assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))

    empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
    # derive dtypes from df itself (was: copied from empty_not_as)
    empty_as["A"] = empty_as["A"].astype(df.A.dtype)
    empty_as["B"] = empty_as["B"].astype(df.B.dtype)
    assert_frame_equal(empty_as, g_as.head(0))
    assert_frame_equal(empty_as, g_as.tail(0))
    assert_frame_equal(empty_as, g_as.head(-1))
    assert_frame_equal(empty_as, g_as.tail(-1))

    assert_frame_equal(df_as, g_as.head(7))  # contains all
    assert_frame_equal(df_as, g_as.tail(7))

    # test with selection
    assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []])
    assert_frame_equal(g_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]])
    assert_frame_equal(g_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]])
    assert_frame_equal(g_as[["A", "B"]].head(1), df_as.loc[[0, 2]])

    assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []])
    assert_frame_equal(g_not_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]])
    assert_frame_equal(g_not_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]])
    assert_frame_equal(g_not_as[["A", "B"]].head(1), df_as.loc[[0, 2]])
|
||||
|
||||
|
||||
def test_group_selection_cache():
    """nth, head, and tail return consistent results regardless of call order (GH 12839)."""
    frame = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    first_rows = frame.iloc[[0, 2]].set_index("A")

    for selector in ("head", "tail"):
        # selector first, then nth
        gb = frame.groupby("A")
        sel_result = getattr(gb, selector)(n=2)
        nth_result = gb.nth(0)
        assert_frame_equal(sel_result, frame)
        assert_frame_equal(nth_result, first_rows)

        # nth first, then selector
        gb = frame.groupby("A")
        nth_result = gb.nth(0)
        sel_result = getattr(gb, selector)(n=2)
        assert_frame_equal(nth_result, first_rows)
        assert_frame_equal(sel_result, frame)
|
||||
|
||||
|
||||
def test_nth_empty():
    """nth past the end of every group returns an empty, correctly-indexed frame (GH 16064)."""
    frame = DataFrame(index=[0], columns=["a", "b", "c"])

    single = frame.groupby("a").nth(10)
    expected_single = DataFrame(index=Index([], name="a"), columns=["b", "c"])
    assert_frame_equal(single, expected_single)

    double = frame.groupby(["a", "b"]).nth(10)
    expected_double = DataFrame(
        index=MultiIndex([[], []], [[], []], names=["a", "b"]), columns=["c"]
    )
    assert_frame_equal(double, expected_double)
|
||||
|
||||
|
||||
def test_nth_column_order():
    """nth preserves the frame's original column order (GH 20760)."""
    rows = [
        [1, "b", 100],
        [1, "a", 50],
        [1, "a", np.nan],
        [2, "c", 200],
        [2, "d", 150],
    ]
    frame = DataFrame(rows, columns=["A", "C", "B"])

    expected_first = DataFrame(
        [["b", 100.0], ["c", 200.0]], columns=["C", "B"], index=Index([1, 2], name="A")
    )
    assert_frame_equal(frame.groupby("A").nth(0), expected_first)

    expected_last = DataFrame(
        [["a", 50.0], ["d", 150.0]], columns=["C", "B"], index=Index([1, 2], name="A")
    )
    assert_frame_equal(frame.groupby("A").nth(-1, dropna="any"), expected_last)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
    """Rows whose group key is NaN are excluded by nth for every dropna mode (GH 26011)."""
    rows = [
        [np.nan, 0, 1],
        ["abc", 2, 3],
        [np.nan, 4, 5],
        ["def", 6, 7],
        [np.nan, 8, 9],
    ]
    frame = DataFrame(rows, columns=list("abc"))
    result = frame.groupby("a").nth(0, dropna=dropna)
    expected = pd.DataFrame(
        [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a")
    )

    assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,444 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Series, concat
|
||||
from pandas.util import testing as tm
|
||||
|
||||
|
||||
def test_rank_apply():
    """Grouped Series.rank matches ranks computed group-by-group via iteration."""
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    df = DataFrame(
        {
            "value": np.random.randn(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )

    # check both plain ranks and percentile ranks
    for pct in (False, True):
        result = df.groupby(["key1", "key2"]).value.rank(pct=pct)

        pieces = [
            piece.value.rank(pct=pct) for _, piece in df.groupby(["key1", "key2"])
        ]
        expected = concat(pieces, axis=0).reindex(result.index)
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# Exhaustive check of GroupBy.rank over every ties method x ascending x pct
# combination, against hand-computed expected ranks for one fixed value set
# (given both as ints and as Timestamps to cover both dtype paths).
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        [2, 2, 8, 2, 6],
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
        ],
    ],
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    """Rank within each group; every group holds the same values, so the
    expected column is ``exp`` repeated once per group."""
    # Build one row per (group, value) pair: keys repeated blockwise,
    # values tiled once per group.
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
# GH 20561: +/-inf must rank as ordinary extreme values while NaN follows
# na_option ("keep" -> NaN rank, "top"/"bottom" -> ranked before/after all
# real values). Expected ranks are hand-computed for one fixed value set.
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,exp",
    [
        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
    ],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    """Grouped rank with mixed inf/NaN input matches hand-computed ranks."""
    # GH 20561
    # One row per (group, value); every group holds the same values.
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option
    )
    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
# Grouped rank with missing values: exhaustive ties_method x ascending x
# na_option ("keep"/"bottom") x pct grid against hand-computed expectations,
# with the value set given as both numeric and Timestamp data.
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            np.nan,
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
            np.nan,
            np.nan,
        ],
    ],
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp",
    [
        (
            "average",
            True,
            "keep",
            False,
            [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
        ),
        (
            "average",
            True,
            "keep",
            True,
            [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            False,
            [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            True,
            [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
        ),
        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
        (
            "min",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        (
            "max",
            False,
            "keep",
            False,
            [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        (
            "first",
            True,
            "keep",
            False,
            [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
        ),
        (
            "first",
            True,
            "keep",
            True,
            [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            False,
            [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            True,
            [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            False,
            [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            True,
            [
                1.0 / 3.0,
                1.0 / 3.0,
                np.nan,
                3.0 / 3.0,
                1.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        (
            "dense",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            False,
            "keep",
            True,
            [
                3.0 / 3.0,
                3.0 / 3.0,
                np.nan,
                1.0 / 3.0,
                3.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
        (
            "average",
            True,
            "bottom",
            True,
            [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
        ),
        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
        (
            "average",
            False,
            "bottom",
            True,
            [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
        ),
        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
        (
            "min",
            True,
            "bottom",
            True,
            [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
        ),
        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
        (
            "min",
            False,
            "bottom",
            True,
            [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
        ),
        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
        (
            "max",
            False,
            "bottom",
            True,
            [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
        ),
        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
        (
            "first",
            True,
            "bottom",
            True,
            [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
        ),
        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
        (
            "first",
            False,
            "bottom",
            True,
            [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
        ),
        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
    ],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    """Grouped rank with NaN values matches hand-computed expectations."""
    # One row per (group, value); every group holds the same values, so the
    # expected column is ``exp`` repeated once per group.
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
    """Ranking restarts inside every group rather than running on globally."""
    keys = ["a"] * 5 + ["b"] * 5
    df = DataFrame({"key": keys, "val": [1] * 10})

    result = df.groupby("key").rank(pct=pct)

    # Both groups are identical, so the expected column is the per-group
    # answer repeated once per group.
    expected = DataFrame(exp * 2, columns=["val"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_rank_avg_even_vals():
    """With an even number of ties, 'average' ranks land on a .5 midpoint."""
    df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})

    result = df.groupby("key").rank()

    # Four tied values share ranks 1-4, so each gets (1+2+3+4)/4 == 2.5.
    expected = DataFrame([2.5] * 4, columns=["val"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_raises(ties_method, ascending, na_option, pct, vals):
    """Ranking object-dtype values inside a groupby raises TypeError."""
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    grouped = df.groupby("key")

    with pytest.raises(TypeError, match="not callable"):
        grouped.rank(
            method=ties_method, ascending=ascending, na_option=na_option, pct=pct
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    """An unrecognized ``na_option`` raises ValueError regardless of dtype."""
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    grouped = df.groupby("key")
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        grouped.rank(
            method=ties_method, ascending=ascending, na_option=na_option, pct=pct
        )
|
||||
|
||||
|
||||
def test_rank_empty_group():
    """pct-ranking a group whose values are all NaN must not divide by zero.

    See gh-22519.
    """
    key = "A"
    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
    expected_vals = [0.5, np.nan, 1.0]

    # SeriesGroupBy path
    result = df.groupby(key).B.rank(pct=True)
    tm.assert_series_equal(result, Series(expected_vals, name="B"))

    # DataFrameGroupBy path
    result = df.groupby(key).rank(pct=True)
    tm.assert_frame_equal(result, DataFrame({"B": expected_vals}))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    """dense pct-rank of tiny / NaN-only groups must not divide by zero.

    GH 23666.
    """
    frame = DataFrame({"A": input_key, "B": input_value})

    result = frame.groupby("A").rank(method="dense", pct=True)
    tm.assert_frame_equal(result, DataFrame({"B": output_value}))
|
||||
@@ -0,0 +1,759 @@
|
||||
""" test with the TimeGrouper / grouping with datetimes """
|
||||
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
from numpy import nan
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
|
||||
from pandas.core.groupby.grouper import Grouper
|
||||
from pandas.core.groupby.ops import BinGrouper
|
||||
from pandas.util import testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestGroupBy:
|
||||
    def test_groupby_with_timegrouper(self):
        """Grouping by ``pd.Grouper(freq=...)`` matches ``resample().sum()``
        and the resulting index keeps its name, sorted input or not."""
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame(
            {
                "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by="Quantity")

        for df in [df_original, df_reordered]:
            # Rebinds the loop variable: each iteration works on a
            # Date-indexed copy of the current frame.
            df = df.set_index(["Date"])

            # 5-day bins from Sep 1 to Dec 5; all bins start at 0 and only
            # the three occupied bins are overwritten below.
            expected = DataFrame(
                {"Quantity": 0},
                index=date_range(
                    "20130901", "20131205", freq="5D", name="Date", closed="left"
                ),
            )
            expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64")

            # Same expected frame must come out of resample ...
            result1 = df.resample("5D").sum()
            assert_frame_equal(result1, expected)

            # ... groupby(Grouper) on a sorted index ...
            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(pd.Grouper(freq="5D")).sum()
            assert_frame_equal(result2, expected)

            # ... and groupby(Grouper) on the unsorted index.
            result3 = df.groupby(pd.Grouper(freq="5D")).sum()
            assert_frame_equal(result3, expected)
|
||||
|
||||
@pytest.mark.parametrize("should_sort", [True, False])
|
||||
def test_groupby_with_timegrouper_methods(self, should_sort):
|
||||
# GH 3881
|
||||
# make sure API of timegrouper conforms
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"Branch": "A A A A A B".split(),
|
||||
"Buyer": "Carl Mark Carl Joe Joe Carl".split(),
|
||||
"Quantity": [1, 3, 5, 8, 9, 3],
|
||||
"Date": [
|
||||
datetime(2013, 1, 1, 13, 0),
|
||||
datetime(2013, 1, 1, 13, 5),
|
||||
datetime(2013, 10, 1, 20, 0),
|
||||
datetime(2013, 10, 2, 10, 0),
|
||||
datetime(2013, 12, 2, 12, 0),
|
||||
datetime(2013, 12, 2, 14, 0),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
if should_sort:
|
||||
df = df.sort_values(by="Quantity", ascending=False)
|
||||
|
||||
df = df.set_index("Date", drop=False)
|
||||
g = df.groupby(pd.Grouper(freq="6M"))
|
||||
assert g.group_keys
|
||||
|
||||
assert isinstance(g.grouper, BinGrouper)
|
||||
groups = g.groups
|
||||
assert isinstance(groups, dict)
|
||||
assert len(groups) == 3
|
||||
|
||||
    def test_timegrouper_with_reg_groups(self):
        """Combining a time-based ``pd.Grouper`` with regular column
        groupers in one ``groupby`` call (GH 3794)."""

        # GH 3794
        # allow combination of timegrouper/reg groups

        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)

        # Annual and 6-month-start bins combined with the Buyer column;
        # sorted and unsorted input must agree.
        for df in [df_original, df_sorted]:
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([pd.Grouper(freq="A"), "Buyer"]).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame(
                {
                    "Buyer": "Carl Mark Carl Joe".split(),
                    "Quantity": [1, 3, 9, 18],
                    "Date": [
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            result = df.groupby([pd.Grouper(freq="6MS"), "Buyer"]).sum()
            assert_frame_equal(result, expected)

        # Second fixture: all dates inside October 2013 so daily and
        # monthly bins can be checked, plus key/level spellings.
        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 10, 1, 13, 0),
                    datetime(2013, 10, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 2, 12, 0),
                    datetime(2013, 10, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)
        for df in [df_original, df_sorted]:

            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark Carl Joe".split(),
                    "Quantity": [6, 8, 3, 4, 10],
                    "Date": [
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([pd.Grouper(freq="1D"), "Buyer"]).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq="1M"), "Buyer"]).sum()
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum()
            assert_frame_equal(result, expected)

            # an unknown key must raise, not silently group on nothing
            with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
                df.groupby([pd.Grouper(freq="1M", key="foo"), "Buyer"]).sum()

            # passing the level
            df = df.set_index("Date")
            result = df.groupby([pd.Grouper(freq="1M", level="Date"), "Buyer"]).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq="1M", level=0), "Buyer"]).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq="1M", level="foo"), "Buyer"]).sum()

            # multi names: a "Date" column distinct from the "Date" index,
            # shifted two month-ends forward; key= must pick the column.
            df = df.copy()
            df["Date"] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum()
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with pytest.raises(ValueError):
                df.groupby(
                    [pd.Grouper(freq="1M", key="Date", level="Date"), "Buyer"]
                ).sum()

            # single groupers: default (index) grouping vs key= grouping,
            # with and without the surrounding list.
            expected = DataFrame(
                {"Quantity": [31], "Date": [datetime(2013, 10, 31, 0, 0)]}
            ).set_index("Date")
            result = df.groupby(pd.Grouper(freq="1M")).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq="1M")]).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame(
                {"Quantity": [31], "Date": [datetime(2013, 11, 30, 0, 0)]}
            ).set_index("Date")
            result = df.groupby(pd.Grouper(freq="1M", key="Date")).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq="1M", key="Date")]).sum()
            assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"])
    def test_timegrouper_with_reg_groups_freq(self, freq):
        """``groupby([Grouper(freq), col])`` matches the equivalent
        per-group resample, with or without a pre-sorted index."""
        # GH 6764 multiple grouping with/without sort
        df = DataFrame(
            {
                "date": pd.to_datetime(
                    [
                        "20121002",
                        "20121007",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20121002",
                        "20121207",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20130202",
                        "20130305",
                    ]
                ),
                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
                "whole_cost": [
                    1790,
                    364,
                    280,
                    259,
                    201,
                    623,
                    90,
                    312,
                    359,
                    301,
                    359,
                    801,
                ],
                "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
            }
        ).set_index("date")

        # Reference answer: resample each user's series, keep only the
        # occupied bins, and put the levels in (date, user_id) order so it
        # lines up with the groupby result.
        expected = (
            df.groupby("user_id")["whole_cost"]
            .resample(freq)
            .sum(min_count=1)  # XXX
            .dropna()
            .reorder_levels(["date", "user_id"])
            .sort_index()
            .astype("int64")
        )
        expected.name = "whole_cost"

        # sorted-index path
        result1 = (
            df.sort_index()
            .groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"]
            .sum()
        )
        assert_series_equal(result1, expected)

        # unsorted-index path must give the same answer
        result2 = df.groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"].sum()
        assert_series_equal(result2, expected)
|
||||
|
||||
    def test_timegrouper_get_group(self):
        """``get_group`` works with a time-based grouper keyed on a column,
        combined with a regular grouper, and keyed on the index (GH 6914)."""
        # GH 6914

        df_original = DataFrame(
            {
                "Buyer": "Carl Joe Joe Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )
        df_reordered = df_original.sort_values(by="Quantity")

        # single grouping: one expected sub-frame per monthly bin
        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]
        dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq="M", key="Date"))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping: get_group takes a (Buyer, month-end) tuple
        expected_list = [
            df_original.iloc[[1]],
            df_original.iloc[[3]],
            df_original.iloc[[4]],
        ]
        g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(["Buyer", pd.Grouper(freq="M", key="Date")])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with index: same single-grouping check, Grouper applied to the
        # Date index instead of a column
        df_original = df_original.set_index("Date")
        df_reordered = df_original.sort_values(by="Quantity")

        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq="M"))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)
|
||||
|
||||
def test_timegrouper_apply_return_type_series(self):
|
||||
# Using `apply` with the `TimeGrouper` should give the
|
||||
# same return type as an `apply` with a `Grouper`.
|
||||
# Issue #11742
|
||||
df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
|
||||
df_dt = df.copy()
|
||||
df_dt["date"] = pd.to_datetime(df_dt["date"])
|
||||
|
||||
def sumfunc_series(x):
|
||||
return pd.Series([x["value"].sum()], ("sum",))
|
||||
|
||||
expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_series)
|
||||
result = df_dt.groupby(pd.Grouper(freq="M", key="date")).apply(sumfunc_series)
|
||||
assert_frame_equal(
|
||||
result.reset_index(drop=True), expected.reset_index(drop=True)
|
||||
)
|
||||
|
||||
def test_timegrouper_apply_return_type_value(self):
|
||||
# Using `apply` with the `TimeGrouper` should give the
|
||||
# same return type as an `apply` with a `Grouper`.
|
||||
# Issue #11742
|
||||
df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
|
||||
df_dt = df.copy()
|
||||
df_dt["date"] = pd.to_datetime(df_dt["date"])
|
||||
|
||||
def sumfunc_value(x):
|
||||
return x.value.sum()
|
||||
|
||||
expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_value)
|
||||
result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value)
|
||||
assert_series_equal(
|
||||
result.reset_index(drop=True), expected.reset_index(drop=True)
|
||||
)
|
||||
|
||||
def test_groupby_groups_datetimeindex(self):
|
||||
# GH#1430
|
||||
periods = 1000
|
||||
ind = pd.date_range(start="2012/1/1", freq="5min", periods=periods)
|
||||
df = DataFrame(
|
||||
{"high": np.arange(periods), "low": np.arange(periods)}, index=ind
|
||||
)
|
||||
grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
|
||||
|
||||
# it works!
|
||||
groups = grouped.groups
|
||||
assert isinstance(list(groups.keys())[0], datetime)
|
||||
|
||||
# GH#11442
|
||||
index = pd.date_range("2015/01/01", periods=5, name="date")
|
||||
df = pd.DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
|
||||
result = df.groupby(level="date").groups
|
||||
dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
|
||||
expected = {
|
||||
pd.Timestamp(date): pd.DatetimeIndex([date], name="date") for date in dates
|
||||
}
|
||||
tm.assert_dict_equal(result, expected)
|
||||
|
||||
grouped = df.groupby(level="date")
|
||||
for date in dates:
|
||||
result = grouped.get_group(date)
|
||||
data = [[df.loc[date, "A"], df.loc[date, "B"]]]
|
||||
expected_index = pd.DatetimeIndex([date], name="date")
|
||||
expected = pd.DataFrame(data, columns=list("AB"), index=expected_index)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
    def test_groupby_groups_datetimeindex_tz(self):
        """Grouping by a tz-aware datetime column (and by a tz-aware index
        level) keeps the timezone on the resulting index (GH 3950)."""
        # GH 3950
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "datetime": dates,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        # Localize each stamp to US/Pacific so the grouper sees tz-aware data.
        df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))

        # Expected index: each (datetime, label) pair, tz preserved.
        exp_idx1 = pd.DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Pacific",
            name="datetime",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["datetime", "label"]).sum()
        assert_frame_equal(result, expected)

        # by level: same check with the tz-aware stamps as the index itself
        didx = pd.DatetimeIndex(dates, tz="Asia/Tokyo")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = pd.DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            tz="Asia/Tokyo",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_datetime64_handling_groupby(self):
|
||||
# it works!
|
||||
df = DataFrame(
|
||||
[(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
|
||||
columns=["a", "date"],
|
||||
)
|
||||
result = df.groupby("a").first()
|
||||
assert result["date"][3] == Timestamp("2012-07-03")
|
||||
|
||||
    def test_groupby_multi_timezone(self):
        """Localizing each tz-group to its own zone keeps per-row zones
        (object dtype), and ``get_group`` round-trips a single zone."""

        # combining multiple / different timezones yields UTC

        data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

        df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"])
        # Localize each date to the timezone named in its own row.
        result = df.groupby("tz").date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
        )

        expected = Series(
            [
                Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
                Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
            ],
            name="date",
            dtype=object,
        )
        assert_series_equal(result, expected)

        # get_group for one timezone returns just that zone's rows, which
        # localize to a tz-aware series with the original row positions.
        tz = "America/Chicago"
        res_values = df.groupby("tz").date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(
            ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
            index=[0, 1, 3],
            name="date",
        )
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_groups_periods(self):
    # Grouping by a Period column (and by a PeriodIndex level) should keep
    # period dtype in the resulting index.
    dates = [
        "2011-07-19 07:00:00",
        "2011-07-19 08:00:00",
        "2011-07-19 09:00:00",
        "2011-07-19 07:00:00",
        "2011-07-19 08:00:00",
        "2011-07-19 09:00:00",
    ]
    df = DataFrame(
        {
            "label": ["a", "a", "a", "b", "b", "b"],
            "period": [pd.Period(d, freq="H") for d in dates],
            "value1": np.arange(6, dtype="int64"),
            "value2": [1, 2] * 3,
        }
    )

    exp_idx1 = pd.PeriodIndex(
        [
            "2011-07-19 07:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 09:00:00",
        ],
        freq="H",
        name="period",
    )
    exp_idx2 = Index(["a", "b"] * 3, name="label")
    expected = DataFrame(
        {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
        index=MultiIndex.from_arrays([exp_idx1, exp_idx2]),
        columns=["value1", "value2"],
    )
    assert_frame_equal(df.groupby(["period", "label"]).sum(), expected)

    # by level: same aggregation, grouping on the PeriodIndex itself
    didx = pd.PeriodIndex(dates, freq="H")
    df = DataFrame(
        {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
        index=didx,
    )

    exp_idx = pd.PeriodIndex(
        ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
        freq="H",
    )
    expected = DataFrame(
        {"value1": [3, 5, 7], "value2": [2, 4, 6]},
        index=exp_idx,
        columns=["value1", "value2"],
    )
    assert_frame_equal(df.groupby(level=0).sum(), expected)
|
||||
|
||||
def test_groupby_first_datetime64(self):
|
||||
df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
|
||||
df[1] = df[1].view("M8[ns]")
|
||||
|
||||
assert issubclass(df[1].dtype.type, np.datetime64)
|
||||
|
||||
result = df.groupby(level=0).first()
|
||||
got_dt = result[1].dtype
|
||||
assert issubclass(got_dt.type, np.datetime64)
|
||||
|
||||
result = df[1].groupby(level=0).first()
|
||||
got_dt = result.dtype
|
||||
assert issubclass(got_dt.type, np.datetime64)
|
||||
|
||||
def test_groupby_max_datetime64(self):
    # GH 5869: max() on datetimelike values must match the apply() path
    # (no lossy dtype conversion through int).
    df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5)))
    expected = df.groupby("A")["A"].apply(lambda grp: grp.max())
    result = df.groupby("A")["A"].max()
    assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_datetime64_32_bit(self):
    # GH 6410 / numpy 4328: 32-bit indexing issue under numpy 1.9-dev when
    # transforming a datetime64 column.
    df = DataFrame({"A": range(2), "B": [pd.Timestamp("2000-01-1")] * 2})
    result = df.groupby("A")["B"].transform(min)
    expected = Series([pd.Timestamp("2000-01-1")] * 2, name="B")
    assert_series_equal(result, expected)
|
||||
|
||||
def test_groupby_with_timezone_selection(self):
|
||||
# GH 11616
|
||||
# Test that column selection returns output in correct timezone.
|
||||
np.random.seed(42)
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"factor": np.random.randint(0, 3, size=60),
|
||||
"time": pd.date_range(
|
||||
"01/01/2000 00:00", periods=60, freq="s", tz="UTC"
|
||||
),
|
||||
}
|
||||
)
|
||||
df1 = df.groupby("factor").max()["time"]
|
||||
df2 = df.groupby("factor")["time"].max()
|
||||
tm.assert_series_equal(df1, df2)
|
||||
|
||||
def test_timezone_info(self):
    # see gh-11682: timezone info must survive broadcasting a scalar
    # datetime into a DataFrame column.
    df = pd.DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
    assert df["b"][0].tzinfo == pytz.utc

    df = pd.DataFrame({"a": [1, 2, 3]})
    df["b"] = datetime.now(pytz.utc)
    assert df["b"][0].tzinfo == pytz.utc
|
||||
|
||||
def test_datetime_count(self):
    # count() on a datetime column behaves like on any other column:
    # per-group non-null counts.
    df = DataFrame(
        {"a": [1, 2, 3] * 2, "dates": pd.date_range("now", periods=6, freq="T")}
    )
    result = df.groupby("a").dates.count()
    expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_first_last_max_min_on_time_data(self):
    # GH 10295: NaT must not appear in the result of max, min, first and
    # last, so a frame with NaT rows agrees with the NaT rows dropped.
    from datetime import timedelta as td

    df_test = DataFrame(
        {
            "dt": [
                nan,
                "2015-07-24 10:10",
                "2015-07-25 11:11",
                "2015-07-23 12:12",
                nan,
            ],
            "td": [nan, td(days=1), td(days=2), td(days=3), nan],
        }
    )
    df_test.dt = pd.to_datetime(df_test.dt)
    df_test["group"] = "A"
    df_ref = df_test[df_test.dt.notna()]

    grouped_test = df_test.groupby("group")
    grouped_ref = df_ref.groupby("group")

    # each reduction must be identical with/without the NaT rows
    for op in ("max", "min", "first", "last"):
        assert_frame_equal(getattr(grouped_ref, op)(), getattr(grouped_test, op)())
|
||||
|
||||
def test_nunique_with_timegrouper_and_nat(self):
|
||||
# GH 17575
|
||||
test = pd.DataFrame(
|
||||
{
|
||||
"time": [
|
||||
Timestamp("2016-06-28 09:35:35"),
|
||||
pd.NaT,
|
||||
Timestamp("2016-06-28 16:46:28"),
|
||||
],
|
||||
"data": ["1", "2", "3"],
|
||||
}
|
||||
)
|
||||
|
||||
grouper = pd.Grouper(key="time", freq="h")
|
||||
result = test.groupby(grouper)["data"].nunique()
|
||||
expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_scalar_call_versus_list_call(self):
    # Issue: 17530 -- groupby(grouper) and groupby([grouper]) must agree.
    data_frame = {
        "location": ["shanghai", "beijing", "shanghai"],
        "time": pd.Series(
            ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
            dtype="datetime64[ns]",
        ),
        "value": [1, 2, 3],
    }
    data_frame = pd.DataFrame(data_frame).set_index("time")
    grouper = pd.Grouper(freq="D")

    result = data_frame.groupby(grouper).count()
    expected = data_frame.groupby([grouper]).count()
    assert_frame_equal(result, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
these are systematically testing all of the args to value_counts
|
||||
with different size combinations. This is to ensure stability of the sorting
|
||||
and proper parameter handling
|
||||
"""
|
||||
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, MultiIndex, Series, date_range
|
||||
from pandas.util import testing as tm
|
||||
|
||||
|
||||
# our starting frame
|
||||
def seed_df(seed_nans, n, m):
    """Build a reproducible random starting frame of ``n`` rows.

    Columns: '1st' (letters a-d), '2nd' (dates), '3rd' (ints in [1, m]).
    When ``seed_nans`` is true, NaNs are planted at fixed strides so the
    frame stays deterministic across runs.
    """
    np.random.seed(1234)
    days = date_range("2015-08-24", periods=10)

    frame = DataFrame(
        {
            "1st": np.random.choice(list("abcd"), n),
            "2nd": np.random.choice(days, n),
            "3rd": np.random.randint(1, m + 1, n),
        }
    )

    if seed_nans:
        frame.loc[1::11, "1st"] = np.nan
        frame.loc[3::17, "2nd"] = np.nan
        for start in (7, 8, 9):
            frame.loc[start::19, "3rd"] = np.nan

    return frame
|
||||
|
||||
|
||||
# create input df, keys, and the bins for the parametrized test below
binned = []
ids = []
for seed_nans in (True, False):
    for n, m in product((100, 1000), (5, 20)):
        df = seed_df(seed_nans, n, m)
        # one run without bins, one with an even-width binning of '3rd'
        bin_choices = (None, np.arange(0, max(5, df["3rd"].max()) + 1, 2))
        key_choices = ("1st", "2nd", ["1st", "2nd"])
        for key, bins in product(key_choices, bin_choices):
            binned.append((df, key, bins, n, m))
            ids.append("{}-{}-{}".format(key, n, m))
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
def test_series_groupby_value_counts(df, keys, bins, n, m):
    # SeriesGroupBy.value_counts must agree with applying
    # Series.value_counts per group, for every option combination.

    def rebuild_index(frame):
        levels = list(map(frame.index.get_level_values, range(frame.index.nlevels)))
        frame.index = MultiIndex.from_arrays(levels, names=frame.index.names)
        return frame

    for isort, normalize, sort, ascending, dropna in product((False, True), repeat=5):
        kwargs = dict(
            normalize=normalize,
            sort=sort,
            ascending=ascending,
            dropna=dropna,
            bins=bins,
        )

        left = df.groupby(keys, sort=isort)["3rd"].value_counts(**kwargs)

        right = df.groupby(keys, sort=isort)["3rd"].apply(
            Series.value_counts, **kwargs
        )
        right.index.names = right.index.names[:-1] + ["3rd"]

        # have to sort on index because of unstable sort on values
        left, right = map(rebuild_index, (left, right))  # xref GH9212
        tm.assert_series_equal(left.sort_index(), right.sort_index())
|
||||
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
test methods relating to generic function evaluation
|
||||
the so-called white/black lists
|
||||
"""
|
||||
|
||||
from string import ascii_lowercase
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex, Series, date_range
|
||||
from pandas.util import testing as tm
|
||||
|
||||
# reduction methods exercised against the whitelist machinery
AGG_FUNCTIONS = "sum prod min max median mean skew mad std var sem".split()
# of the above, only these accept a ``skipna`` argument
AGG_FUNCTIONS_WITH_SKIPNA = ["skew", "mad"]
|
||||
|
||||
# method names exposed on a DataFrameGroupBy via the whitelist
df_whitelist = (
    "quantile fillna mad take idxmax idxmin tshift skew plot hist "
    "dtypes corrwith corr cov diff"
).split()
|
||||
|
||||
|
||||
@pytest.fixture(params=df_whitelist)
def df_whitelist_fixture(request):
    # parametrized over every whitelisted DataFrameGroupBy method name
    return request.param
|
||||
|
||||
|
||||
# method names exposed on a SeriesGroupBy via the whitelist
s_whitelist = (
    "quantile fillna mad take idxmax idxmin tshift skew plot hist "
    "dtype corr cov diff unique nlargest nsmallest "
    "is_monotonic_increasing is_monotonic_decreasing"
).split()
|
||||
|
||||
|
||||
@pytest.fixture(params=s_whitelist)
def s_whitelist_fixture(request):
    # parametrized over every whitelisted SeriesGroupBy method name
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture
def mframe():
    """10x3 random frame indexed by a two-level (first, second) MultiIndex."""
    outer = ["foo", "bar", "baz", "qux"]
    inner = ["one", "two", "three"]
    index = MultiIndex(
        levels=[outer, inner],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
|
||||
|
||||
|
||||
@pytest.fixture
def df():
    """Standard two-key (A, B) frame with two random value columns."""
    data = {
        "A": "foo bar foo bar foo bar foo foo".split(),
        "B": "one one two three two two one three".split(),
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
    return DataFrame(data)
|
||||
|
||||
|
||||
@pytest.fixture
def df_letters():
    """Small frame of random floats keyed by random lowercase letters."""
    N = 10
    letters = np.array(list(ascii_lowercase))
    random_letters = letters.take(np.random.randint(0, 26, N))
    return DataFrame(
        {
            "floats": N / 10 * Series(np.random.random(N)),
            "letters": Series(random_letters),
        }
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("whitelist", [df_whitelist, s_whitelist])
def test_groupby_whitelist(df_letters, whitelist):
    # The groupby object's _apply_whitelist must match the expected list,
    # for both the frame and the series case.
    if whitelist == df_whitelist:
        obj = df_letters  # dataframe
    else:
        obj = df_letters["floats"]

    gb = obj.groupby(df_letters.letters)
    assert set(whitelist) == set(gb._apply_whitelist)
|
||||
|
||||
|
||||
def check_whitelist(obj, df, m):
    # Check that whitelisted method ``m`` keeps a sensible __name__ and
    # __qualname__ on the groupby class (bailing out when either attribute
    # does not exist at all, e.g. for properties).
    gb = obj.groupby(df.letters)
    f = getattr(type(gb), m)

    # name
    try:
        n = f.__name__
    except AttributeError:
        return
    assert n == m

    # qualname
    try:
        qn = f.__qualname__
    except AttributeError:
        return
    assert qn.endswith(m)
|
||||
|
||||
|
||||
def test_groupby_series_whitelist(df_letters, s_whitelist_fixture):
    # every whitelisted Series method keeps its name on the groupby class
    check_whitelist(df_letters.letters, df_letters, s_whitelist_fixture)
|
||||
|
||||
|
||||
def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture):
    # every whitelisted DataFrame method keeps its name on the groupby class
    check_whitelist(df_letters, df_letters, df_whitelist_fixture)
|
||||
|
||||
|
||||
@pytest.fixture
def raw_frame():
    """MultiIndexed random frame with a few NaNs planted in fixed cells."""
    index = MultiIndex(
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    frame = DataFrame(
        np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp")
    )
    frame.iloc[1, [1, 2]] = np.nan
    frame.iloc[7, [0, 1]] = np.nan
    return frame
|
||||
|
||||
|
||||
@pytest.mark.parametrize("op", AGG_FUNCTIONS)
@pytest.mark.parametrize("level", [0, 1])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna, sort):
    # GH6944
    # GH 17537
    # explicitly test the whitelist methods: a groupby(level=...) reduction
    # must match the frame-level reduction with the same level/axis.
    frame = raw_frame if axis == 0 else raw_frame.T

    grouped = frame.groupby(level=level, axis=axis, sort=sort)
    if op in AGG_FUNCTIONS_WITH_SKIPNA:
        # only skew/mad take a skipna argument
        result = getattr(grouped, op)(skipna=skipna)
        expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna)
    else:
        result = getattr(grouped, op)()
        expected = getattr(frame, op)(level=level, axis=axis)

    if sort:
        expected = expected.sort_index(axis=axis, level=level)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_blacklist(df_letters):
    # Blacklisted methods must raise AttributeError on the groupby, with
    # either the "defined but not allowed" or the "not defined" message.
    df = df_letters
    s = df_letters.floats

    blacklist = [
        "eval",
        "query",
        "abs",
        "where",
        "mask",
        "align",
        "groupby",
        "clip",
        "astype",
        "at",
        "combine",
        "consolidate",
        "convert_objects",
    ]
    # every to_* method is blacklisted as well
    blacklist.extend(method for method in dir(df) if method.startswith("to_"))

    # e.g., to_csv
    defined_but_not_allowed = "(?:^Cannot.+{0!r}.+{1!r}.+try using the 'apply' method$)"

    # e.g., query, eval
    not_defined = "(?:^{1!r} object has no attribute {0!r}$)"
    fmt = defined_but_not_allowed + "|" + not_defined

    for bl in blacklist:
        for obj in (df, s):
            gb = obj.groupby(df.letters)
            msg = fmt.format(bl, type(gb).__name__)
            with pytest.raises(AttributeError, match=msg):
                getattr(gb, bl)
|
||||
|
||||
|
||||
def test_tab_completion(mframe):
    # dir() on a groupby should expose exactly the public API (plus the
    # frame's column names A/B/C) -- nothing more, nothing less.
    grp = mframe.groupby(level="second")
    results = {v for v in dir(grp) if not v.startswith("_")}
    expected = set(
        """
        A B C
        agg aggregate apply boxplot filter first get_group groups hist
        indices last max mean median min ngroups nth ohlc plot prod size
        std sum transform var sem count nunique head describe cummax
        quantile rank cumprod tail resample cummin fillna cumsum cumcount
        ngroup all shift skew take tshift pct_change any mad corr corrwith
        cov dtypes ndim diff idxmax idxmin ffill bfill pad backfill
        rolling expanding pipe
        """.split()
    )
    assert results == expected
|
||||
|
||||
|
||||
def test_groupby_function_rename(mframe):
|
||||
grp = mframe.groupby(level="second")
|
||||
for name in ["sum", "prod", "min", "max", "first", "last"]:
|
||||
f = getattr(grp, name)
|
||||
assert f.__name__ == name
|
||||
|
||||
|
||||
def test_groupby_selection_with_methods(df):
    # Column selection before vs. after groupby must agree for every method.
    # some methods which require DatetimeIndex
    rng = date_range("2014", periods=len(df))
    df.index = rng

    g = df.groupby(["A"])[["C"]]
    g_exp = df[["C"]].groupby(df["A"])
    # TODO check groupby with > 1 col ?

    # methods which are called as .foo()
    methods = [
        "count",
        "corr",
        "cummax",
        "cummin",
        "cumprod",
        "describe",
        "rank",
        "quantile",
        "diff",
        "shift",
        "all",
        "any",
        "idxmin",
        "idxmax",
        "ffill",
        "bfill",
        "pct_change",
        "tshift",
    ]
    for m in methods:
        # should always be frames!
        tm.assert_frame_equal(getattr(g, m)(), getattr(g_exp, m)())

    # methods which aren't just .foo()
    tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
    tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
    tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum()))

    tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean())
    tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc())

    tm.assert_frame_equal(
        g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
    )
|
||||
Reference in New Issue
Block a user