8th day of python challenges 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions

View File

@@ -0,0 +1,562 @@
"""
test .agg behavior / note that .apply is tested generally in test_groupby.py
"""
from collections import OrderedDict
import functools
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
from pandas.core.base import SpecificationError
from pandas.core.groupby.generic import _maybe_mangle_lambdas
from pandas.core.groupby.grouper import Grouping
import pandas.util.testing as tm
def test_agg_regression1(tsframe):
    """Check that ``agg(np.mean)`` agrees with ``mean()``.

    The frame is grouped by the ``year`` and ``month`` attributes of its
    index labels.
    """
    by_year_month = tsframe.groupby(
        [lambda stamp: stamp.year, lambda stamp: stamp.month]
    )
    tm.assert_frame_equal(by_year_month.agg(np.mean), by_year_month.mean())
def test_agg_must_agg(df):
    """Check that ``agg`` rejects functions that do not reduce each group."""
    col_c_by_a = df.groupby("A")["C"]
    expected_msg = "Must produce aggregated value"

    # Neither ``describe`` nor an index slice yields a single value per group.
    non_reducers = (lambda x: x.describe(), lambda x: x.index[:2])
    for func in non_reducers:
        with pytest.raises(Exception, match=expected_msg):
            col_c_by_a.agg(func)
def test_agg_ser_multi_key(df):
    """Check a Series grouped by two key Series against the frame-level sum.

    Column ``C`` is grouped by the ``A`` and ``B`` columns and aggregated with
    a summing lambda; the outcome must equal column ``C`` of
    ``df.groupby(["A", "B"]).sum()``.
    """
    results = df.C.groupby([df.A, df.B]).aggregate(lambda x: x.sum())
    expected = df.groupby(["A", "B"]).sum()["C"]
    tm.assert_series_equal(results, expected)
def test_groupby_aggregation_mixed_dtype():
    """Check group means when the key columns mix ints, strings and NaN.

    See GH 6212.
    """
    nan = np.nan
    frame = DataFrame(
        {
            "v1": [1, 3, 5, 7, 8, 3, 5, nan, 4, 5, 7, 9],
            "v2": [11, 33, 55, 77, 88, 33, 55, nan, 44, 55, 77, 99],
            "by1": ["red", "blue", 1, 2, nan, "big", 1, 2, "red", 1, nan, 12],
            "by2": ["wet", "dry", 99, 95, nan, "damp", 95, 99, "red", 99, nan, nan],
        }
    )

    group_keys = [
        (1, 95),
        (1, 99),
        (2, 95),
        (2, 99),
        ("big", "damp"),
        ("blue", "dry"),
        ("red", "red"),
        ("red", "wet"),
    ]
    expected = DataFrame(
        {
            "v1": [5, 5, 7, nan, 3, 3, 4, 1],
            "v2": [55, 55, 77, nan, 33, 33, 44, 11],
        },
        index=MultiIndex.from_tuples(group_keys, names=["by1", "by2"]),
    )

    result = frame.groupby(["by1", "by2"])[["v1", "v2"]].mean()
    tm.assert_frame_equal(result, expected)
def test_agg_apply_corner(ts, tsframe):
    """Check sum/agg/apply when every grouping key is NaN (nothing to group)."""
    # Series case
    ser_groups = ts.groupby(ts * np.nan)
    assert ts.dtype == np.float64

    # grouping by float64 values results in a float64 index
    empty_ser = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64))
    tm.assert_series_equal(ser_groups.sum(), empty_ser)
    tm.assert_series_equal(ser_groups.agg(np.sum), empty_ser)
    tm.assert_series_equal(
        ser_groups.apply(np.sum), empty_ser, check_index_type=False
    )

    # DataFrame case
    frame_groups = tsframe.groupby(tsframe["A"] * np.nan)
    empty_frame = DataFrame(
        columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64)
    )
    tm.assert_frame_equal(frame_groups.sum(), empty_frame, check_names=False)
    tm.assert_frame_equal(frame_groups.agg(np.sum), empty_frame, check_names=False)
    # ``apply`` is compared against the expected frame with all columns dropped
    tm.assert_frame_equal(
        frame_groups.apply(np.sum), empty_frame.iloc[:, :0], check_names=False
    )
def test_agg_grouping_is_list_tuple(ts):
    """Check ``agg(np.mean)`` when the grouping keys are a list or a tuple.

    The first grouping of a year-grouped frame is replaced by a ``Grouping``
    built from ``ts.index`` and the same keys converted to each container.
    """
    frame = tm.makeTimeDataFrame()
    by_year = frame.groupby(lambda stamp: stamp.year)
    year_keys = by_year.grouper.groupings[0].grouper

    for container in (list, tuple):
        by_year.grouper.groupings[0] = Grouping(ts.index, container(year_keys))
        result = by_year.agg(np.mean)
        expected = by_year.mean()
        tm.assert_frame_equal(result, expected)
def test_agg_python_multiindex(mframe):
    """Check ``agg(np.mean)`` against ``mean()`` when grouping by A and B."""
    by_ab = mframe.groupby(["A", "B"])
    tm.assert_frame_equal(by_ab.agg(np.mean), by_ab.mean())
@pytest.mark.parametrize(
    "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
)
def test_aggregate_str_func(tsframe, groupbyfunc):
    """Check that string function names resolve to the matching methods."""
    grouped = tsframe.groupby(groupbyfunc)

    # single series, one function name
    tm.assert_series_equal(grouped["A"].agg("std"), grouped["A"].std())

    # whole frame, one function name
    tm.assert_frame_equal(grouped.aggregate("var"), grouped.var())

    # whole frame, ordered mapping of column -> function name
    spec = OrderedDict([("A", "var"), ("B", "std"), ("C", "mean"), ("D", "sem")])
    result = grouped.agg(spec)
    expected = DataFrame(
        OrderedDict(
            (column, getattr(grouped[column], how)()) for column, how in spec.items()
        )
    )
    tm.assert_frame_equal(result, expected)
def test_aggregate_item_by_item(df):
    """Check aggregation with a size-returning function.

    Covers both the ``df`` fixture grouped by ``A`` and an empty frame grouped
    by ``df.A``.
    """
    by_a = df.groupby("A")
    result = by_a.agg(lambda ser: ser.size)

    n_foo = (df.A == "foo").sum()
    n_bar = (df.A == "bar").sum()
    n_cols = len(result.columns)

    # GH5782
    # odd comparisons can result here, so cast to make easy
    expected_foo = pd.Series(
        np.array([n_foo] * n_cols), index=list("BCD"), dtype=np.float64, name="foo"
    )
    tm.assert_series_equal(result.xs("foo"), expected_foo)

    expected_bar = pd.Series(
        np.array([n_bar] * n_cols), index=list("BCD"), dtype=np.float64, name="bar"
    )
    tm.assert_almost_equal(result.xs("bar"), expected_bar)

    def aggfun(ser):
        return ser.size

    empty_result = DataFrame().groupby(df.A).agg(aggfun)
    assert isinstance(empty_result, DataFrame)
    assert len(empty_result) == 0
def test_wrap_agg_out(three_group):
    """Check the result when the function raises ``TypeError`` for a column.

    ``func`` raises ``TypeError`` for object-dtype input and sums everything
    else. The result must equal aggregating the same frame with column ``C``
    removed beforehand.
    """
    grouped = three_group.groupby(["A", "B"])

    def func(ser):
        # Compare against the builtin ``object``: the ``np.object`` alias was
        # deprecated in NumPy 1.20 and removed in 1.24.
        if ser.dtype == object:
            raise TypeError
        return ser.sum()

    result = grouped.aggregate(func)
    without_c = three_group.loc[:, three_group.columns != "C"]
    expected = without_c.groupby(["A", "B"]).aggregate(func)
    tm.assert_frame_equal(result, expected)
def test_agg_multiple_functions_maintain_order(df):
    """Check that output columns follow the order of the (name, func) pairs.

    See GH #610.
    """
    named_funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
    aggregated = df.groupby("A")["C"].agg(named_funcs)
    tm.assert_index_equal(aggregated.columns, Index(["mean", "max", "min"]))
def test_multiple_functions_tuples_and_non_tuples(df):
    """Check a function list mixing a (name, func) tuple with a bare name.

    The mixed list must give the same result as the fully spelled-out list
    of tuples (#1359), for both a selected column and the whole frame.
    """
    mixed = [("foo", "mean"), "std"]
    explicit = [("foo", "mean"), ("std", "std")]

    # selected column
    tm.assert_frame_equal(
        df.groupby("A")["C"].agg(mixed), df.groupby("A")["C"].agg(explicit)
    )

    # whole frame
    tm.assert_frame_equal(df.groupby("A").agg(mixed), df.groupby("A").agg(explicit))
def test_more_flexible_frame_multi_function(df):
    """Check per-column specs that hold a function, a list, or a nested dict."""
    by_a = df.groupby("A")

    # Build the expected frame from two single-function aggregations.
    mean_part = by_a.agg(OrderedDict([["C", np.mean], ["D", np.mean]]))
    std_part = by_a.agg(OrderedDict([["C", np.std], ["D", np.std]]))
    expected = concat([mean_part, std_part], keys=["mean", "std"], axis=1)
    expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)

    list_spec = OrderedDict((column, [np.mean, np.std]) for column in ("C", "D"))
    result = by_a.aggregate(list_spec)
    tm.assert_frame_equal(result, expected)

    # be careful
    result = by_a.aggregate(OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]]))
    expected = by_a.aggregate(
        OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]])
    )
    tm.assert_frame_equal(result, expected)

    def foo(x):
        return np.mean(x)

    def bar(x):
        return np.std(x, ddof=1)

    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        renaming_spec = OrderedDict(
            [["C", np.mean], ["D", OrderedDict([["foo", np.mean], ["bar", np.std]])]]
        )
        result = by_a.aggregate(renaming_spec)

    # ``foo`` and ``bar`` supply the column labels used by the renaming spec.
    expected = by_a.aggregate(OrderedDict([["C", [np.mean]], ["D", [foo, bar]]]))
    tm.assert_frame_equal(result, expected)
def test_multi_function_flexible_mix(df):
    """Check that three spellings of the ``D`` aggregation give one result.

    Column ``D`` is aggregated via ``{"sum": "sum"}``, ``"sum"`` and
    ``["sum"]`` next to a renaming dict for ``C`` (GH #1268).
    """
    by_a = df.groupby("A")

    def spec_with(d_agg):
        # Same renaming spec for C every time; only D's aggregation varies.
        return OrderedDict(
            [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", d_agg]]
        )

    # Expected
    reference_spec = spec_with({"sum": "sum"})
    # this uses column selection & renaming
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = by_a.aggregate(reference_spec)

    # Test 1 ("sum") and Test 2 (["sum"])
    for d_agg in ("sum", ["sum"]):
        candidate_spec = spec_with(d_agg)
        # this uses column selection & renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = by_a.aggregate(candidate_spec)
        tm.assert_frame_equal(result, expected)
def test_groupby_agg_coercing_bools():
    """Check that boolean results from a per-group lambda stay boolean.

    See issue 14873.
    """
    frame = pd.DataFrame(
        {"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]}
    )
    by_a = frame.groupby("a")
    group_index = Index([1, 2], name="a")

    all_nonzero = by_a["b"].aggregate(lambda x: (x != 0).all())
    tm.assert_series_equal(
        all_nonzero, Series([False, True], index=group_index, name="b")
    )

    all_null = by_a["c"].aggregate(lambda x: x.isnull().all())
    tm.assert_series_equal(
        all_null, Series([True, False], index=group_index, name="c")
    )
def test_order_aggregate_multiple_funcs():
    """Check that the function level keeps the requested order (GH 25692)."""
    func_names = ["sum", "max", "mean", "ohlc", "min"]
    frame = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})

    aggregated = frame.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"])

    tm.assert_index_equal(aggregated.columns.levels[1], pd.Index(func_names))
@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
def test_uint64_type_handling(dtype, how):
# GH 26310
df = pd.DataFrame({"x": 6903052872240755750, "y": [1, 2]})
expected = df.groupby("y").agg({"x": how})
df.x = df.x.astype(dtype)
result = df.groupby("y").agg({"x": how})
result.x = result.x.astype(np.int64)
tm.assert_frame_equal(result, expected, check_exact=True)
class TestNamedAggregationSeries:
    """Keyword-style (named) aggregation on a grouped Series."""

    def test_series_named_agg(self):
        """Check that each keyword becomes an output column."""
        grouped = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])

        expected = pd.DataFrame(
            {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=[0, 1]
        )
        tm.assert_frame_equal(grouped.agg(a="sum", b="min"), expected)

        swapped = grouped.agg(b="min", a="sum")
        # sort for 35 and earlier
        if compat.PY36:
            expected = expected[["b", "a"]]
        tm.assert_frame_equal(swapped, expected)

    def test_no_args_raises(self):
        """Check that ``agg()`` raises while ``agg([])`` returns an empty frame."""
        grouped = pd.Series([1, 2]).groupby([0, 1])
        with pytest.raises(TypeError, match="Must provide"):
            grouped.agg()

        # but we do allow this
        tm.assert_frame_equal(grouped.agg([]), pd.DataFrame())

    def test_series_named_agg_duplicates_raises(self):
        """Check that reusing one function under two names is rejected."""
        # This is a limitation of the named agg implementation reusing
        # aggregate_multiple_funcs. It could maybe be lifted in the future.
        grouped = pd.Series([1, 2, 3]).groupby([0, 0, 1])
        with pytest.raises(SpecificationError):
            grouped.agg(a="sum", b="sum")

    def test_mangled(self):
        """Check that two lambdas can be used under different keywords."""
        grouped = pd.Series([1, 2, 3]).groupby([0, 0, 1])
        outcome = grouped.agg(a=lambda x: 0, b=lambda x: 1)
        tm.assert_frame_equal(outcome, pd.DataFrame({"a": [0, 0], "b": [1, 1]}))
class TestNamedAggregationDataFrame:
    """Keyword-style (named) aggregation on a grouped DataFrame."""

    def test_agg_relabel(self):
        """Check ``name=(column, func)`` keywords and their output order."""
        frame = pd.DataFrame(
            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
        )

        outcome = frame.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max"))
        wanted = pd.DataFrame(
            {"a_max": [1, 3], "b_max": [6, 8]},
            index=pd.Index(["a", "b"], name="group"),
            columns=["a_max", "b_max"],
        )
        tm.assert_frame_equal(outcome, wanted)

        # order invariance
        p98 = functools.partial(np.percentile, q=98)
        outcome = frame.groupby("group").agg(
            b_min=("B", "min"),
            a_min=("A", min),
            a_mean=("A", np.mean),
            a_max=("A", "max"),
            b_max=("B", "max"),
            a_98=("A", p98),
        )
        wanted = pd.DataFrame(
            {
                "b_min": [5, 7],
                "a_min": [0, 2],
                "a_mean": [0.5, 2.5],
                "a_max": [1, 3],
                "b_max": [6, 8],
                "a_98": [0.98, 2.98],
            },
            index=pd.Index(["a", "b"], name="group"),
            columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"],
        )
        if not compat.PY36:
            wanted = wanted[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]]
        tm.assert_frame_equal(outcome, wanted)

    def test_agg_relabel_non_identifier(self):
        """Check an output name that is not a valid Python identifier."""
        frame = pd.DataFrame(
            {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
        )

        # Such names can only be passed through ``**`` unpacking.
        outcome = frame.groupby("group").agg(**{"my col": ("A", "max")})
        wanted = pd.DataFrame(
            {"my col": [1, 3]}, index=pd.Index(["a", "b"], name="group")
        )
        tm.assert_frame_equal(outcome, wanted)

    def test_duplicate_raises(self):
        """Check that the same (column, func) pair under two names is rejected."""
        # TODO: we currently raise on multiple lambdas. We could *maybe*
        # update com.get_callable_name to append `_i` to each lambda.
        frame = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        with pytest.raises(SpecificationError, match="Function names"):
            frame.groupby("A").agg(a=("A", "min"), b=("A", "min"))

    def test_agg_relabel_with_level(self):
        """Check named aggregation when grouping by an index level."""
        frame = pd.DataFrame(
            {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
            index=pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]),
        )
        outcome = frame.groupby(level=0).agg(
            aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
        )
        wanted = pd.DataFrame(
            {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
        )
        tm.assert_frame_equal(outcome, wanted)

    def test_agg_relabel_other_raises(self):
        """Check that malformed keyword specs raise ``TypeError``."""
        grouped = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}).groupby("A")
        match = "Must provide"

        bad_specs = (
            {"foo": 1},  # value is not a tuple
            {},  # nothing passed at all
            {"a": ("B", "max"), "b": (1, 2, 3)},  # second value has three items
        )
        for kwargs in bad_specs:
            with pytest.raises(TypeError, match=match):
                grouped.agg(**kwargs)

    def test_missing_raises(self):
        """Check that referring to a column that is not present raises."""
        frame = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        with pytest.raises(KeyError, match="Column 'C' does not exist"):
            frame.groupby("A").agg(c=("C", "sum"))

    def test_agg_namedtuple(self):
        """Check that ``pd.NamedAgg`` behaves like the plain tuples."""
        frame = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        outcome = frame.groupby("A").agg(
            b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count")
        )
        wanted = frame.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
        tm.assert_frame_equal(outcome, wanted)

    def test_mangled(self):
        """Check that two lambdas can be used for different output columns."""
        frame = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
        outcome = frame.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
        wanted = pd.DataFrame(
            {"b": [0, 0], "c": [1, 1]}, index=pd.Index([0, 1], name="A")
        )
        tm.assert_frame_equal(outcome, wanted)
class TestLambdaMangling:
    """Renaming of multiple lambdas so their output labels stay distinct."""

    def test_maybe_mangle_lambdas_passthrough(self):
        """Check that a string, a lone lambda and a one-lambda list are kept."""
        assert _maybe_mangle_lambdas("mean") == "mean"
        assert _maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>"
        # don't mangle a single lambda.
        assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>"

    def test_maybe_mangle_lambdas_listlike(self):
        """Check numbered names and unchanged results for a list of lambdas."""
        originals = [lambda x: 1, lambda x: 2]
        mangled = _maybe_mangle_lambdas(originals)

        assert mangled[0].__name__ == "<lambda_0>"
        assert mangled[1].__name__ == "<lambda_1>"
        assert originals[0](None) == mangled[0](None)
        assert originals[1](None) == mangled[1](None)

    def test_maybe_mangle_lambdas(self):
        """Check that lambdas inside a dict value are renamed."""
        spec = {"A": [lambda x: 0, lambda x: 1]}
        mangled = _maybe_mangle_lambdas(spec)
        assert mangled["A"][0].__name__ == "<lambda_0>"
        assert mangled["A"][1].__name__ == "<lambda_1>"

    def test_maybe_mangle_lambdas_args(self):
        """Check renaming of lambdas that take extra arguments."""
        spec = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]}
        mangled = _maybe_mangle_lambdas(spec)
        assert mangled["A"][0].__name__ == "<lambda_0>"
        assert mangled["A"][1].__name__ == "<lambda_1>"

        # The remaining checks call the lambda held in ``spec``.
        first = spec["A"][0]
        assert first(0, 1) == (0, 1, 1)
        assert first(0, 1, 2) == (0, 1, 2)
        assert first(0, 2, b=3) == (0, 2, 3)

    def test_maybe_mangle_lambdas_named(self):
        """Check that a spec without lambdas comes back equal to the input."""
        spec = OrderedDict(
            [("C", np.mean), ("D", OrderedDict([("foo", np.mean), ("bar", np.mean)]))]
        )
        assert _maybe_mangle_lambdas(spec) == spec

    def test_basic(self):
        """Check the lambda labels on a DataFrame groupby aggregation."""
        frame = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
        outcome = frame.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})

        wanted = pd.DataFrame(
            {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]},
            index=pd.Index([0, 1], name="A"),
        )
        tm.assert_frame_equal(outcome, wanted)

    def test_mangle_series_groupby(self):
        """Check the lambda labels on a Series groupby aggregation."""
        grouped = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])
        outcome = grouped.agg([lambda x: 0, lambda x: 1])
        wanted = pd.DataFrame({"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]})
        tm.assert_frame_equal(outcome, wanted)

    @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
    def test_with_kwargs(self):
        """Check extra positional and keyword arguments with several lambdas."""
        f1 = lambda x, y, b=1: x.sum() + y + b
        f2 = lambda x, y, b=2: x.sum() + y * b

        outcome = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0)
        wanted = pd.DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
        tm.assert_frame_equal(outcome, wanted)

        outcome = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
        wanted = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
        tm.assert_frame_equal(outcome, wanted)

View File

@@ -0,0 +1,238 @@
"""
test cython .agg behavior
"""
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range
from pandas.core.groupby.groupby import DataError
import pandas.util.testing as tm
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Compare the grouped method ``op_name`` with a per-group computation."""
    frame = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.randn(12),
        }
    )
    frame.loc[2:10:2, "C"] = np.nan

    def apply_op(obj):
        # Call the method named ``op_name`` on a Series or a groupby object.
        return getattr(obj, op_name)()

    # single column
    by_a = frame.drop(["B"], axis=1).groupby("A")
    expected = DataFrame({"C": {key: apply_op(sub["C"]) for key, sub in by_a}})
    expected.index.name = "A"
    tm.assert_frame_equal(apply_op(by_a), expected)

    # multiple columns
    by_ab = frame.groupby(["A", "B"])
    nested = {}
    for (key_a, key_b), sub in by_ab:
        nested.setdefault(key_a, {})[key_b] = apply_op(sub["C"])
    expected = DataFrame(nested).T.stack(dropna=False)
    expected.index.names = ["A", "B"]
    expected.name = "C"

    result = apply_op(by_ab)["C"]
    # Only sum and prod are compared in the two-key case.
    if op_name in ("sum", "prod"):
        tm.assert_series_equal(result, expected)
def test_cython_agg_boolean():
    """Check ``mean()`` against ``agg(np.mean)`` on a boolean column."""
    keys = np.random.randint(0, 5, 50)
    flags = np.random.randint(0, 2, 50).astype("bool")
    frame = DataFrame({"a": keys, "b": flags})

    via_method = frame.groupby("a")["b"].mean()
    via_agg = frame.groupby("a")["b"].agg(np.mean)

    tm.assert_series_equal(via_method, via_agg)
def test_cython_agg_nothing_to_agg():
    """Check that ``mean`` over only string data raises ``DataError``."""
    expected_msg = "No numeric types to aggregate"

    # selecting the string column from the groupby
    frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
    with pytest.raises(DataError, match=expected_msg):
        frame.groupby("a")["b"].mean()

    # grouping a one-column frame by an external key Series
    frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
    with pytest.raises(DataError, match=expected_msg):
        frame[["b"]].groupby(frame["a"]).mean()
def test_cython_agg_nothing_to_agg_with_dates():
    """Check that ``mean`` of a grouped datetime column raises ``DataError``."""
    columns = {
        "a": np.random.randint(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="T"),
    }
    frame = DataFrame(columns)

    with pytest.raises(DataError, match="No numeric types to aggregate"):
        frame.groupby("b").dates.mean()
def test_cython_agg_frame_columns():
    """Run a column-wise groupby mean four times on one frame (#2113).

    The results are discarded; the test only checks that the repeated calls
    do not raise.
    """
    frame = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})

    for _ in range(4):
        frame.groupby(level=0, axis="columns").mean()
def test_cython_agg_return_dict():
    """Check that a function returning a dict yields a Series of dicts.

    See GH 16741.
    """
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )

    counts_per_group = frame.groupby("A")["B"].agg(
        lambda x: x.value_counts().to_dict()
    )

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts], index=Index(["bar", "foo"], name="A"), name="B"
    )
    tm.assert_series_equal(counts_per_group, expected)
def test_cython_fail_agg():
    """Check ``sum()`` against ``agg(np.sum)`` for a Series of strings."""
    business_days = bdate_range("1/1/2000", periods=50)
    letters = Series(["A", "B", "C", "D", "E"] * 10, index=business_days)

    by_month = letters.groupby(lambda stamp: stamp.month)

    tm.assert_series_equal(by_month.sum(), by_month.agg(np.sum))
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("add", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """Compare ``_cython_agg_general(op)`` with ``agg(targop)``."""
    frame = DataFrame(np.random.randn(1000))
    keys = np.random.randint(0, 50, size=1000).astype(float)

    via_cython = frame.groupby(keys)._cython_agg_general(op)
    via_agg = frame.groupby(keys).agg(targop)

    tm.assert_frame_equal(via_cython, via_agg)
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """Compare ``_cython_agg_general(op)`` with ``agg`` over binned keys."""
    frame = pd.DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    via_cython = frame.groupby(
        pd.cut(frame[0], bin_edges), observed=observed
    )._cython_agg_general(op)

    via_agg = frame.groupby(pd.cut(frame[0], bin_edges), observed=observed).agg(
        lambda x: targop(x)
    )

    tm.assert_frame_equal(via_cython, via_agg)
def test_cython_agg_empty_buckets_nanops(observed):
    """Check ``add`` and ``prod`` over binned keys against hardcoded values."""
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    frame = pd.DataFrame([11, 12, 13], columns=["a"])
    bin_edges = range(0, 25, 5)
    intervals = pd.interval_range(0, 20, freq=5)

    # (operation, expected value per bin, value an empty bin takes)
    cases = (
        ("add", [0, 0, 36, 0], 0),
        ("prod", [1, 1, 1716, 1], 1),
    )
    for how, per_bin, empty_value in cases:
        result = frame.groupby(
            pd.cut(frame["a"], bin_edges), observed=observed
        )._cython_agg_general(how)

        expected = pd.DataFrame(
            {"a": per_bin},
            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
        )
        if observed:
            # keep only the rows that differ from the empty-bin value
            expected = expected[expected.a != empty_value]

        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Check ``op`` on single-row groups holding ``data`` and ``NaT``."""
    # https://github.com/pandas-dev/pandas/issues/19526
    values = [data, NaT]
    frame = DataFrame({"a": [0, 1], "b": values})

    # Each group has exactly one row, so every op should return it unchanged.
    expected = DataFrame({"b": [data, NaT]}, index=Index([0, 1], name="a"))

    result = frame.groupby("a").aggregate(op)
    tm.assert_frame_equal(expected, result)

View File

@@ -0,0 +1,607 @@
"""
test all other .agg behavior
"""
from collections import OrderedDict
import datetime as dt
from functools import partial
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
PeriodIndex,
Series,
date_range,
period_range,
)
from pandas.core.groupby.groupby import SpecificationError
import pandas.util.testing as tm
from pandas.io.formats.printing import pprint_thing
def test_agg_api():
    """Check a custom function passed bare against the same one in a list."""
    # GH 6337
    # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
    # different api for agg when passed custom function with mixed frame
    frame = DataFrame(
        {
            "data1": np.random.randn(5),
            "data2": np.random.randn(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    by_key1 = frame.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    # The list form's columns are relabelled to the two data columns.
    expected = by_key1.agg([peak_to_peak])
    expected.columns = ["data1", "data2"]

    tm.assert_frame_equal(by_key1.agg(peak_to_peak), expected)
def test_agg_datetimes_mixed():
    """Check grouping on a date column held as strings and as ``date`` objects.

    Both frames contain one missing date. After aggregating with ``np.sum``
    the two results must have the same number of groups.
    """
    rows = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    def build_frame(records):
        # Turn [key, date, value] records into a three-column frame.
        return DataFrame(
            {
                "key": [rec[0] for rec in records],
                "date": [rec[1] for rec in records],
                "value": [rec[2] for rec in records],
            }
        )

    df1 = build_frame(rows)

    # Same records, with the date strings parsed into datetime.date objects.
    parsed_rows = [
        [
            key,
            (dt.datetime.strptime(day, "%Y-%m-%d").date() if day else None),
            value,
        ]
        for key, day, value in rows
    ]
    df2 = build_frame(parsed_rows)

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate(np.sum)

    # Weights for df2 come from df2's own values. The original read them from
    # df1, which only worked because the two value columns are identical.
    df2["weights"] = df2["value"] / df2["value"].sum()
    gb2 = df2.groupby("date").aggregate(np.sum)

    assert len(gb1) == len(gb2)
def test_agg_period_index():
    """Check a level-0 groupby sum and iteration on period-indexed frames."""
    monthly = period_range("2012-1-1", freq="M", periods=3)
    frame = DataFrame(np.random.randn(3, 2), index=monthly)
    summed = frame.groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    idx = period_range(start="1999-01", periods=5, freq="M")
    first = Series(np.random.rand(len(idx)), index=idx)
    second = Series(np.random.rand(len(idx)), index=idx)
    frame = DataFrame.from_dict(OrderedDict([("s1", first), ("s2", second)]))

    by_month = frame.groupby(frame.index.month)
    # Iterating over the groups must not raise.
    list(by_month)
def test_agg_dict_parameter_cast_result_dtypes():
    """Check first/last/len/size/count on a datetime column with missing rows.

    See GH 12821.
    """
    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="H"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # `first` and `last`: the expected rows are picked by position
    for how, rows in (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7])):
        exp = df.loc[rows].set_index("class")
        grouped = df.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    class_index = Index(list("ABCD"), name="class")
    group_sizes = pd.Series([2, 2, 2, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.agg(len), group_sizes)
    tm.assert_series_equal(grouped.time.size(), group_sizes)

    non_null_counts = pd.Series(
        [0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time"
    )
    tm.assert_series_equal(grouped.time.count(), non_null_counts)
def test_agg_cast_results_dtypes():
    """Check ``agg(len)`` against ``count()`` on a datetime column."""
    # similar to GH12821
    # xref #11444
    month_starts = [dt.datetime(2015, month + 1, 1) for month in range(12)]
    labels = list("aaabbbbbbccd")
    frame = pd.DataFrame({"X": labels, "Y": month_starts})

    via_len = frame.groupby("X")["Y"].agg(len)
    via_count = frame.groupby("X")["Y"].count()

    tm.assert_series_equal(via_len, via_count)
def test_aggregate_float64_no_int64():
    """Check the means of integer columns for one and two selected columns."""
    # see gh-11199
    frame = DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}
    )
    group_labels = [1, 2, 4, 5]
    group_means = [1, 2.5, 4, 5]

    # one selected column
    expected = DataFrame({"a": group_means}, index=group_labels)
    expected.index.name = "b"
    tm.assert_frame_equal(frame.groupby("b")[["a"]].mean(), expected)

    # two selected columns
    expected = DataFrame(
        {"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]
    )
    expected.index.name = "b"
    tm.assert_frame_equal(frame.groupby("b")[["a", "c"]].mean(), expected)
def test_aggregate_api_consistency():
    """Check that list, dict and dict-of-lists specs agree with each other.

    Every spec is compared with a frame assembled from the individual
    ``mean``/``sum`` Series of columns ``C`` and ``D`` (GH 9052).
    """
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    # list of names on a single column
    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of functions on the whole frame
    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # list of functions on a reordered column selection
    result = grouped[["D", "C"]].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict of column -> single name
    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict of column -> list of names
    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # This pair used to be overwritten below without ever being compared.
    tm.assert_frame_equal(result, expected, check_like=True)

    # dict whose keys are new names rather than columns
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})
    expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]])
    tm.assert_frame_equal(result, expected, check_like=True)
def test_agg_dict_renaming_deprecation():
    """Check the FutureWarning text for renaming dicts passed to ``agg``."""
    # 15931
    frame = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

    # nested renaming dict on a frame groupby
    nested_spec = {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w:
        frame.groupby("A").agg(nested_spec)
        first_message = str(w[0].message)
        assert "using a dict with renaming" in first_message
        assert "named aggregation" in first_message

    # key that is not one of the selected columns
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        frame.groupby("A")[["B", "C"]].agg({"ma": "max"})

    # dict on a series groupby
    with tm.assert_produces_warning(FutureWarning) as w:
        frame.groupby("A").B.agg({"foo": "count"})
        first_message = str(w[0].message)
        assert "using a dict on a Series for aggregation" in first_message
        assert "named aggregation instead." in first_message
def test_agg_compat():
    """Check dict renaming on a Series groupby, which must still warn."""
    # GH 12334
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )
    by_ab = frame.groupby(["A", "B"])

    # one new name mapped to a list of functions
    expected = pd.concat([by_ab["D"].sum(), by_ab["D"].std()], axis=1)
    expected.columns = MultiIndex.from_tuples([("C", "sum"), ("C", "std")])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = by_ab["D"].agg({"C": ["sum", "std"]})
    tm.assert_frame_equal(result, expected, check_like=True)

    # two new names, one function each
    expected = pd.concat([by_ab["D"].sum(), by_ab["D"].std()], axis=1)
    expected.columns = ["C", "D"]
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = by_ab["D"].agg({"C": "sum", "D": "std"})
    tm.assert_frame_equal(result, expected, check_like=True)
def test_agg_nested_dicts():
    """Check which nested renaming dicts raise and which only warn."""
    # API change for disallowing these types of nested dicts
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )
    by_ab = frame.groupby(["A", "B"])

    # outer keys are new names: rejected outright
    msg = r"cannot perform renaming for r[1-2] with a nested dictionary"
    with pytest.raises(SpecificationError, match=msg):
        by_ab.aggregate(
            {"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}
        )

    # outer keys are columns, inner keys are new names: allowed with a warning
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = by_ab.agg(
            {"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}
        )
    pieces = [
        by_ab["C"].mean(),
        by_ab["C"].std(),
        by_ab["D"].mean(),
        by_ab["D"].std(),
    ]
    expected = pd.concat(pieces, axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")]
    )
    tm.assert_frame_equal(result, expected, check_like=True)

    # same name as the original column
    # GH9052
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = by_ab["D"].agg({"result1": np.sum, "result2": np.mean})
    expected = expected.rename(columns={"result1": "D"})

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = by_ab["D"].agg({"D": np.sum, "result2": np.mean})
    tm.assert_frame_equal(result, expected, check_like=True)
def test_agg_item_by_item_raise_typeerror():
    """Check that a ``TypeError`` raised by the function reaches the caller."""
    frame = DataFrame(np.random.randint(10, size=(20, 10)))

    def raiseException(df):
        # Format the input first, then fail with a recognisable message.
        pprint_thing("----------------------------------------")
        pprint_thing(df.to_string())
        raise TypeError("test")

    by_first_column = frame.groupby(0)
    with pytest.raises(TypeError, match="test"):
        by_first_column.agg(raiseException)
def test_series_agg_multikey():
    """Check ``agg(np.sum)`` against ``sum()`` for a Series with two keys."""
    series = tm.makeTimeSeries()
    by_year_month = series.groupby(
        [lambda stamp: stamp.year, lambda stamp: stamp.month]
    )
    tm.assert_series_equal(by_year_month.agg(np.sum), by_year_month.sum())
def test_series_agg_multi_pure_python():
    """Check a function that inspects ``x.values.base`` on each group."""
    frame = DataFrame(
        {
            "A": ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3,
            "B": ["one"] * 3 + ["two"] + ["one"] * 3 + ["two"] * 3 + ["one"],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    def bad(x):
        # Each group's values must expose a non-empty ``base`` array.
        assert len(x.values.base) > 0
        return "foo"

    result = frame.groupby(["A", "B"]).agg(bad)
    expected = frame.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
def test_agg_consistency():
    """Check that ``agg(func)`` and ``agg([func])`` give matching values."""
    # agg with ([]) and () not consistent
    # GH 6715

    def P1(a):
        # First percentile of the non-null values; NaN on any failure.
        try:
            return np.percentile(a.dropna(), q=1)
        except Exception:
            return np.nan

    first_day = dt.date(2013, 2, 10)
    second_day = dt.date(2013, 2, 11)
    frame = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [first_day, first_day, second_day, second_day],
        }
    )
    by_date = frame.groupby("date")

    # Drop the function level so the list form lines up with the bare form.
    expected = by_date.agg([P1])
    expected.columns = expected.columns.levels[0]

    tm.assert_frame_equal(by_date.agg(P1), expected)
def test_agg_callables():
    """GH 7929: equivalent summing callables must aggregate identically."""
    frame = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class fn_class:
        def __call__(self, x):
            return sum(x)

    equiv_callables = (
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        fn_class(),
    )

    expected = frame.groupby("foo").agg(sum)
    for candidate in equiv_callables:
        tm.assert_frame_equal(frame.groupby("foo").agg(candidate), expected)
def test_agg_over_numpy_arrays():
    """GH 3788: summing a column of arrays adds them element-wise per group."""
    rows = [
        [1, np.array([10, 20, 30])],
        [1, np.array([40, 50, 60])],
        [2, np.array([20, 30, 40])],
    ]
    frame = pd.DataFrame(rows, columns=["category", "arraydata"])
    result = frame.groupby("category").agg(sum)

    expected = pd.DataFrame(
        [[np.array([50, 70, 90])], [np.array([20, 30, 40])]],
        index=pd.Index([1, 2], name="category"),
        columns=["arraydata"],
    )
    tm.assert_frame_equal(result, expected)
def test_agg_timezone_round_trip():
# GH 15426
ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
df = pd.DataFrame(
{"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]}
)
result1 = df.groupby("a")["b"].agg(np.min).iloc[0]
result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
result3 = df.groupby("a")["b"].min().iloc[0]
assert result1 == ts
assert result2 == ts
assert result3 == ts
dates = [
pd.Timestamp("2016-01-0{i:d} 12:00:00".format(i=i), tz="US/Pacific")
for i in range(1, 5)
]
df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates})
grouped = df.groupby("A")
ts = df["B"].iloc[0]
assert ts == grouped.nth(0)["B"].iloc[0]
assert ts == grouped.head(1)["B"].iloc[0]
assert ts == grouped.first()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0]
ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0]
def test_sum_uint64_overflow():
# see gh-14758
# Convert to uint64 and don't overflow
df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
df = df + 9223372036854775807
index = pd.Index(
[9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
)
expected = pd.DataFrame(
{1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
index=index,
)
expected.index.name = 0
result = df.groupby(0).sum()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Aggregating a frame with tuple/list builders collects group values."""
    frame = pd.DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )
    # The parametrized frames are built without index names; add them here.
    expected.index.names = ["A", "B"]

    result = frame.groupby(["A", "B"]).aggregate(structure)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    """Issue #18079: tuple/list builders work on a SeriesGroupBy as well."""
    frame = pd.DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )
    # The parametrized series are built without an index name; add it here.
    expected.index.name = "A"

    result = frame.groupby("A")["C"].aggregate(structure)
    tm.assert_series_equal(result, expected)
def test_agg_category_nansum(observed):
categories = ["a", "b", "c"]
df = pd.DataFrame(
{"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
)
result = df.groupby("A", observed=observed).B.agg(np.nansum)
expected = pd.Series(
[3, 3, 0],
index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
name="B",
)
if observed:
expected = expected[expected != 0]
tm.assert_series_equal(result, expected)
def test_agg_list_like_func():
    """GH 18473: an aggregator returning a list yields list-valued cells."""
    labels = [str(x) for x in range(3)]
    frame = pd.DataFrame({"A": list(labels), "B": list(labels)})

    grouped = frame.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})

    expected = pd.DataFrame(
        {"A": list(labels), "B": [[label] for label in labels]}
    )
    tm.assert_frame_equal(result, expected)
def test_agg_lambda_with_timezone():
    """GH 23683: a lambda aggregation keeps tz-aware timestamps intact."""
    first_day = pd.Timestamp("2018-01-01", tz="UTC")
    second_day = pd.Timestamp("2018-01-02", tz="UTC")
    frame = pd.DataFrame({"tag": [1, 1], "date": [first_day, second_day]})

    result = frame.groupby("tag").agg({"date": lambda e: e.head(1)})

    expected = pd.DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=pd.Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,104 @@
import numpy as np
import pytest
from pandas import DataFrame, MultiIndex
from pandas.util import testing as tm
@pytest.fixture
def mframe():
    """Random 10x3 frame indexed by a ("first", "second") MultiIndex."""
    level_values = [["foo", "bar", "baz", "qux"], ["one", "two", "three"]]
    level_codes = [[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]
    multi = MultiIndex(
        levels=level_values, codes=level_codes, names=["first", "second"]
    )
    return DataFrame(np.random.randn(10, 3), index=multi, columns=["A", "B", "C"])
@pytest.fixture
def df():
    """Eight-row frame: string keys "A"/"B", random float columns "C"/"D"."""
    key_a = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    key_b = ["one", "one", "two", "three", "two", "two", "one", "three"]
    return DataFrame(
        {"A": key_a, "B": key_b, "C": np.random.randn(8), "D": np.random.randn(8)}
    )
@pytest.fixture
def ts():
    """Time series produced by ``tm.makeTimeSeries()``."""
    series = tm.makeTimeSeries()
    return series
@pytest.fixture
def tsd():
    """Time-series data produced by ``tm.getTimeSeriesData()``."""
    data = tm.getTimeSeriesData()
    return data
@pytest.fixture
def tsframe(tsd):
    """DataFrame built from the ``tsd`` fixture."""
    frame = DataFrame(tsd)
    return frame
@pytest.fixture
def df_mixed_floats():
    """Like ``df``, but column "D" is float32 while "C" keeps the default."""
    key_a = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    key_b = ["one", "one", "two", "three", "two", "two", "one", "three"]
    col_c = np.random.randn(8)
    col_d = np.array(np.random.randn(8), dtype="float32")
    return DataFrame({"A": key_a, "B": key_b, "C": col_c, "D": col_d})
@pytest.fixture
def three_group():
    """Eleven-row frame: string keys "A"/"B"/"C", random columns "D"/"E"/"F"."""
    col_a = ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3
    col_b = ["one", "one", "one", "two", "one", "one"]
    col_b += ["one", "two", "two", "two", "one"]
    col_c = ["dull", "dull", "shiny", "dull", "dull", "shiny"]
    col_c += ["shiny", "dull", "shiny", "shiny", "shiny"]

    return DataFrame(
        {
            "A": col_a,
            "B": col_b,
            "C": col_c,
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

View File

@@ -0,0 +1,659 @@
from datetime import datetime
from io import StringIO
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, bdate_range
from pandas.util import testing as tm
def test_apply_issues():
    """``apply`` vs ``idxmax`` (GH 5788) and no date coercion (GH 5789)."""
    # GH 5788
    raw_csv = """2011.05.16,00:00,1.40893
2011.05.16,01:00,1.40760
2011.05.16,02:00,1.40750
2011.05.16,03:00,1.40649
2011.05.17,02:00,1.40893
2011.05.17,03:00,1.40760
2011.05.17,04:00,1.40750
2011.05.17,05:00,1.40649
2011.05.18,02:00,1.40893
2011.05.18,03:00,1.40760
2011.05.18,04:00,1.40750
2011.05.18,05:00,1.40649"""

    parsed = pd.read_csv(
        StringIO(raw_csv),
        header=None,
        names=["date", "time", "value"],
        parse_dates=[["date", "time"]],
    )
    parsed = parsed.set_index("date_time")

    expected = parsed.groupby(parsed.index.date).idxmax()
    result = parsed.groupby(parsed.index.date).apply(lambda x: x.idxmax())
    tm.assert_frame_equal(result, expected)

    # GH 5789
    # don't auto coerce dates
    plain = pd.read_csv(
        StringIO(raw_csv), header=None, names=["date", "time", "value"]
    )
    day_index = pd.Index(
        ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date"
    )
    expected = Series(["00:00", "02:00", "02:00"], index=day_index)
    result = plain.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()])
    tm.assert_series_equal(result, expected)
def test_apply_trivial():
    """GH 20066: an ``apply`` that ignores its input, returns a constant."""
    # trivial apply: ignore input and return a constant dataframe.
    frame = pd.DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    expected = pd.concat(
        [frame.iloc[1:], frame.iloc[1:]], axis=1, keys=["float64", "object"]
    )

    dtype_labels = [str(x) for x in frame.dtypes]
    result = frame.groupby(dtype_labels, axis=1).apply(lambda x: frame.iloc[1:])

    tm.assert_frame_equal(result, expected)
@pytest.mark.xfail(
    reason="GH#20066; function passed into apply "
    "returns a DataFrame with the same index "
    "as the one to create GroupBy object."
)
def test_apply_trivial_fail():
    """GH 20066: constant frame sharing the grouped index (expected failure)."""
    # trivial apply fails if the constant dataframe has the same index
    # with the one used to create GroupBy object.
    frame = pd.DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )

    expected = pd.concat([frame, frame], axis=1, keys=["float64", "object"])

    dtype_labels = [str(x) for x in frame.dtypes]
    result = frame.groupby(dtype_labels, axis=1).apply(lambda x: frame)

    tm.assert_frame_equal(result, expected)
def test_fast_apply():
# make sure that fast apply is correctly called
# rather than raising any kind of error
# otherwise the python path will be callsed
# which slows things down
N = 1000
labels = np.random.randint(0, 2000, size=N)
labels2 = np.random.randint(0, 3, size=N)
df = DataFrame(
{
"key": labels,
"key2": labels2,
"value1": np.random.randn(N),
"value2": ["foo", "bar", "baz", "qux"] * (N // 4),
}
)
def f(g):
return 1
g = df.groupby(["key", "key2"])
grouper = g.grouper
splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
group_keys = grouper._get_group_keys()
values, mutated = splitter.fast_apply(f, group_keys)
assert not mutated
@pytest.mark.parametrize(
    "df, group_names",
    [
        (DataFrame({"a": [1, 1, 1, 2, 3], "b": ["a", "a", "a", "b", "c"]}), [1, 2, 3]),
        (DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}), [0, 1]),
        (DataFrame({"a": [1]}), [1]),
        (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], "b": range(8)}), [1, 2]),
        (DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), [1, 2, 3]),
        (
            DataFrame(
                {
                    "a": list("aaabbbcccc"),
                    "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4],
                    "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8],
                }
            ),
            ["a", "b", "c"],
        ),
        (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), [1, 2]),
    ],
    ids=[
        "GH2936",
        "GH7739 & GH10519",
        "GH10519",
        "GH2656",
        "GH12155",
        "GH20084",
        "GH21417",
    ],
)
def test_group_apply_once_per_group(df, group_names):
    """The applied function must be evaluated exactly once per group.

    GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417.
    Previously the function was evaluated twice on the first group to check
    whether the Cython index slider is safe to use.  Each helper records the
    group name as a side effect so a double call shows up in ``names``.
    """
    names = []

    # cannot parameterize over the functions since they need external
    # `names` to detect side effects

    def f_copy(group):
        # this takes the fast apply path
        names.append(group.name)
        return group.copy()

    def f_nocopy(group):
        # this takes the slow apply path
        names.append(group.name)
        return group

    def f_scalar(group):
        # GH7739, GH2656
        names.append(group.name)
        return 0

    def f_none(group):
        # GH10519, GH12155, GH21417
        names.append(group.name)
        return None

    def f_constant_df(group):
        # GH2936, GH20084
        names.append(group.name)
        return DataFrame({"a": [1], "b": [1]})

    for func in (f_copy, f_nocopy, f_scalar, f_none, f_constant_df):
        # Empty the shared list in place; the closures keep referring to it.
        names.clear()
        df.groupby("a").apply(func)
        assert names == group_names
def test_apply_with_mixed_dtype():
    """GH3480 / GH 3610: dtypes survive row-wise apply and as_index=False."""
    # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
    mixed = DataFrame(
        {
            "foo1": np.random.randn(6),
            "foo2": ["one", "two", "two", "three", "one", "two"],
        }
    )
    tm.assert_series_equal(mixed.apply(lambda x: x, axis=1).dtypes, mixed.dtypes)

    # GH 3610 incorrect dtype conversion with as_index=False
    numeric = DataFrame({"c1": [1, 2, 6, 6, 8]})
    numeric["c2"] = numeric.c1 / 2.0
    via_reset = numeric.groupby("c2").mean().reset_index().c2
    via_flag = numeric.groupby("c2", as_index=False).mean().c2
    tm.assert_series_equal(via_reset, via_flag)
def test_groupby_as_index_apply(df):
# GH #4648 and #3417
df = DataFrame(
{
"item_id": ["b", "b", "a", "c", "a", "b"],
"user_id": [1, 2, 1, 1, 3, 1],
"time": range(6),
}
)
g_as = df.groupby("user_id", as_index=True)
g_not_as = df.groupby("user_id", as_index=False)
res_as = g_as.head(2).index
res_not_as = g_not_as.head(2).index
exp = Index([0, 1, 2, 4])
tm.assert_index_equal(res_as, exp)
tm.assert_index_equal(res_not_as, exp)
res_as_apply = g_as.apply(lambda x: x.head(2)).index
res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
# apply doesn't maintain the original ordering
# changed in GH5610 as the as_index=False returns a MI here
exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)])
tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None])
tm.assert_index_equal(res_as_apply, exp_as_apply)
tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
ind = Index(list("abcde"))
df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
res = df.groupby(0, as_index=False).apply(lambda x: x).index
tm.assert_index_equal(res, ind)
def test_apply_concat_preserve_names(three_group):
grouped = three_group.groupby(["A", "B"])
def desc(group):
result = group.describe()
result.index.name = "stat"
return result
def desc2(group):
result = group.describe()
result.index.name = "stat"
result = result[: len(group)]
# weirdo
return result
def desc3(group):
result = group.describe()
# names are different
result.index.name = "stat_{:d}".format(len(group))
result = result[: len(group)]
# weirdo
return result
result = grouped.apply(desc)
assert result.index.names == ("A", "B", "stat")
result2 = grouped.apply(desc2)
assert result2.index.names == ("A", "B", "stat")
result3 = grouped.apply(desc3)
assert result3.index.names == ("A", "B", None)
def test_apply_series_to_frame():
def f(piece):
with np.errstate(invalid="ignore"):
logged = np.log(piece)
return DataFrame(
{"value": piece, "demeaned": piece - piece.mean(), "logged": logged}
)
dr = bdate_range("1/1/2000", periods=100)
ts = Series(np.random.randn(100), index=dr)
grouped = ts.groupby(lambda x: x.month)
result = grouped.apply(f)
assert isinstance(result, DataFrame)
tm.assert_index_equal(result.index, ts.index)
def test_apply_series_yield_constant(df):
result = df.groupby(["A", "B"])["C"].apply(len)
assert result.index.names[:2] == ("A", "B")
def test_apply_frame_yield_constant(df):
# GH13568
result = df.groupby(["A", "B"]).apply(len)
assert isinstance(result, Series)
assert result.name is None
result = df.groupby(["A", "B"])[["C", "D"]].apply(len)
assert isinstance(result, Series)
assert result.name is None
def test_apply_frame_to_series(df):
grouped = df.groupby(["A", "B"])
result = grouped.apply(len)
expected = grouped.count()["C"]
tm.assert_index_equal(result.index, expected.index)
tm.assert_numpy_array_equal(result.values, expected.values)
def test_apply_frame_concat_series():
    """Series returned from a frame ``apply`` concatenate and keep their name."""

    def trans(group):
        return group.groupby("B")["C"].sum().sort_values()[:2]

    def trans2(group):
        # ``frame`` is bound below, before this helper is first called.
        keys = frame.reindex(group.index)["B"]
        return group.groupby(keys).sum().sort_values()[:2]

    frame = DataFrame(
        {
            "A": np.random.randint(0, 5, 1000),
            "B": np.random.randint(0, 5, 1000),
            "C": np.random.randn(1000),
        }
    )

    result = frame.groupby("A").apply(trans)
    exp = frame.groupby("A")["C"].apply(trans2)

    tm.assert_series_equal(result, exp, check_names=False)
    assert result.name == "C"
def test_apply_transform(ts):
grouped = ts.groupby(lambda x: x.month)
result = grouped.apply(lambda x: x * 2)
expected = grouped.transform(lambda x: x * 2)
tm.assert_series_equal(result, expected)
def test_apply_multikey_corner(tsframe):
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
def f(group):
return group.sort_values("A")[-5:]
result = grouped.apply(f)
for key, group in grouped:
tm.assert_frame_equal(result.loc[key], f(group))
def test_apply_chunk_view():
    """Taking the first two rows of every chunk must behave like ``take``."""
    # Low level tinkering could be unsafe, make sure not
    frame = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

    first_two = frame.groupby("key", group_keys=False).apply(lambda x: x[:2])

    tm.assert_frame_equal(first_two, frame.take([0, 1, 3, 4, 6, 7]))
def test_apply_no_name_column_conflict():
    """#2605: columns called "name"/"name2" must not break ``apply``."""
    frame = DataFrame(
        {
            "name": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
            "name2": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
            "value": range(9, -1, -1),
        }
    )

    # it works! #2605
    by_names = frame.groupby(["name", "name2"])
    by_names.apply(lambda x: x.sort_values("value", inplace=True))
def test_apply_typecast_fail():
df = DataFrame(
{
"d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
"c": np.tile(["a", "b", "c"], 2),
"v": np.arange(1.0, 7.0),
}
)
def f(group):
v = group["v"]
group["v2"] = (v - v.min()) / (v.max() - v.min())
return group
result = df.groupby("d").apply(f)
expected = df.copy()
expected["v2"] = np.tile([0.0, 0.5, 1], 2)
tm.assert_frame_equal(result, expected)
def test_apply_multiindex_fail():
index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
df = DataFrame(
{
"d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
"c": np.tile(["a", "b", "c"], 2),
"v": np.arange(1.0, 7.0),
},
index=index,
)
def f(group):
v = group["v"]
group["v2"] = (v - v.min()) / (v.max() - v.min())
return group
result = df.groupby("d").apply(f)
expected = df.copy()
expected["v2"] = np.tile([0.0, 0.5, 1], 2)
tm.assert_frame_equal(result, expected)
def test_apply_corner(tsframe):
result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
expected = tsframe * 2
tm.assert_frame_equal(result, expected)
def test_apply_without_copy():
    """GH 5545: returning the chunk itself must equal returning a copy."""
    # returning a non-copy in an applied function fails
    data = DataFrame(
        {
            "id_field": [100, 100, 200, 300],
            "category": ["a", "b", "c", "c"],
            "value": [1, 2, 3, 4],
        }
    )

    def filt1(x):
        # Single-row chunks are returned as a copy.
        if x.shape[0] != 1:
            return x[x.category == "c"]
        return x.copy()

    def filt2(x):
        # Single-row chunks are returned as-is (no copy).
        if x.shape[0] != 1:
            return x[x.category == "c"]
        return x

    expected = data.groupby("id_field").apply(filt1)
    result = data.groupby("id_field").apply(filt2)
    tm.assert_frame_equal(result, expected)
def test_apply_corner_cases():
    """#535: a column added to each chunk is present in the applied result."""
    # #535, can't use sliding iterator
    N = 1000
    labels = np.random.randint(0, 100, size=N)
    frame = DataFrame(
        {
            "key": labels,
            "value1": np.random.randn(N),
            "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
        }
    )

    def f(g):
        g["value3"] = g["value1"] * 2
        return g

    result = frame.groupby("key").apply(f)
    assert "value3" in result
def test_apply_numeric_coercion_when_datetime():
    """Datetime columns must not trigger numeric coercion of other columns.

    In the past, group-by/apply operations have been over-eager in
    converting dtypes to numeric in the presence of datetime columns.
    The reproductions of the GH issues filed for that are collected here.
    """
    # GH 15670
    frame = pd.DataFrame(
        {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]}
    )
    expected = frame.groupby(["Number"]).apply(lambda x: x.iloc[0])
    frame.Date = pd.to_datetime(frame.Date)
    result = frame.groupby(["Number"]).apply(lambda x: x.iloc[0])
    tm.assert_series_equal(result["Str"], expected["Str"])

    # GH 15421
    frame = pd.DataFrame(
        {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
    )

    def get_B(g):
        return g.iloc[0][["B"]]

    result = frame.groupby("A").apply(get_B)["B"]
    expected = frame.B
    expected.index = frame.A
    tm.assert_series_equal(result, expected)

    # GH 14423
    def predictions(tool):
        out = pd.Series(index=["p1", "p2", "useTime"], dtype=object)
        if "step1" in list(tool.State):
            out["p1"] = str(tool[tool.State == "step1"].Machine.values[0])
        if "step2" in list(tool.State):
            second_step = tool[tool.State == "step2"]
            out["p2"] = str(second_step.Machine.values[0])
            out["useTime"] = str(second_step.oTime.values[0])
        return out

    as_strings = pd.DataFrame(
        {
            "Key": ["B", "B", "A", "A"],
            "State": ["step1", "step2", "step1", "step2"],
            "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"],
            "Machine": ["23", "36L", "36R", "36R"],
        }
    )
    as_datetimes = as_strings.copy()
    as_datetimes.oTime = pd.to_datetime(as_datetimes.oTime)

    expected = as_strings.groupby("Key").apply(predictions).p1
    result = as_datetimes.groupby("Key").apply(predictions).p1
    tm.assert_series_equal(expected, result)
def test_time_field_bug():
    """GH 11324: time columns not returned by the applied function.

    When non-key fields in a group-by dataframe contained time-based fields
    that were not returned by the apply function, an exception would be
    raised.
    """
    frame = pd.DataFrame({"a": 1, "b": [datetime.now() for _ in range(10)]})

    def func_with_no_date(batch):
        return pd.Series({"c": 2})

    def func_with_date(batch):
        return pd.Series({"b": datetime(2015, 1, 1), "c": 2})

    without_date = frame.groupby(by=["a"]).apply(func_with_no_date)
    without_date_expected = pd.DataFrame({"c": 2}, index=[1])
    without_date_expected.index.name = "a"

    with_date = frame.groupby(by=["a"]).apply(func_with_date)
    with_date_expected = pd.DataFrame(
        {"b": datetime(2015, 1, 1), "c": 2}, index=[1]
    )
    with_date_expected.index.name = "a"

    tm.assert_frame_equal(without_date, without_date_expected)
    tm.assert_frame_equal(with_date, with_date_expected)
def test_gb_apply_list_of_unequal_len_arrays():
    """GH1738: ``apply`` must cope with arrays of unequal length."""
    frame = DataFrame(
        {
            "group1": ["a", "a", "a", "b", "b", "b", "a", "a", "a", "b", "b", "b"],
            "group2": ["c", "c", "d", "d", "d", "e", "c", "c", "d", "d", "d", "e"],
            "weight": [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
            "value": [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3],
        }
    ).set_index(["group1", "group2"])
    by_levels = frame.groupby(level=["group1", "group2"], sort=True)

    def noddy(value, weight):
        return np.array(value * weight).repeat(3)

    # the kernel function returns arrays of unequal length
    # pandas sniffs the first one, sees it's an array and not
    # a list, and assumed the rest are of equal length
    # and so tries a vstack

    # don't die
    by_levels.apply(lambda x: noddy(x.value, x.weight))
def test_groupby_apply_all_none():
    """Issue 9684: an ``apply`` returning only None gives an empty frame."""
    # Tests to make sure no errors if apply function returns all None
    # values.
    frame = DataFrame({"groups": [0, 0, 1, 1], "random_vars": [8, 7, 4, 5]})

    def test_func(x):
        pass

    outcome = frame.groupby("groups").apply(test_func)
    tm.assert_frame_equal(outcome, DataFrame())
def test_groupby_apply_none_first():
    """GH 12824: None from the first (or last) group must not derail apply."""
    none_last = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
    none_first = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})

    def test_func(x):
        # Groups with fewer than two rows contribute nothing.
        if x.shape[0] < 2:
            return None
        return x.iloc[[0, -1]]

    result1 = none_last.groupby("groups").apply(test_func)
    result2 = none_first.groupby("groups").apply(test_func)

    names = ["groups", None]
    expected1 = DataFrame(
        {"groups": [1, 1], "vars": [0, 2]},
        index=MultiIndex.from_arrays([[1, 1], [0, 2]], names=names),
    )
    expected2 = DataFrame(
        {"groups": [2, 2], "vars": [1, 3]},
        index=MultiIndex.from_arrays([[2, 2], [1, 3]], names=names),
    )

    tm.assert_frame_equal(result1, expected1)
    tm.assert_frame_equal(result2, expected2)
def test_groupby_apply_return_empty_chunk():
    """GH 22221: a filtering ``apply`` may return empty chunks for groups."""
    frame = pd.DataFrame({"value": [0, 1], "group": ["filled", "empty"]})

    by_group = frame.groupby("group")
    result = by_group.apply(lambda group: group[group.value != 1]["value"])

    # Build the full product index, then drop the group that came back empty.
    full_index = MultiIndex.from_product(
        [["empty", "filled"], [0]], names=["group", None]
    )
    expected = pd.Series([0], name="value", index=full_index.drop("empty"))

    tm.assert_series_equal(result, expected)
def test_apply_with_mixed_types():
# gh-20949
df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
g = df.groupby("A")
result = g.transform(lambda x: x / x.sum())
expected = pd.DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})
tm.assert_frame_equal(result, expected)
result = g.apply(lambda x: x / x.sum())
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,152 @@
import numpy as np
from numpy import nan
import pytest
from pandas._libs import groupby, lib, reduction
from pandas.core.dtypes.common import ensure_int64
from pandas import Index, isna
from pandas.core.groupby.ops import generate_bins_generic
import pandas.util.testing as tm
from pandas.util.testing import assert_almost_equal
def test_series_grouper():
    """``SeriesGrouper`` with ``np.mean``: rows labelled -1 are left out."""
    from pandas import Series

    obj = Series(np.random.randn(10))
    dummy = obj[:0]

    # Three rows labelled -1, then groups 0 and 1.
    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

    result, counts = reduction.SeriesGrouper(
        obj, np.mean, labels, 2, dummy
    ).get_result()

    assert_almost_equal(result, np.array([obj[3:6].mean(), obj[6:].mean()]))
    assert_almost_equal(counts, np.array([3, 4], dtype=np.int64))
def test_series_bin_grouper():
    """``SeriesBinGrouper`` with ``np.mean`` over bin edges [3, 6]."""
    from pandas import Series

    obj = Series(np.random.randn(10))
    dummy = obj[:0]
    bins = np.array([3, 6])

    result, counts = reduction.SeriesBinGrouper(
        obj, np.mean, bins, dummy
    ).get_result()

    segments = (obj[:3], obj[3:6], obj[6:])
    assert_almost_equal(result, np.array([seg.mean() for seg in segments]))
    assert_almost_equal(counts, np.array([3, 3, 4], dtype=np.int64))
class TestBinGroupers:
    """Tests for the bin-edge generators."""

    def setup_method(self, method):
        self.obj = np.random.randn(10, 1)
        self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64)
        self.bins = np.array([3, 6], dtype=np.int64)

    def test_generate_bins(self):
        """Check left/right closed bins and the ValueError cases."""
        bin_funcs = [lib.generate_bins_dt64, generate_bins_generic]

        values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
        binner = np.array([0, 3, 6, 9], dtype=np.int64)
        for func in bin_funcs:
            bins = func(values, binner, closed="left")
            assert (bins == np.array([2, 5, 6])).all()

            bins = func(values, binner, closed="right")
            assert (bins == np.array([3, 6, 6])).all()

        # Same values with a shorter binner; also used by the error cases.
        values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
        binner = np.array([0, 3, 6], dtype=np.int64)
        for func in bin_funcs:
            bins = func(values, binner, closed="right")
            assert (bins == np.array([3, 6])).all()

        bad_calls = [
            ("Invalid length for values or for binner", (values, [], "right")),
            (
                "Invalid length for values or for binner",
                (values[:0], binner, "right"),
            ),
            ("Values falls before first bin", (values, [4], "right")),
            ("Values falls after last bin", (values, [-3, -1], "right")),
        ]
        for msg, args in bad_calls:
            with pytest.raises(ValueError, match=msg):
                generate_bins_generic(*args)
def test_group_ohlc():
    """``group_ohlc_<dtype>`` matches a hand-rolled open/high/low/close."""

    def _check(dtype):
        obj = np.array(np.random.randn(20), dtype=dtype)

        bins = np.array([6, 12, 20])
        out = np.zeros((3, 4), dtype)
        counts = np.zeros(len(out), dtype=np.int64)
        # Expand the bin edges into one integer label per observation.
        labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))

        func = getattr(groupby, "group_ohlc_{dtype}".format(dtype=dtype))
        func(out, counts, obj[:, None], labels)

        def _ohlc(group):
            if isna(group).all():
                return np.repeat(nan, 4)
            return [group[0], group.max(), group.min(), group[-1]]

        chunks = (obj[:6], obj[6:12], obj[12:])
        expected = np.array([_ohlc(chunk) for chunk in chunks])

        assert_almost_equal(out, expected)
        tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))

        # Blank out the first group and check it comes back as all-NaN.
        obj[:6] = nan
        func(out, counts, obj[:, None], labels)
        expected[0] = nan
        assert_almost_equal(out, expected)

    for dtype in ("float32", "float64"):
        _check(dtype)
class TestMoments:
    """Empty placeholder class; it defines no tests."""

    pass
class TestReducer:
    """Tests for ``reduction.reduce``."""

    def test_int_index(self):
        """``np.sum`` along either axis, with and without a dummy series."""
        from pandas.core.series import Series

        arr = np.random.randn(100, 4)

        # No dummy: default axis, then axis=1.
        by_column = reduction.reduce(arr, np.sum, labels=Index(np.arange(4)))
        assert_almost_equal(by_column, arr.sum(0))

        by_row = reduction.reduce(
            arr, np.sum, axis=1, labels=Index(np.arange(100))
        )
        assert_almost_equal(by_row, arr.sum(1))

        # With a dummy series, default axis.
        dummy = Series(0.0, index=np.arange(100))
        by_column = reduction.reduce(
            arr, np.sum, dummy=dummy, labels=Index(np.arange(4))
        )
        assert_almost_equal(by_column, arr.sum(0))

        # With a dummy series, axis=1.
        dummy = Series(0.0, index=np.arange(4))
        by_row = reduction.reduce(
            arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
        )
        expected = arr.sum(1)
        assert_almost_equal(by_row, expected)

        # The same call is made a second time with the same dummy.
        by_row = reduction.reduce(
            arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
        )
        assert_almost_equal(by_row, expected)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,225 @@
from itertools import product
import numpy as np
import pytest
from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestCounting:
def test_cumcount(self):
df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3])
assert_series_equal(expected, g.cumcount())
assert_series_equal(expected, sg.cumcount())
def test_cumcount_empty(self):
ge = DataFrame().groupby(level=0)
se = Series().groupby(level=0)
# edge case, as this is usually considered float
e = Series(dtype="int64")
assert_series_equal(e, ge.cumcount())
assert_series_equal(e, se.cumcount())
def test_cumcount_dupe_index(self):
df = DataFrame(
[["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
)
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
assert_series_equal(expected, g.cumcount())
assert_series_equal(expected, sg.cumcount())
def test_cumcount_mi(self):
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=mi)
assert_series_equal(expected, g.cumcount())
assert_series_equal(expected, sg.cumcount())
def test_cumcount_groupby_not_col(self):
df = DataFrame(
[["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
)
g = df.groupby([0, 0, 0, 1, 0])
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
assert_series_equal(expected, g.cumcount())
assert_series_equal(expected, sg.cumcount())
def test_ngroup(self):
df = DataFrame({"A": list("aaaba")})
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0])
assert_series_equal(expected, g.ngroup())
assert_series_equal(expected, sg.ngroup())
def test_ngroup_distinct(self):
df = DataFrame({"A": list("abcde")})
g = df.groupby("A")
sg = g.A
expected = Series(range(5), dtype="int64")
assert_series_equal(expected, g.ngroup())
assert_series_equal(expected, sg.ngroup())
def test_ngroup_one_group(self):
df = DataFrame({"A": [0] * 5})
g = df.groupby("A")
sg = g.A
expected = Series([0] * 5)
assert_series_equal(expected, g.ngroup())
assert_series_equal(expected, sg.ngroup())
def test_ngroup_empty(self):
ge = DataFrame().groupby(level=0)
se = Series().groupby(level=0)
# edge case, as this is usually considered float
e = Series(dtype="int64")
assert_series_equal(e, ge.ngroup())
assert_series_equal(e, se.ngroup())
def test_ngroup_series_matches_frame(self):
df = DataFrame({"A": list("aaaba")})
s = Series(list("aaaba"))
assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
def test_ngroup_dupe_index(self):
df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
assert_series_equal(expected, g.ngroup())
assert_series_equal(expected, sg.ngroup())
def test_ngroup_mi(self):
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
df = DataFrame({"A": list("aaaba")}, index=mi)
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=mi)
assert_series_equal(expected, g.ngroup())
assert_series_equal(expected, sg.ngroup())
def test_ngroup_groupby_not_col(self):
df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
g = df.groupby([0, 0, 0, 1, 0])
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
assert_series_equal(expected, g.ngroup())
assert_series_equal(expected, sg.ngroup())
def test_ngroup_descending(self):
df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
g = df.groupby(["A"])
ascending = Series([0, 0, 1, 0, 1])
descending = Series([1, 1, 0, 1, 0])
assert_series_equal(descending, (g.ngroups - 1) - ascending)
assert_series_equal(ascending, g.ngroup(ascending=True))
assert_series_equal(descending, g.ngroup(ascending=False))
def test_ngroup_matches_cumcount(self):
# verify one manually-worked out case works
df = DataFrame(
[["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
columns=["A", "X"],
)
g = df.groupby(["A", "X"])
g_ngroup = g.ngroup()
g_cumcount = g.cumcount()
expected_ngroup = Series([0, 1, 2, 0, 3])
expected_cumcount = Series([0, 0, 0, 1, 0])
assert_series_equal(g_ngroup, expected_ngroup)
assert_series_equal(g_cumcount, expected_cumcount)
def test_ngroup_cumcount_pair(self):
# brute force comparison for all small series
for p in product(range(3), repeat=4):
df = DataFrame({"a": p})
g = df.groupby(["a"])
order = sorted(set(p))
ngroupd = [order.index(val) for val in p]
cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
assert_series_equal(g.ngroup(), Series(ngroupd))
assert_series_equal(g.cumcount(), Series(cumcounted))
def test_ngroup_respects_groupby_order(self):
np.random.seed(0)
df = DataFrame({"a": np.random.choice(list("abcdef"), 100)})
for sort_flag in (False, True):
g = df.groupby(["a"], sort=sort_flag)
df["group_id"] = -1
df["group_index"] = -1
for i, (_, group) in enumerate(g):
df.loc[group.index, "group_id"] = i
for j, ind in enumerate(group.index):
df.loc[ind, "group_index"] = j
assert_series_equal(Series(df["group_id"].values), g.ngroup())
assert_series_equal(Series(df["group_index"].values), g.cumcount())
@pytest.mark.parametrize(
    "datetimelike",
    [
        # timezone-aware timestamps
        [
            Timestamp("2016-05-{i:02d} 20:09:25+00:00".format(i=day))
            for day in range(1, 4)
        ],
        # naive timestamps
        [
            Timestamp("2016-05-{i:02d} 20:09:25".format(i=day))
            for day in range(1, 4)
        ],
        # timedeltas
        [Timedelta(hours, unit="h") for hours in range(1, 4)],
        # periods
        [Period(freq="2W", year=2017, month=month) for month in range(1, 4)],
    ],
)
def test_count_with_datetimelike(self, datetimelike):
    """count() works when the counted column holds datetime-like values."""
    # test for #13393, where DataframeGroupBy.count() fails
    # when counting a datetimelike column.
    frame = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
    counted = frame.groupby("x").count()
    # Two rows are keyed "a" and one is keyed "b".
    expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
    expected.index.name = "x"
    assert_frame_equal(expected, counted)
def test_count_with_only_nans_in_first_group(self):
# GH21956
df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
result = df.groupby(["A", "B"]).C.count()
mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
expected = Series([], index=mi, dtype=np.int64, name="C")
assert_series_equal(result, expected, check_index_type=False)

View File

@@ -0,0 +1,597 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import pandas.util.testing as tm
def test_filter_series():
    """SeriesGroupBy.filter keeps or drops whole groups based on a predicate."""
    values = pd.Series([1, 3, 20, 5, 22, 24, 7])
    by_parity = values.groupby(values.apply(lambda v: v % 2))
    odd_rows = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    even_rows = pd.Series([20, 22, 24], index=[2, 4, 5])

    def small_mean(group):
        return group.mean() < 10

    def large_mean(group):
        return group.mean() > 10

    tm.assert_series_equal(by_parity.filter(small_mean), odd_rows)
    tm.assert_series_equal(by_parity.filter(large_mean), even_rows)
    # Test dropna=False: the expected result is reindexed to the full index,
    # so the rejected rows appear as NaN.
    tm.assert_series_equal(
        by_parity.filter(small_mean, dropna=False), odd_rows.reindex(values.index)
    )
    tm.assert_series_equal(
        by_parity.filter(large_mean, dropna=False), even_rows.reindex(values.index)
    )
def test_filter_single_column_df():
    """DataFrameGroupBy.filter on a one-column frame mirrors the Series case."""
    frame = pd.DataFrame([1, 3, 20, 5, 22, 24, 7])
    by_parity = frame.groupby(frame[0].apply(lambda v: v % 2))
    odd_rows = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
    even_rows = pd.DataFrame([20, 22, 24], index=[2, 4, 5])

    def small_mean(group):
        return group.mean() < 10

    def large_mean(group):
        return group.mean() > 10

    tm.assert_frame_equal(by_parity.filter(small_mean), odd_rows)
    tm.assert_frame_equal(by_parity.filter(large_mean), even_rows)
    # Test dropna=False: the expected result is reindexed to the full index,
    # so the rejected rows appear as NaN.
    tm.assert_frame_equal(
        by_parity.filter(small_mean, dropna=False), odd_rows.reindex(frame.index)
    )
    tm.assert_frame_equal(
        by_parity.filter(large_mean, dropna=False), even_rows.reindex(frame.index)
    )
def test_filter_multi_column_df():
    """The filter predicate may combine several columns of each group."""
    frame = pd.DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
    by_parity = frame.groupby(frame["A"].apply(lambda v: v % 2))

    def a_exceeds_b_by_ten(group):
        return group["A"].sum() - group["B"].sum() > 10

    # Only the even-"A" group (rows 1 and 2) satisfies 24 - 2 > 10.
    expected = pd.DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
    tm.assert_frame_equal(by_parity.filter(a_exceeds_b_by_ten), expected)
def test_filter_mixed_df():
    """filter works on a frame that mixes numeric and string columns."""
    frame = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    by_parity = frame.groupby(frame["A"].apply(lambda v: v % 2))
    kept = by_parity.filter(lambda group: group["A"].sum() > 10)
    expected = pd.DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
    tm.assert_frame_equal(kept, expected)
def test_filter_out_all_groups():
    """A predicate no group satisfies yields an empty Series / DataFrame."""
    # Series case
    values = pd.Series([1, 3, 20, 5, 22, 24, 7])
    series_groups = values.groupby(values.apply(lambda v: v % 2))
    kept_values = series_groups.filter(lambda group: group.mean() > 1000)
    tm.assert_series_equal(kept_values, values[[]])

    # DataFrame case
    frame = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    frame_groups = frame.groupby(frame["A"].apply(lambda v: v % 2))
    kept_rows = frame_groups.filter(lambda group: group["A"].sum() > 1000)
    tm.assert_frame_equal(kept_rows, frame.loc[[]])
def test_filter_out_no_groups():
    """A predicate every group satisfies returns the input unchanged."""
    # Series case
    values = pd.Series([1, 3, 20, 5, 22, 24, 7])
    series_groups = values.groupby(values.apply(lambda v: v % 2))
    tm.assert_series_equal(
        series_groups.filter(lambda group: group.mean() > 0), values
    )

    # DataFrame case
    frame = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
    frame_groups = frame.groupby(frame["A"].apply(lambda v: v % 2))
    tm.assert_frame_equal(
        frame_groups.filter(lambda group: group["A"].mean() > 0), frame
    )
def test_filter_out_all_groups_in_df():
    """Rejecting every group gives all-NaN (dropna=False) or empty (dropna=True)."""
    # GH12768

    def b_sum_above_five(group):
        return group["b"].sum() > 5

    # dropna=False keeps the shape and blanks out every row.
    frame = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    kept = frame.groupby("a").filter(b_sum_above_five, dropna=False)
    all_nan = pd.DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
    tm.assert_frame_equal(all_nan, kept)

    # dropna=True removes every row but keeps the integer columns.
    frame = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
    kept = frame.groupby("a").filter(b_sum_above_five, dropna=True)
    empty = pd.DataFrame({"a": [], "b": []}, dtype="int64")
    tm.assert_frame_equal(empty, kept)
def test_filter_condition_raises():
    """An exception raised inside the predicate surfaces as a TypeError."""

    def raise_if_sum_is_zero(x):
        total = x.sum()
        if total == 0:
            raise ValueError
        return total > 0

    values = pd.Series([-1, 0, 1, 2])
    # The odd group is [-1, 1], whose sum is zero, so the predicate raises.
    by_parity = values.groupby(values.apply(lambda v: v % 2))
    expected_message = "the filter must return a boolean result"
    with pytest.raises(TypeError, match=expected_message):
        by_parity.filter(raise_if_sum_is_zero)
def test_filter_with_axis_in_groupby():
    """filter honours axis=1 when the groups are blocks of columns."""
    # issue 11041
    columns = pd.MultiIndex.from_product([range(10), [0, 1]])
    values = np.arange(100).reshape(-1, 20)
    data = pd.DataFrame(values, columns=columns, dtype="int64")
    by_outer_level = data.groupby(level=0, axis=1)
    kept = by_outer_level.filter(lambda block: block.iloc[0, 0] > 10)
    # The first row is 0..19, so only the column pairs starting at 12 qualify.
    tm.assert_frame_equal(kept, data.iloc[:, 12:20])
def test_filter_bad_shapes():
    """Predicates returning non-scalar results are rejected with TypeError."""
    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    s = df["B"]
    g_df = df.groupby("B")
    g_s = s.groupby(s)

    series_message = "the filter must return a boolean result"
    frame_message = "filter function returned a DataFrame, but expected a scalar bool"
    # Each case pairs a misbehaving predicate with the message expected from
    # the DataFrame group-by; the Series group-by always reports the same one.
    cases = [
        (lambda x: x, frame_message),
        (lambda x: x == 1, frame_message),
        (lambda x: np.outer(x, x), "can't multiply sequence by non-int of type 'str'"),
    ]
    for predicate, message_for_frame in cases:
        with pytest.raises(TypeError, match=message_for_frame):
            g_df.filter(predicate)
        with pytest.raises(TypeError, match=series_message):
            g_s.filter(predicate)
def test_filter_nan_is_false():
    """A predicate returning NaN is treated as False, so nothing is kept."""
    frame = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    labels = frame["B"]
    frame_groups = frame.groupby(frame["B"])
    series_groups = labels.groupby(labels)

    def always_nan(group):
        return np.nan

    tm.assert_frame_equal(frame_groups.filter(always_nan), frame.loc[[]])
    tm.assert_series_equal(series_groups.filter(always_nan), labels[[]])
def test_filter_against_workaround():
    """filter must match the older transform-then-boolean-index workaround."""
    from string import ascii_lowercase

    def check_series(ser):
        # Group by the value rounded to the nearest ten; keep groups whose
        # mean exceeds 10, once through each code path.
        grouped = ser.groupby(ser.apply(lambda v: np.round(v, -1)))
        keep = lambda group: group.mean() > 10  # noqa: E731
        via_transform = ser[grouped.transform(keep).astype("bool")]
        via_filter = grouped.filter(keep)
        tm.assert_series_equal(via_filter.sort_values(), via_transform.sort_values())

    np.random.seed(0)
    # Series of ints
    check_series(Series(np.random.randint(0, 100, 1000)))
    # Series of floats
    check_series(100 * Series(np.random.random(1000)))

    # Set up DataFrame of ints, floats, strings.
    N = 1000
    alphabet = np.array(list(ascii_lowercase))
    random_letters = alphabet.take(np.random.randint(0, 26, N))
    df = DataFrame(
        {
            "ints": Series(np.random.randint(0, 100, N)),
            "floats": N / 10 * Series(np.random.random(N)),
            "letters": Series(random_letters),
        }
    )

    # Group by ints; filter on floats.
    by_ints = df.groupby("ints")
    float_mask = by_ints.floats.transform(lambda x: x.mean() > N / 20)
    via_transform = df[float_mask.astype("bool")]
    via_filter = by_ints.filter(lambda x: x["floats"].mean() > N / 20)
    tm.assert_frame_equal(via_filter, via_transform)

    # Group by floats (rounded); filter on strings.
    by_floats = df.groupby(df.floats.apply(lambda x: np.round(x, -1)))
    letter_mask = by_floats.letters.transform(lambda x: len(x) < N / 10)
    via_transform = df[letter_mask.astype("bool")]
    via_filter = by_floats.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(via_filter, via_transform)

    # Group by strings; filter on ints.
    by_letters = df.groupby("letters")
    int_mask = by_letters.ints.transform(lambda x: x.mean() > N / 20)
    via_transform = df[int_mask.astype("bool")]
    via_filter = by_letters.filter(lambda x: x["ints"].mean() > N / 20)
    tm.assert_frame_equal(via_filter, via_transform)
def test_filter_using_len():
    """A predicate based on len(group) selects groups by their size."""
    # BUG GH4447
    frame = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
    frame_groups = frame.groupby("B")

    # Only the "b" group (rows 2..5) has more than two rows.
    kept = frame_groups.filter(lambda group: len(group) > 2)
    only_b = DataFrame(
        {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
        index=np.arange(2, 6),
    )
    tm.assert_frame_equal(kept, only_b)

    # No group has more than four rows.
    kept = frame_groups.filter(lambda group: len(group) > 4)
    tm.assert_frame_equal(kept, frame.loc[[]])

    # Series have always worked properly, but we'll test anyway.
    labels = frame["B"]
    series_groups = labels.groupby(labels)

    kept = series_groups.filter(lambda group: len(group) > 2)
    tm.assert_series_equal(
        kept, Series(["b"] * 4, index=np.arange(2, 6), name="B")
    )

    kept = series_groups.filter(lambda group: len(group) > 4)
    tm.assert_series_equal(kept, labels[[]])
def test_filter_maintains_ordering():
    """filter returns the surviving rows in their original row order."""
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
    )
    # Row positions whose tag occurs more than once (45 and 62).
    surviving_positions = [1, 2, 4, 7]

    def more_than_one(group):
        return len(group) > 1

    def check_current_index():
        # Frame and Series paths must both keep the positional order,
        # whatever labels the index currently carries.
        pids = df["pid"]
        kept_frame = df.groupby("tag").filter(more_than_one)
        tm.assert_frame_equal(kept_frame, df.iloc[surviving_positions])
        kept_series = pids.groupby(df["tag"]).filter(more_than_one)
        tm.assert_series_equal(kept_series, pids.iloc[surviving_positions])

    # Simple case: index is sequential. #4621
    check_current_index()

    # Now index is sequentially decreasing.
    df.index = np.arange(len(df) - 1, -1, -1)
    check_current_index()

    # Index is shuffled.
    SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
    df.index = df.index[SHUFFLED]
    check_current_index()
def test_filter_multiple_timestamp():
    """filter/transform work when one of several group keys is a Timestamp."""
    # GH 10114
    frame = DataFrame(
        {
            "A": np.arange(5, dtype="int64"),
            "B": ["foo", "bar", "foo", "bar", "bar"],
            "C": Timestamp("20130101"),
        }
    )
    grouped = frame.groupby(["B", "C"])
    # "foo" has 2 rows (A sum 2); "bar" has 3 rows (A sum 8).
    group_sizes = [2, 3, 2, 3, 3]

    def keep_everything(group):
        return True

    # Column-level group-by
    tm.assert_series_equal(frame["A"], grouped["A"].filter(keep_everything))
    tm.assert_series_equal(
        grouped["A"].transform(len), Series(group_sizes, name="A")
    )

    # Frame-level group-by
    tm.assert_frame_equal(frame, grouped.filter(keep_everything))
    tm.assert_frame_equal(
        grouped.transform("sum"), DataFrame({"A": [2, 8, 2, 8, 8]})
    )
    tm.assert_frame_equal(grouped.transform(len), DataFrame({"A": group_sizes}))
def test_filter_and_transform_with_non_unique_int_index():
    """filter/transform align by position when an integer index has repeats."""
    # GH4620
    index = [1, 1, 1, 2, 1, 1, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Row positions whose tag occurs more than once (45 and 62).
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to float64 before blanking rows: writing NaN straight into the
    # int64 columns only works through an implicit upcast. astype returns a
    # new frame, so ``df`` itself is left untouched.
    expected = df.astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_multiple_non_unique_int_index():
    """filter/transform align by position when several index labels repeat."""
    # GH4620
    index = [1, 1, 1, 2, 0, 0, 0, 1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Row positions whose tag occurs more than once (45 and 62).
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to float64 before blanking rows: writing NaN straight into the
    # int64 columns only works through an implicit upcast. astype returns a
    # new frame, so ``df`` itself is left untouched.
    expected = df.astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_float_index():
    """filter/transform align by position when a float index has repeats."""
    # GH4620
    index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Row positions whose tag occurs more than once (45 and 62).
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to float64 before blanking rows: writing NaN straight into the
    # int64 columns only works through an implicit upcast. astype returns a
    # new frame, so ``df`` itself is left untouched.
    expected = df.astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_timestamp_index():
    """filter/transform align by position when a Timestamp index has repeats."""
    # GH4620
    t0 = Timestamp("2013-09-30 00:05:00")
    t1 = Timestamp("2013-10-30 00:05:00")
    t2 = Timestamp("2013-11-30 00:05:00")
    index = [t1, t1, t1, t2, t1, t1, t0, t1]
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Row positions whose tag occurs more than once (45 and 62).
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to float64 before blanking rows: writing NaN straight into the
    # int64 columns only works through an implicit upcast. astype returns a
    # new frame, so ``df`` itself is left untouched.
    expected = df.astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_string_index():
    """filter/transform align by position when a string index has repeats."""
    # GH4620
    index = list("bbbcbbab")
    df = DataFrame(
        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
        index=index,
    )
    grouped_df = df.groupby("tag")
    ser = df["pid"]
    grouped_ser = ser.groupby(df["tag"])
    # Row positions whose tag occurs more than once (45 and 62).
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    tm.assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    # Cast to float64 before blanking rows: writing NaN straight into the
    # int64 columns only works through an implicit upcast. astype returns a
    # new frame, so ``df`` itself is left untouched.
    expected = df.astype("float64")
    expected.iloc[[0, 3, 5, 6]] = np.nan
    tm.assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    tm.assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
    # ^ made manually because this can get confusing!
    tm.assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
    tm.assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    tm.assert_series_equal(actual, expected)
def test_filter_has_access_to_grouped_cols():
    """The predicate can read the column the frame was grouped by."""
    frame = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
    by_a = frame.groupby("A")
    # previously didn't have access to col A #????
    kept = by_a.filter(lambda group: group["A"].sum() == 2)
    # Only the A == 1 group (rows 0 and 1) sums to 2.
    tm.assert_frame_equal(kept, frame.iloc[[0, 1]])
def test_filter_enforces_scalarness():
    """A predicate returning a boolean Series instead of a scalar is rejected."""
    rows = [
        ["best", "a", "x"],
        ["worst", "b", "y"],
        ["best", "c", "x"],
        ["best", "d", "y"],
        ["worst", "d", "y"],
        ["worst", "d", "y"],
        ["best", "d", "z"],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c"])
    by_c = frame.groupby("c")
    with pytest.raises(TypeError, match="filter function returned a.*"):
        # Element-wise comparison yields one boolean per row, not one per group.
        by_c.filter(lambda g: g["a"] == "best")
def test_filter_non_bool_raises():
    """A predicate returning a non-boolean scalar is rejected."""
    rows = [
        ["best", "a", 1],
        ["worst", "b", 1],
        ["best", "c", 1],
        ["best", "d", 1],
        ["worst", "d", 1],
        ["worst", "d", 1],
        ["best", "d", 1],
    ]
    frame = pd.DataFrame(rows, columns=["a", "b", "c"])
    by_a = frame.groupby("a")
    with pytest.raises(TypeError, match="filter function returned a.*"):
        # The mean of column "c" is a float, not a bool.
        by_a.filter(lambda g: g.c.mean())
def test_filter_dropna_with_empty_groups():
    """Rejecting every group gives all-NaN (dropna=False) or empty (dropna=True)."""
    # GH 10780
    # np.random.rand draws from [0, 1), so no group mean can exceed 1 and the
    # predicate below rejects every group.
    data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
    grouped = data.groupby(level=0)

    result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
    expected_false = pd.Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
    tm.assert_series_equal(result_false, expected_false)

    result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
    # State the dtype explicitly instead of relying on the constructor's
    # implicit default for data-less input; float64 matches the input data.
    expected_true = pd.Series(index=pd.Index([], dtype=int), dtype=np.float64)
    tm.assert_series_equal(result_true, expected_true)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,905 @@
""" test where we are determining what we are grouping, or getting groups """
import numpy as np
import pytest
import pandas as pd
from pandas import (
CategoricalIndex,
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
)
from pandas.core.groupby.grouper import Grouping
import pandas.util.testing as tm
from pandas.util.testing import (
assert_almost_equal,
assert_frame_equal,
assert_series_equal,
)
# selection
# --------------------------------
class TestSelection:
    """Column selection on a GroupBy via ``[]`` and attribute access."""

    def test_select_bad_cols(self):
        """Selecting a missing column raises KeyError naming only that column."""
        frame = DataFrame([[1, 2]], columns=["A", "B"])
        grouped = frame.groupby("A")
        with pytest.raises(KeyError, match="\"Columns not found: 'C'\""):
            grouped[["C"]]

        with pytest.raises(KeyError, match="^[^A]+$"):
            # A should not be referenced as a bad column...
            # will have to rethink regex if you change message!
            grouped[["A", "C"]]

    def test_groupby_duplicated_column_errormsg(self):
        """Grouping by a duplicated column label raises a clear ValueError."""
        # GH7511
        frame = DataFrame(
            columns=["A", "B", "A", "C"], data=[range(4), range(2, 6), range(0, 8, 2)]
        )

        expected_message = "Grouper for 'A' not 1-dimensional"
        for keys in ("A", ["A", "B"]):
            with pytest.raises(ValueError, match=expected_message):
                frame.groupby(keys)

        # Grouping by the unambiguous column still works and keeps both "A"s.
        counts = frame.groupby("B").count()
        assert counts.columns.nlevels == 1
        assert counts.columns.size == 3

    def test_column_select_via_attr(self, df):
        """Attribute access selects a column; method names win over column names."""
        via_attribute = df.groupby("A").C.sum()
        via_getitem = df.groupby("A")["C"].sum()
        assert_series_equal(via_attribute, via_getitem)

        # A column called "mean" must not shadow the GroupBy.mean method.
        df["mean"] = 1.5
        via_method = df.groupby("A").mean()
        via_agg = df.groupby("A").agg(np.mean)
        assert_frame_equal(via_method, via_agg)

    def test_getitem_list_of_columns(self):
        """A list, a bare tuple and an Index slice all select the same columns."""
        frame = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
                "E": np.random.randn(8),
            }
        )

        from_list = frame.groupby("A")[["C", "D"]].mean()
        from_tuple = frame.groupby("A")["C", "D"].mean()
        from_index = frame.groupby("A")[frame.columns[2:4]].mean()

        expected = frame.loc[:, ["A", "C", "D"]].groupby("A").mean()

        for selected in (from_list, from_tuple, from_index):
            assert_frame_equal(selected, expected)

    def test_getitem_numeric_column_names(self):
        """The same selection forms work when the column labels are integers."""
        # GH #13731
        frame = DataFrame(
            {
                0: list("abcd") * 2,
                2: np.random.randn(8),
                4: np.random.randn(8),
                6: np.random.randn(8),
            }
        )
        from_index = frame.groupby(0)[frame.columns[1:3]].mean()
        from_tuple = frame.groupby(0)[2, 4].mean()
        from_list = frame.groupby(0)[[2, 4]].mean()

        expected = frame.loc[:, [0, 2, 4]].groupby(0).mean()

        for selected in (from_index, from_tuple, from_list):
            assert_frame_equal(selected, expected)
# grouping
# --------------------------------
class TestGrouping:
def test_grouper_index_types(self):
    """groupby + apply runs for several index types, in both index orders."""
    # related GH5375
    # groupby misbehaving when using a Floatlike index
    frame = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))
    index_makers = (
        tm.makeFloatIndex,
        tm.makeStringIndex,
        tm.makeUnicodeIndex,
        tm.makeIntIndex,
        tm.makeDateIndex,
        tm.makePeriodIndex,
    )
    for make_index in index_makers:
        # Smoke test only: the calls just have to complete without raising.
        frame.index = make_index(len(frame))
        frame.groupby(list("abcde")).apply(lambda x: x)

        # Repeat with the same labels in reverse order.
        frame.index = frame.index.tolist()[::-1]
        frame.groupby(list("abcde")).apply(lambda x: x)
def test_grouper_multilevel_freq(self):
# GH 7885
# with level and freq specified in a pd.Grouper
from datetime import date, timedelta
d0 = date.today() - timedelta(days=14)
dates = date_range(d0, date.today())
date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"])
df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index)
# Check string level
expected = (
df.reset_index()
.groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")])
.sum()
)
# reset index changes columns dtype to object
expected.columns = pd.Index([0], dtype="int64")
result = df.groupby(
[pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")]
).sum()
assert_frame_equal(result, expected)
# Check integer level
result = df.groupby(
[pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")]
).sum()
assert_frame_equal(result, expected)
def test_grouper_creation_bug(self):
# GH 8795
df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
g = df.groupby("A")
expected = g.sum()
g = df.groupby(pd.Grouper(key="A"))
result = g.sum()
assert_frame_equal(result, expected)
result = g.apply(lambda x: x.sum())
assert_frame_equal(result, expected)
g = df.groupby(pd.Grouper(key="A", axis=0))
result = g.sum()
assert_frame_equal(result, expected)
# GH14334
# pd.Grouper(key=...) may be passed in a list
df = DataFrame(
{"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
)
# Group by single column
expected = df.groupby("A").sum()
g = df.groupby([pd.Grouper(key="A")])
result = g.sum()
assert_frame_equal(result, expected)
# Group by two columns
# using a combination of strings and Grouper objects
expected = df.groupby(["A", "B"]).sum()
# Group with two Grouper objects
g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
result = g.sum()
assert_frame_equal(result, expected)
# Group with a string and a Grouper object
g = df.groupby(["A", pd.Grouper(key="B")])
result = g.sum()
assert_frame_equal(result, expected)
# Group with a Grouper object and a string
g = df.groupby([pd.Grouper(key="A"), "B"])
result = g.sum()
assert_frame_equal(result, expected)
# GH8866
s = Series(
np.arange(8, dtype="int64"),
index=pd.MultiIndex.from_product(
[list("ab"), range(2), date_range("20130101", periods=2)],
names=["one", "two", "three"],
),
)
result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
expected = Series(
[28], index=Index([Timestamp("2013-01-31")], freq="M", name="three")
)
assert_series_equal(result, expected)
# just specifying a level breaks
result = s.groupby(pd.Grouper(level="one")).sum()
expected = s.groupby(level="one").sum()
assert_series_equal(result, expected)
def test_grouper_column_and_index(self):
# GH 14327
# Grouping a multi-index frame by a column and an index level should
# be equivalent to resetting the index and grouping by two columns
idx = pd.MultiIndex.from_tuples(
[("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)]
)
idx.names = ["outer", "inner"]
df_multi = pd.DataFrame(
{"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
index=idx,
)
result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean()
expected = df_multi.reset_index().groupby(["B", "inner"]).mean()
assert_frame_equal(result, expected)
# Test the reverse grouping order
result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean()
expected = df_multi.reset_index().groupby(["inner", "B"]).mean()
assert_frame_equal(result, expected)
# Grouping a single-index frame by a column and the index should
# be equivalent to resetting the index and grouping by two columns
df_single = df_multi.reset_index("outer")
result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean()
expected = df_single.reset_index().groupby(["B", "inner"]).mean()
assert_frame_equal(result, expected)
# Test the reverse grouping order
result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean()
expected = df_single.reset_index().groupby(["inner", "B"]).mean()
assert_frame_equal(result, expected)
def test_groupby_levels_and_columns(self):
# GH9344, GH9049
idx_names = ["x", "y"]
idx = pd.MultiIndex.from_tuples(
[(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names
)
df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)
by_levels = df.groupby(level=idx_names).mean()
# reset_index changes columns dtype to object
by_columns = df.reset_index().groupby(idx_names).mean()
tm.assert_frame_equal(by_levels, by_columns, check_column_type=False)
by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
tm.assert_frame_equal(by_levels, by_columns)
def test_groupby_categorical_index_and_columns(self, observed):
# GH18432, adapted for GH25871
columns = ["A", "B", "A", "B"]
categories = ["B", "A"]
data = np.array(
[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int
)
cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
df = DataFrame(data=data, columns=cat_columns)
result = df.groupby(axis=1, level=0, observed=observed).sum()
expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
expected_columns = CategoricalIndex(
categories, categories=categories, ordered=True
)
expected = DataFrame(data=expected_data, columns=expected_columns)
assert_frame_equal(result, expected)
# test transposed version
df = DataFrame(data.T, index=cat_columns)
result = df.groupby(axis=0, level=0, observed=observed).sum()
expected = DataFrame(data=expected_data.T, index=expected_columns)
assert_frame_equal(result, expected)
def test_grouper_getting_correct_binner(self):
# GH 10063
# using a non-time-based grouper and a time-based grouper
# and specifying levels
df = DataFrame(
{"A": 1},
index=pd.MultiIndex.from_product(
[list("ab"), date_range("20130101", periods=80)], names=["one", "two"]
),
)
result = df.groupby(
[pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")]
).sum()
expected = DataFrame(
{"A": [31, 28, 21, 31, 28, 21]},
index=MultiIndex.from_product(
[list("ab"), date_range("20130101", freq="M", periods=3)],
names=["one", "two"],
),
)
assert_frame_equal(result, expected)
def test_grouper_iter(self, df):
assert sorted(df.groupby("A").grouper) == ["bar", "foo"]
def test_empty_groups(self, df):
    """Grouping by an empty key list raises ValueError."""
    # see gh-1048
    no_keys = []
    with pytest.raises(ValueError, match="No group keys passed!"):
        df.groupby(no_keys)
def test_groupby_grouper(self, df):
grouped = df.groupby("A")
result = df.groupby(grouped.grouper).mean()
expected = grouped.mean()
tm.assert_frame_equal(result, expected)
def test_groupby_dict_mapping(self):
# GH #679
from pandas import Series
s = Series({"T1": 5})
result = s.groupby({"T1": "T2"}).agg(sum)
expected = s.groupby(["T2"]).agg(sum)
assert_series_equal(result, expected)
s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
mapping = {"a": 0, "b": 0, "c": 1, "d": 1}
result = s.groupby(mapping).mean()
result2 = s.groupby(mapping).agg(np.mean)
expected = s.groupby([0, 0, 1, 1]).mean()
expected2 = s.groupby([0, 0, 1, 1]).mean()
assert_series_equal(result, expected)
assert_series_equal(result, result2)
assert_series_equal(result, expected2)
def test_groupby_grouper_f_sanity_checked(self):
    """A key function whose result has the wrong length is rejected."""
    month_starts = date_range("01-Jan-2013", periods=12, freq="MS")
    ts = Series(np.random.randn(12), index=month_starts)

    # GH3035
    # index.map is used to apply grouper to the index
    # if it fails on the elements, map tries it on the entire index as
    # a sequence. That can yield invalid results that cause trouble
    # down the line.
    # the surprise comes from using key[0:6] rather than str(key)[0:6]
    # when the elements are Timestamp.
    # the result is Index[0:6], very confusing.

    expected_message = r"Grouper result violates len\(labels\) == len\(data\)"
    with pytest.raises(AssertionError, match=expected_message):
        ts.groupby(lambda key: key[0:6])
def test_grouping_error_on_multidim_input(self, df):
    """Grouping rejects a two-column frame passed as the grouper."""
    expected_message = (
        "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
    )
    two_columns = df[["A", "A"]]
    with pytest.raises(ValueError, match=expected_message):
        Grouping(df.index, two_columns)
def test_multiindex_passthru(self):
# GH 7997
# regression from 0.14.1
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])
result = df.groupby(axis=1, level=[0, 1]).first()
assert_frame_equal(result, df)
def test_multiindex_negative_level(self, mframe):
    """Negative ``level`` values are compared against the named levels (GH 13901)."""
    # level=-1 is expected to match the level named "second"
    by_last = mframe.groupby(level=-1).sum()
    named_second = mframe.groupby(level="second").sum()
    assert_frame_equal(by_last, named_second)

    # level=-2 is expected to match the level named "first"
    by_first = mframe.groupby(level=-2).sum()
    named_first = mframe.groupby(level="first").sum()
    assert_frame_equal(by_first, named_first)

    # grouping on both levels is expected to reproduce the input frame
    by_both = mframe.groupby(level=[-2, -1]).sum()
    assert_frame_equal(by_both, mframe)

    # a negative position and a level name can be mixed in one list
    mixed = mframe.groupby(level=[-1, "first"]).sum()
    named_both = mframe.groupby(level=["second", "first"]).sum()
    assert_frame_equal(mixed, named_both)
def test_multifunc_select_col_integer_cols(self, df):
    """Smoke test: dict-style ``agg`` on a column selected by integer label."""
    n_columns = len(df.columns)
    df.columns = np.arange(n_columns)

    # it works!
    selected = df.groupby(1, as_index=False)[2]
    selected.agg({"Q": np.mean})
def test_multiindex_columns_empty_level(self):
    """A bare column key and its ``(key, "")`` tuple form must group alike."""
    column_pairs = [["count", "values"], ["to filter", ""]]
    midx = MultiIndex.from_tuples(column_pairs)

    # single row: both spellings put row 0 in group "A"
    df = DataFrame([[1, "A"]], columns=midx)

    groups = df.groupby("to filter").groups
    assert groups["A"] == [0]

    groups = df.groupby([("to filter", "")]).groups
    assert groups["A"] == [0]

    # two rows, two distinct groups
    df = DataFrame([[1, "A"], [2, "B"]], columns=midx)

    by_name = df.groupby("to filter").groups
    by_tuple = df.groupby([("to filter", "")]).groups
    assert by_tuple == by_name

    # two rows sharing one group
    df = DataFrame([[1, "A"], [2, "A"]], columns=midx)

    by_name = df.groupby("to filter").groups
    by_tuple = df.groupby([("to filter", "")]).groups
    tm.assert_dict_equal(by_tuple, by_name)
def test_groupby_multiindex_tuple(self):
    """A tuple column key and the same key wrapped in a list give equal groups."""
    # GH 17979
    int_columns = pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]])
    df = pd.DataFrame(
        [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], columns=int_columns
    )

    expected = df.groupby([("b", 1)]).groups
    result = df.groupby(("b", 1)).groups
    tm.assert_dict_equal(expected, result)

    # same values, string labels on the second level
    str_columns = pd.MultiIndex.from_arrays(
        [["a", "b", "b", "c"], ["d", "d", "e", "e"]]
    )
    df2 = pd.DataFrame(df.values, columns=str_columns)

    expected = df2.groupby([("b", "d")]).groups
    # NOTE(review): ``result`` is computed from ``df``, not ``df2`` - confirm
    # whether grouping ``df2`` by the bare tuple was the intended check.
    result = df.groupby(("b", 1)).groups
    tm.assert_dict_equal(expected, result)

    # same values, flat column list mixing tuples and a plain string
    df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])

    expected = df3.groupby([("b", "d")]).groups
    # NOTE(review): as above, ``result`` still comes from ``df``.
    result = df.groupby(("b", 1)).groups
    tm.assert_dict_equal(expected, result)
@pytest.mark.parametrize("sort", [True, False])
def test_groupby_level(self, sort, mframe, df):
    """Grouping by level position or name matches grouping by the level values."""
    # GH 17537
    flat = mframe.reset_index()

    by_pos0 = mframe.groupby(level=0, sort=sort).sum()
    by_pos1 = mframe.groupby(level=1, sort=sort).sum()

    expected0 = mframe.groupby(flat["first"].values, sort=sort).sum()
    expected1 = mframe.groupby(flat["second"].values, sort=sort).sum()

    # grouping by raw values carries no index name, so set it for comparison
    expected0.index.name = "first"
    expected1.index.name = "second"

    assert by_pos0.index.name == "first"
    assert by_pos1.index.name == "second"

    assert_frame_equal(by_pos0, expected0)
    assert_frame_equal(by_pos1, expected1)
    assert by_pos0.index.name == mframe.index.names[0]
    assert by_pos1.index.name == mframe.index.names[1]

    # groupby level name
    by_name0 = mframe.groupby(level="first", sort=sort).sum()
    by_name1 = mframe.groupby(level="second", sort=sort).sum()
    assert_frame_equal(by_name0, expected0)
    assert_frame_equal(by_name1, expected1)

    # axis=1
    transposed = mframe.T
    by_col0 = transposed.groupby(level=0, axis=1, sort=sort).sum()
    by_col1 = mframe.T.groupby(level=1, axis=1, sort=sort).sum()
    assert_frame_equal(by_col0, expected0.T)
    assert_frame_equal(by_col1, expected1.T)

    # raise exception for non-MultiIndex
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df.groupby(level=1)
def test_groupby_level_index_names(self):
    """``level`` accepts the index name and rejects any other name."""
    # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
    data = {"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}
    df = DataFrame(data).set_index("exp")

    # the real index name is accepted
    df.groupby(level="exp")

    # an unknown name is not
    msg = "level name foo is not the name of the index"
    with pytest.raises(ValueError, match=msg):
        df.groupby(level="foo")
@pytest.mark.parametrize("sort", [True, False])
def test_groupby_level_with_nas(self, sort):
    """Sum by level 0 of a MultiIndex, without and with a missing (-1) code."""
    # GH 17537
    levels = [[1, 0], [0, 1, 2, 3]]
    inner_codes = [0, 1, 2, 3, 0, 1, 2, 3]

    complete = MultiIndex(
        levels=levels,
        codes=[[1, 1, 1, 1, 0, 0, 0, 0], inner_codes],
    )

    # factorizing doesn't confuse things
    s = Series(np.arange(8.0), index=complete)
    summed = s.groupby(level=0, sort=sort).sum()
    expected = Series([6.0, 22.0], index=[0, 1])
    assert_series_equal(summed, expected)

    # same layout, but position 4 carries the code -1 in the outer level
    with_missing = MultiIndex(
        levels=levels,
        codes=[[1, 1, 1, 1, -1, 0, 0, 0], inner_codes],
    )

    # factorizing doesn't confuse things
    s = Series(np.arange(8.0), index=with_missing)
    summed = s.groupby(level=0, sort=sort).sum()
    expected = Series([6.0, 18.0], index=[0.0, 1.0])
    assert_series_equal(summed, expected)
def test_groupby_args(self, mframe):
    """``groupby`` with neither ``by`` nor ``level`` raises ``TypeError``."""
    # PR8618 and issue 8015
    msg = "You have to supply one of 'by' and 'level'"

    # no arguments at all
    with pytest.raises(TypeError, match=msg):
        mframe.groupby()

    # both arguments passed explicitly as None
    with pytest.raises(TypeError, match=msg):
        mframe.groupby(by=None, level=None)
@pytest.mark.parametrize(
    "sort,labels",
    [
        [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
        [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
    ],
)
def test_level_preserve_order(self, sort, labels, mframe):
    """Check the level-0 grouper labels against the parametrized expectation."""
    # GH 17537
    exp_labels = np.array(labels, np.intp)
    by_level0 = mframe.groupby(level=0, sort=sort)
    actual_labels = by_level0.grouper.labels[0]
    assert_almost_equal(actual_labels, exp_labels)
def test_grouping_labels(self, mframe):
    """Check grouper labels when grouping by the level-0 index values."""
    level0_values = mframe.index.get_level_values(0)
    grouped = mframe.groupby(level0_values)

    exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
    actual_labels = grouped.grouper.labels[0]
    assert_almost_equal(actual_labels, exp_labels)
def test_list_grouper_with_nat(self):
    """Group on a date column containing ``NaT``, with the Grouper bare or listed."""
    # GH 14715
    dates = pd.date_range("1/1/2011", periods=365, freq="D")
    df = pd.DataFrame({"date": dates})
    df.iloc[-1] = pd.NaT
    yearly = pd.Grouper(key="date", freq="AS")

    # Grouper in a list grouping
    listed = df.groupby([yearly])
    expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))}
    tm.assert_dict_equal(listed.groups, expected)

    # Test case without a list
    bare = df.groupby(yearly)
    expected = {pd.Timestamp("2011-01-01"): 365}
    tm.assert_dict_equal(bare.groups, expected)
@pytest.mark.parametrize(
    "func,expected",
    [
        ("transform", pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))),
        ("agg", pd.Series(name=2, index=pd.Float64Index([], name=1))),
        ("apply", pd.Series(name=2, index=pd.Float64Index([], name=1))),
    ],
)
def test_evaluate_with_empty_groups(self, func, expected):
    """Apply an identity function through ``func`` on an empty groupby."""
    # 26208
    # test transform'ing empty groups
    # (not testing other agg fns, because they return
    # different index objects.
    empty = pd.DataFrame({1: [], 2: []})
    selected = empty.groupby(1)[2]
    method = getattr(selected, func)
    result = method(lambda x: x)
    assert_series_equal(result, expected)
def test_groupby_empty(self):
    """Group an empty Series by an empty key list and inspect the grouper."""
    # https://github.com/pandas-dev/pandas/issues/27190
    s = pd.Series([], name="name")

    gr = s.groupby([])
    tm.assert_series_equal(gr.mean(), s)

    # check group properties
    grouper = gr.grouper
    assert len(grouper.groupings) == 1
    tm.assert_numpy_array_equal(
        grouper.group_info[0], np.array([], dtype=np.dtype("int64"))
    )
    tm.assert_numpy_array_equal(
        grouper.group_info[1], np.array([], dtype=np.dtype("int"))
    )
    assert grouper.group_info[2] == 0

    # check name
    assert s.groupby(s).grouper.names == ["name"]
# get_group
# --------------------------------
class TestGetGroup:
    """Tests around ``get_group`` and closely related groupby lookups."""

    def test_get_group(self):
        """``get_group`` accepts Timestamp, datetime and string spellings of a key."""
        # GH 5267
        # be datelike friendly
        dates = pd.to_datetime(
            [
                "10-Oct-2013",
                "10-Oct-2013",
                "10-Oct-2013",
                "11-Oct-2013",
                "11-Oct-2013",
                "11-Oct-2013",
            ]
        )
        df = DataFrame(
            {
                "DATE": dates,
                "label": ["foo", "foo", "bar", "foo", "foo", "bar"],
                "VAL": [1, 2, 3, 4, 5, 6],
            }
        )

        # single key: the same date spelled three ways
        g = df.groupby("DATE")
        key = list(g.groups)[0]
        by_key = g.get_group(key)
        by_datetime = g.get_group(Timestamp(key).to_pydatetime())
        by_string = g.get_group(str(Timestamp(key)))
        assert_frame_equal(by_key, by_datetime)
        assert_frame_equal(by_key, by_string)

        # two keys: the date component spelled three ways
        g = df.groupby(["DATE", "label"])
        key = list(g.groups)[0]
        by_key = g.get_group(key)
        by_datetime = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
        by_string = g.get_group((str(Timestamp(key[0])), key[1]))
        assert_frame_equal(by_key, by_datetime)
        assert_frame_equal(by_key, by_string)

        # must pass a same-length tuple with multiple keys
        msg = "must supply a tuple to get_group with multiple grouping keys"
        with pytest.raises(ValueError, match=msg):
            g.get_group("foo")
        with pytest.raises(ValueError, match=msg):
            # NOTE(review): ("foo") is a parenthesised string, not a 1-tuple,
            # so this repeats the call above - confirm the intent.
            g.get_group(("foo"))

        msg = (
            "must supply a same-length tuple to get_group with multiple"
            " grouping keys"
        )
        with pytest.raises(ValueError, match=msg):
            g.get_group(("foo", "bar", "baz"))

    def test_get_group_empty_bins(self, observed):
        """Look up interval groups produced by ``pd.cut``."""
        d = pd.DataFrame([3, 1, 7, 6])
        bins = [0, 5, 10, 15]
        binned = pd.cut(d[0], bins)
        g = d.groupby(binned, observed=observed)

        # TODO: should prob allow a str of Interval work as well
        # IOW '(0, 5]'
        low_bin = g.get_group(pd.Interval(0, 5))
        expected = DataFrame([3, 1], index=[0, 1])
        assert_frame_equal(low_bin, expected)

        # this interval is expected to raise KeyError on lookup
        msg = r"Interval\(10, 15, closed='right'\)"
        with pytest.raises(KeyError, match=msg):
            g.get_group(pd.Interval(10, 15))

    def test_get_group_grouped_by_tuple(self):
        """``get_group`` works when the group values themselves are tuples."""
        # GH 8121
        df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T
        gr = df.groupby("ids")
        expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2])
        found = gr.get_group((1,))
        assert_frame_equal(found, expected)

        # one-element tuples wrapping datetimes, looked up by a string date
        dt = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"])
        df = DataFrame({"ids": [(stamp,) for stamp in dt]})
        gr = df.groupby("ids")
        found = gr.get_group(("2010-01-01",))
        expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2])
        assert_frame_equal(found, expected)

    def test_groupby_with_empty(self):
        """Iterating a groupby over an empty datetime-indexed Series yields nothing."""
        empty_index = pd.DatetimeIndex(())
        series = pd.Series((), empty_index)
        daily = pd.Grouper(freq="D")
        grouped = series.groupby(daily)
        first_item = next(iter(grouped), None)
        assert first_item is None

    def test_groupby_with_single_column(self):
        """Group a one-column frame by that column."""
        df = pd.DataFrame({"a": list("abssbab")})
        group_a = df.groupby("a").get_group("a")
        tm.assert_frame_equal(group_a, df.iloc[[0, 5]])

        # GH 13530
        exp = pd.DataFrame(index=pd.Index(["a", "b", "s"], name="a"))
        tm.assert_frame_equal(df.groupby("a").count(), exp)
        tm.assert_frame_equal(df.groupby("a").sum(), exp)
        tm.assert_frame_equal(df.groupby("a").nth(1), exp)

    def test_gb_key_len_equal_axis_len(self):
        """Index-level and column keys are told apart when their count equals the axis length."""
        # GH16843
        # test ensures that index and column keys are recognized correctly
        # when number of keys equals axis length of groupby
        rows = [
            ["foo", "bar", "B", 1],
            ["foo", "bar", "B", 2],
            ["foo", "baz", "C", 3],
        ]
        df = pd.DataFrame(rows, columns=["first", "second", "third", "one"])
        df = df.set_index(["first", "second"])
        sizes = df.groupby(["first", "second", "third"]).size()
        assert sizes.loc[("foo", "bar", "B")] == 2
        assert sizes.loc[("foo", "baz", "C")] == 1
# groups & iteration
# --------------------------------
class TestIteration:
    """Tests for ``.groups`` and for iterating over groupby objects."""

    def test_groups(self, df):
        """``.groups`` is cached and maps each key to rows carrying that key."""
        grouped = df.groupby(["A"])
        cached = grouped.groups
        assert cached is grouped.groups  # caching works

        for key, labels in grouped.groups.items():
            assert (df.loc[labels]["A"] == key).all()

        grouped = df.groupby(["A", "B"])
        cached = grouped.groups
        assert cached is grouped.groups  # caching works

        for key, labels in grouped.groups.items():
            assert (df.loc[labels]["A"] == key[0]).all()
            assert (df.loc[labels]["B"] == key[1]).all()

    def test_grouping_is_iterable(self, tsframe):
        """Smoke test: a single ``Grouping`` object can be iterated."""
        # this code path isn't used anywhere else
        # not sure it's useful
        grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])

        # test it works
        first_grouping = grouped.grouper.groupings[0]
        for _ in first_grouping:
            pass

    def test_multi_iter(self):
        """Iterating a two-key Series groupby yields ``((k1, k2), group)`` pairs."""
        s = Series(np.arange(6))
        k1 = np.array(["a", "a", "a", "b", "b", "b"])
        k2 = np.array(["1", "2", "1", "2", "1", "2"])

        grouped = s.groupby([k1, k2])

        iterated = list(grouped)
        expected = [
            ("a", "1", s[[0, 2]]),
            ("a", "2", s[[1]]),
            ("b", "1", s[[4]]),
            ("b", "2", s[[3, 5]]),
        ]
        for pos, ((first_key, second_key), group) in enumerate(iterated):
            exp_first, exp_second, exp_group = expected[pos]
            assert exp_first == first_key
            assert exp_second == second_key
            assert_series_equal(group, exp_group)

    def test_multi_iter_frame(self, three_group):
        """Iterate two-key DataFrame groupbys, including along ``axis=1``."""
        k1 = np.array(["b", "b", "b", "a", "a", "a"])
        k2 = np.array(["1", "2", "1", "2", "1", "2"])
        df = DataFrame(
            {"v1": np.random.randn(6), "v2": np.random.randn(6), "k1": k1, "k2": k2},
            index=["one", "two", "three", "four", "five", "six"],
        )

        grouped = df.groupby(["k1", "k2"])

        # things get sorted!
        iterated = list(grouped)
        idx = df.index
        expected = [
            ("a", "1", df.loc[idx[[4]]]),
            ("a", "2", df.loc[idx[[3, 5]]]),
            ("b", "1", df.loc[idx[[0, 2]]]),
            ("b", "2", df.loc[idx[[1]]]),
        ]
        for pos, ((first_key, second_key), group) in enumerate(iterated):
            exp_first, exp_second, exp_group = expected[pos]
            assert exp_first == first_key
            assert exp_second == second_key
            assert_frame_equal(group, exp_group)

        # don't iterate through groups with no data
        df["k1"] = np.array(["b", "b", "b", "a", "a", "a"])
        df["k2"] = np.array(["1", "1", "1", "2", "2", "2"])
        grouped = df.groupby(["k1", "k2"])
        groups = {key: gp for key, gp in grouped}
        assert len(groups) == 2

        # axis = 1
        three_levels = three_group.groupby(["A", "B", "C"]).mean()
        grouped = three_levels.T.groupby(axis=1, level=(1, 2))
        for _key, _group in grouped:
            pass

    def test_dictify(self, df):
        """Smoke test: any groupby iterator can be fed to ``dict``."""
        # frame groupbys
        dict(iter(df.groupby("A")))
        dict(iter(df.groupby(["A", "B"])))
        # series grouped by other series
        dict(iter(df["C"].groupby(df["A"])))
        dict(iter(df["C"].groupby([df["A"], df["B"]])))
        # column selected from a frame groupby
        dict(iter(df.groupby("A")["C"]))
        dict(iter(df.groupby(["A", "B"])["C"]))

    def test_groupby_with_small_elem(self):
        """Group tiny frames by a monthly ``Grouper`` plus a column."""
        # GH 8542
        # length=2
        df = pd.DataFrame(
            {"event": ["start", "start"], "change": [1234, 5678]},
            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]),
        )
        grouped = df.groupby([pd.Grouper(freq="M"), "event"])
        assert len(grouped.groups) == 2
        assert grouped.ngroups == 2
        assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups
        assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups

        sept = grouped.get_group((pd.Timestamp("2014-09-30"), "start"))
        tm.assert_frame_equal(sept, df.iloc[[0], :])
        octo = grouped.get_group((pd.Timestamp("2013-10-31"), "start"))
        tm.assert_frame_equal(octo, df.iloc[[1], :])

        # three rows, two of them in the same month
        df = pd.DataFrame(
            {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]),
        )
        grouped = df.groupby([pd.Grouper(freq="M"), "event"])
        assert len(grouped.groups) == 2
        assert grouped.ngroups == 2
        assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups
        assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups

        sept = grouped.get_group((pd.Timestamp("2014-09-30"), "start"))
        tm.assert_frame_equal(sept, df.iloc[[0, 2], :])
        octo = grouped.get_group((pd.Timestamp("2013-10-31"), "start"))
        tm.assert_frame_equal(octo, df.iloc[[1], :])

        # length=3
        df = pd.DataFrame(
            {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]),
        )
        grouped = df.groupby([pd.Grouper(freq="M"), "event"])
        assert len(grouped.groups) == 3
        assert grouped.ngroups == 3
        assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups
        assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups
        assert (pd.Timestamp("2014-08-31"), "start") in grouped.groups

        sept = grouped.get_group((pd.Timestamp("2014-09-30"), "start"))
        tm.assert_frame_equal(sept, df.iloc[[0], :])
        octo = grouped.get_group((pd.Timestamp("2013-10-31"), "start"))
        tm.assert_frame_equal(octo, df.iloc[[1], :])
        aug = grouped.get_group((pd.Timestamp("2014-08-31"), "start"))
        tm.assert_frame_equal(aug, df.iloc[[2], :])

    def test_grouping_string_repr(self):
        """The repr of a ``Grouping`` built from a tuple-named column."""
        # GH 13394
        mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
        df = DataFrame([[1, 2, 3]], columns=mi)
        gr = df.groupby(df[("A", "a")])

        first_grouping = gr.grouper.groupings[0]
        result = first_grouping.__repr__()
        expected = "Grouping(('A', 'a'))"
        assert result == expected

View File

@@ -0,0 +1,82 @@
import numpy as np
import pytest
import pandas as pd
from pandas.util.testing import assert_frame_equal, assert_series_equal
@pytest.fixture(params=[["inner"], ["inner", "outer"]])
def frame(request):
    """Six-row frame indexed by the parametrized list of column names."""
    index_cols = request.param
    data = {
        "outer": ["a", "a", "a", "b", "b", "b"],
        "inner": [1, 2, 3, 1, 2, 3],
        "A": np.arange(6),
        "B": ["one", "one", "two", "two", "one", "one"],
    }
    df = pd.DataFrame(data)
    # an empty parameter list would leave the default index untouched
    return df.set_index(index_cols) if index_cols else df
@pytest.fixture()
def series():
    """Column ``A`` of a six-row frame, indexed by ``outer``/``inner``/``B``."""
    data = {
        "outer": ["a", "a", "a", "b", "b", "b"],
        "inner": [1, 2, 3, 1, 2, 3],
        "A": np.arange(6),
        "B": ["one", "one", "two", "two", "one", "one"],
    }
    indexed = pd.DataFrame(data).set_index(["outer", "inner", "B"])
    return indexed["A"]
@pytest.mark.parametrize(
    "key_strs,groupers",
    [
        ("inner", pd.Grouper(level="inner")),  # Index name
        (["inner"], [pd.Grouper(level="inner")]),  # List of index name
        (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index
        (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column
    ],
)
def test_grouper_index_level_as_string(frame, key_strs, groupers):
    """String keys naming an index level match explicit ``pd.Grouper(level=...)``."""
    by_strings = frame.groupby(key_strs).mean()
    by_groupers = frame.groupby(groupers).mean()
    assert_frame_equal(by_strings, by_groupers)
@pytest.mark.parametrize(
    "levels",
    [
        "inner",
        "outer",
        "B",
        ["inner"],
        ["outer"],
        ["B"],
        ["inner", "outer"],
        ["outer", "inner"],
        ["inner", "outer", "B"],
        ["B", "outer", "inner"],
    ],
)
def test_grouper_index_level_as_string_series(series, levels):
    """Grouping a Series by level names matches ``pd.Grouper(level=...)``."""
    # Compute expected result
    groupers = (
        [pd.Grouper(level=name) for name in levels]
        if isinstance(levels, list)
        else pd.Grouper(level=levels)
    )
    expected = series.groupby(groupers).mean()

    # Compute and check result
    actual = series.groupby(levels).mean()
    assert_series_equal(actual, expected)

View File

@@ -0,0 +1,513 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna
from pandas.util.testing import assert_frame_equal, assert_series_equal
def test_first_last_nth(df):
    """Check ``first``, ``last`` and ``nth`` on a frame grouped by column ``A``.

    ``df`` is a fixture supplied from outside this file; this test relies on
    it having columns ``A``-``D`` and integer row labels 0-7.
    """
    # tests for first / last / nth
    grouped = df.groupby("A")
    first = grouped.first()
    expected = df.loc[[1, 0], ["B", "C", "D"]]
    expected.index = Index(["bar", "foo"], name="A")
    expected = expected.sort_index()
    assert_frame_equal(first, expected)

    # nth(0) is compared against the same expectation as first()
    nth = grouped.nth(0)
    assert_frame_equal(nth, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ["B", "C", "D"]]
    expected.index = Index(["bar", "foo"], name="A")
    assert_frame_equal(last, expected)

    # nth(-1) is compared against the same expectation as last()
    nth = grouped.nth(-1)
    assert_frame_equal(nth, expected)

    nth = grouped.nth(1)
    expected = df.loc[[2, 3], ["B", "C", "D"]].copy()
    expected.index = Index(["foo", "bar"], name="A")
    expected = expected.sort_index()
    assert_frame_equal(nth, expected)

    # it works!
    grouped["B"].first()
    grouped["B"].last()
    grouped["B"].nth(0)

    # blank out B for every "foo" row; the existing groupby object is reused
    df.loc[df["A"] == "foo", "B"] = np.nan
    assert isna(grouped["B"].first()["foo"])
    assert isna(grouped["B"].last()["foo"])
    assert isna(grouped["B"].nth(0)["foo"])

    # v0.14.0 whatsnew
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")
    # first() and nth(0, dropna="any") share one expectation
    expected = df.iloc[[1, 2]].set_index("A")

    result = g.first()
    assert_frame_equal(result, expected)

    result = g.nth(0, dropna="any")
    assert_frame_equal(result, expected)
def test_first_last_nth_dtypes(df_mixed_floats):
    """``first``/``last``/``nth`` on a frame extended with bool and int columns."""
    value_cols = ["B", "C", "D", "E", "F"]
    group_index = Index(["bar", "foo"], name="A")

    df = df_mixed_floats.copy()
    df["E"] = True
    df["F"] = 1

    # tests for first / last / nth
    grouped = df.groupby("A")

    first = grouped.first()
    expected = df.loc[[1, 0], value_cols]
    expected.index = group_index
    expected = expected.sort_index()
    assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], value_cols]
    expected.index = group_index
    expected = expected.sort_index()
    assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.loc[[3, 2], value_cols]
    expected.index = group_index
    expected = expected.sort_index()
    assert_frame_equal(nth, expected)

    # GH 2763, first/last shifting dtypes
    idx = list(range(10))
    idx.append(9)
    s = Series(data=range(11), index=idx, name="IntCol")
    assert s.dtype == "int64"
    firsts = s.groupby(level=0).first()
    assert firsts.dtype == "int64"
def test_nth():
    """Exercise ``nth`` on frame and series groupbys: positions, lists, ``dropna``."""
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")

    # positive and negative positions, including ones past the group size
    assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A"))
    assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A"))
    assert_frame_equal(g.nth(2), df.loc[[]].set_index("A"))
    assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A"))
    assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A"))
    assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A"))

    # column selections
    assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]])
    assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]])
    assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A"))

    # dropna="any"
    indexed = df.set_index("A")
    assert_frame_equal(g.nth(0, dropna="any"), indexed.iloc[[1, 2]])
    assert_frame_equal(g.nth(-1, dropna="any"), indexed.iloc[[1, 2]])

    indexed["B"] = np.nan
    assert_frame_equal(g.nth(7, dropna="any"), indexed.iloc[[1, 2]])
    assert_frame_equal(g.nth(2, dropna="any"), indexed.iloc[[1, 2]])

    # out of bounds, regression from 0.13.1
    # GH 6621
    raw = {
        "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
        "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
        "two": {
            0: 1.5456590000000001,
            1: -0.070345000000000005,
            2: -2.4004539999999999,
            3: 0.46206000000000003,
            4: 0.52350799999999997,
        },
        "one": {
            0: 0.56573799999999996,
            1: -0.9742360000000001,
            2: 1.033801,
            3: -0.78543499999999999,
            4: 0.70422799999999997,
        },
    }
    df = DataFrame(raw).set_index(["color", "food"])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    assert_frame_equal(result, expected)

    # GH 7559
    # from the vbench
    df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype="int64")
    values = df[1]
    keys = df[0]
    expected = values.groupby(keys).first()
    expected2 = values.groupby(keys).apply(lambda x: x.iloc[0])
    assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    first_of_one = values[keys == 1].iloc[0]
    assert expected.iloc[0] == first_of_one
    assert expected2.iloc[0] == first_of_one

    # this is NOT the same as .first (as sorted is default!)
    # as it keeps the order in the series (and not the group order)
    # related GH 7287
    expected = values.groupby(keys, sort=False).first()
    result = values.groupby(keys, sort=False).nth(0, dropna="all")
    assert_series_equal(result, expected)

    with pytest.raises(ValueError, match="For a DataFrame groupby"):
        values.groupby(keys, sort=False).nth(0, dropna=True)

    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A")
    result = g.B.nth(0, dropna="all")
    expected = g.B.first()
    assert_series_equal(result, expected)

    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
    g = df.groupby("A")

    assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A"))
    assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A"))
    assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A"))
    assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A"))
    assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A"))
    assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A"))
    assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A"))
    assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A"))

    business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B")
    df = DataFrame(1, index=business_dates, columns=["a", "b"])

    # get the first, fourth and last two business days for each month
    year_month = [df.index.year, df.index.month]
    result = df.groupby(year_month, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        [
            "2014/4/1",
            "2014/4/4",
            "2014/4/29",
            "2014/4/30",
            "2014/5/1",
            "2014/5/6",
            "2014/5/29",
            "2014/5/30",
            "2014/6/2",
            "2014/6/5",
            "2014/6/27",
            "2014/6/30",
        ]
    )
    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
    assert_frame_equal(result, expected)
def test_nth_multi_index(three_group):
    """With two grouping keys, ``nth(0)`` is compared against ``first()``."""
    # PR 9090, related to issue 8979
    # test nth on MultiIndex, should match .first()
    by_a_b = three_group.groupby(["A", "B"])
    zeroth = by_a_b.nth(0)
    firsts = by_a_b.first()
    assert_frame_equal(zeroth, firsts)
@pytest.mark.parametrize(
    "data, expected_first, expected_last",
    [
        (
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
        ),
        (
            {
                "id": ["A", "B", "A"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                ],
                "foo": [1, 2, 3],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [1, 2],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [3, 2],
            },
        ),
    ],
)
def test_first_last_tz(data, expected_first, expected_last):
    """``first``/``last`` with ``as_index=False`` keep tz-aware timestamps."""
    # GH15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False
    all_cols = ["id", "time", "foo"]
    time_cols = ["id", "time"]

    df = DataFrame(data)

    # first: whole frame, then only the "time" column
    firsts = df.groupby("id", as_index=False).first()
    expected = DataFrame(expected_first)
    assert_frame_equal(firsts[all_cols], expected[all_cols])

    firsts = df.groupby("id", as_index=False)["time"].first()
    assert_frame_equal(firsts, expected[time_cols])

    # last: whole frame, then only the "time" column
    lasts = df.groupby("id", as_index=False).last()
    expected = DataFrame(expected_last)
    assert_frame_equal(lasts[all_cols], expected[all_cols])

    lasts = df.groupby("id", as_index=False)["time"].last()
    assert_frame_equal(lasts, expected[time_cols])
@pytest.mark.parametrize(
    "method, ts, alpha",
    [
        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
    ],
)
def test_first_last_tz_multi_column(method, ts, alpha):
    """``first``/``last`` over a categorical column and a tz-aware column together."""
    # GH 21603
    category_string = pd.Series(list("abc")).astype("category")
    df = pd.DataFrame(
        {
            "group": [1, 1, 2],
            "category_string": category_string,
            "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
        }
    )

    reducer = getattr(df.groupby("group"), method)
    result = reducer()

    expected_categories = pd.Categorical([alpha, "c"], dtype=category_string.dtype)
    expected = pd.DataFrame(
        {
            "category_string": expected_categories,
            "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
        },
        index=pd.Index([1, 2], name="group"),
    )
    assert_frame_equal(result, expected)
def test_nth_multi_index_as_expected():
    """``nth(0)`` with two grouping keys, checked against a literal result."""
    # PR 9090, related to issue 8979
    # test nth on MultiIndex
    col_a = ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"]
    col_a += ["foo", "foo", "foo"]
    col_b = ["one", "one", "one", "two", "one", "one", "one", "two"]
    col_b += ["two", "two", "one"]
    col_c = ["dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull"]
    col_c += ["shiny", "shiny", "shiny"]
    three_group = DataFrame({"A": col_a, "B": col_b, "C": col_c})

    by_a_b = three_group.groupby(["A", "B"])
    result = by_a_b.nth(0)

    expected_index = MultiIndex.from_arrays(
        [["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]],
        names=["A", "B"],
    )
    expected = DataFrame(
        {"C": ["dull", "dull", "dull", "dull"]}, index=expected_index
    )
    assert_frame_equal(result, expected)
def test_groupby_head_tail():
    """``head``/``tail`` for both ``as_index`` settings and with column selection."""
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    g_as = df.groupby("A", as_index=True)
    g_not_as = df.groupby("A", as_index=False)

    # as_index= False, much easier
    assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
    assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))

    empty_not_as = DataFrame(
        columns=df.columns, index=pd.Index([], dtype=df.index.dtype)
    )
    empty_not_as["A"] = empty_not_as["A"].astype(df.A.dtype)
    empty_not_as["B"] = empty_not_as["B"].astype(df.B.dtype)
    # n == 0 and n == -1 are both compared against the empty frame
    for n in (0, -1):
        assert_frame_equal(empty_not_as, g_not_as.head(n))
        assert_frame_equal(empty_not_as, g_not_as.tail(n))

    assert_frame_equal(df, g_not_as.head(7))  # contains all
    assert_frame_equal(df, g_not_as.tail(7))

    # as_index=True, (used to be different)
    df_as = df

    assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
    assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))

    empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
    # the typed columns are taken from ``empty_not_as``
    empty_as["A"] = empty_not_as["A"].astype(df.A.dtype)
    empty_as["B"] = empty_not_as["B"].astype(df.B.dtype)
    for n in (0, -1):
        assert_frame_equal(empty_as, g_as.head(n))
        assert_frame_equal(empty_as, g_as.tail(n))

    assert_frame_equal(df_as, g_as.head(7))  # contains all
    assert_frame_equal(df_as, g_as.tail(7))

    # test with selection
    first_rows = [0, 2]
    assert_frame_equal(g_as[[]].head(1), df_as.loc[first_rows, []])
    assert_frame_equal(g_as[["A"]].head(1), df_as.loc[first_rows, ["A"]])
    assert_frame_equal(g_as[["B"]].head(1), df_as.loc[first_rows, ["B"]])
    assert_frame_equal(g_as[["A", "B"]].head(1), df_as.loc[first_rows])

    assert_frame_equal(g_not_as[[]].head(1), df_as.loc[first_rows, []])
    assert_frame_equal(g_not_as[["A"]].head(1), df_as.loc[first_rows, ["A"]])
    assert_frame_equal(g_not_as[["B"]].head(1), df_as.loc[first_rows, ["B"]])
    assert_frame_equal(g_not_as[["A", "B"]].head(1), df_as.loc[first_rows])
def test_group_selection_cache():
    """``nth``, ``head`` and ``tail`` give the same results in any call order."""
    # GH 12839 nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    expected = df.iloc[[0, 2]].set_index("A")

    # head, then nth
    g = df.groupby("A")
    whole = g.head(n=2)
    picked = g.nth(0)
    assert_frame_equal(whole, df)
    assert_frame_equal(picked, expected)

    # tail, then nth
    g = df.groupby("A")
    whole = g.tail(n=2)
    picked = g.nth(0)
    assert_frame_equal(whole, df)
    assert_frame_equal(picked, expected)

    # nth, then head
    g = df.groupby("A")
    picked = g.nth(0)
    whole = g.head(n=2)
    assert_frame_equal(picked, expected)
    assert_frame_equal(whole, df)

    # nth, then tail
    g = df.groupby("A")
    picked = g.nth(0)
    whole = g.tail(n=2)
    assert_frame_equal(picked, expected)
    assert_frame_equal(whole, df)
def test_nth_empty():
    """``nth`` with an out-of-range position on a one-row frame with no values."""
    # GH 16064
    df = DataFrame(index=[0], columns=["a", "b", "c"])

    # single grouping key
    single = df.groupby("a").nth(10)
    expected = DataFrame(index=Index([], name="a"), columns=["b", "c"])
    assert_frame_equal(single, expected)

    # two grouping keys
    double = df.groupby(["a", "b"]).nth(10)
    empty_index = MultiIndex([[], []], [[], []], names=["a", "b"])
    expected = DataFrame(index=empty_index, columns=["c"])
    assert_frame_equal(double, expected)
def test_nth_column_order():
    """``nth`` keeps the input column order."""
    # GH 20760
    # Check that nth preserves column order
    rows = [
        [1, "b", 100],
        [1, "a", 50],
        [1, "a", np.nan],
        [2, "c", 200],
        [2, "d", 150],
    ]
    df = DataFrame(rows, columns=["A", "C", "B"])

    first_rows = df.groupby("A").nth(0)
    expected = DataFrame(
        [["b", 100.0], ["c", 200.0]], columns=["C", "B"], index=Index([1, 2], name="A")
    )
    assert_frame_equal(first_rows, expected)

    last_complete_rows = df.groupby("A").nth(-1, dropna="any")
    expected = DataFrame(
        [["a", 50.0], ["d", 150.0]], columns=["C", "B"], index=Index([1, 2], name="A")
    )
    assert_frame_equal(last_complete_rows, expected)
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
    """``nth(0)`` when the grouping column contains NaN, for each ``dropna``."""
    # GH 26011
    rows = [
        [np.nan, 0, 1],
        ["abc", 2, 3],
        [np.nan, 4, 5],
        ["def", 6, 7],
        [np.nan, 8, 9],
    ]
    df = DataFrame(rows, columns=list("abc"))

    result = df.groupby("a").nth(0, dropna=dropna)

    # only the rows keyed "abc" and "def" are expected in the result
    expected = pd.DataFrame(
        [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a")
    )
    assert_frame_equal(result, expected)

View File

@@ -0,0 +1,444 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Series, concat
from pandas.util import testing as tm
def test_rank_apply():
    """Grouped ``rank`` equals ranking each group separately and concatenating."""
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    df = DataFrame(
        {
            "value": np.random.randn(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )
    keys = ["key1", "key2"]

    # plain ranks
    result = df.groupby(keys).value.rank()

    per_group = [piece.value.rank() for _, piece in df.groupby(keys)]
    expected = concat(per_group, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)

    # percentage ranks
    result = df.groupby(keys).value.rank(pct=True)

    per_group = [piece.value.rank(pct=True) for _, piece in df.groupby(keys)]
    expected = concat(per_group, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        [2, 2, 8, 2, 6],
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
        ],
    ],
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    """Grouped ``rank`` for each tie method, sort direction and ``pct`` flag."""
    n_groups = len(grps)

    # each group label is repeated once per value; the values are tiled to match
    key = np.repeat(grps, len(vals))
    tiled_vals = vals * n_groups
    df = DataFrame({"key": key, "val": tiled_vals})

    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, pct=pct
    )

    # every group holds the same values, so the expected ranks repeat per group
    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,exp",
    [
        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
    ],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    """Check grouped ``rank`` on data mixing ``-inf``, ``inf`` and NaN.

    Every ``method`` / ``ascending`` / ``na_option`` combination is compared
    against the hand-written expected ranks, repeated once per group.
    """
    # GH 20561
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option
    )

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
        [
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-02"),
            np.nan,
            pd.Timestamp("2018-01-08"),
            pd.Timestamp("2018-01-02"),
            pd.Timestamp("2018-01-06"),
            np.nan,
            np.nan,
        ],
    ],
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp",
    [
        (
            "average",
            True,
            "keep",
            False,
            [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
        ),
        (
            "average",
            True,
            "keep",
            True,
            [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            False,
            [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            True,
            [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
        ),
        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
        (
            "min",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        (
            "max",
            False,
            "keep",
            False,
            [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        (
            "first",
            True,
            "keep",
            False,
            [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
        ),
        (
            "first",
            True,
            "keep",
            True,
            [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            False,
            [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            True,
            [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            False,
            [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            True,
            [
                1.0 / 3.0,
                1.0 / 3.0,
                np.nan,
                3.0 / 3.0,
                1.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        (
            "dense",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            False,
            "keep",
            True,
            [
                3.0 / 3.0,
                3.0 / 3.0,
                np.nan,
                1.0 / 3.0,
                3.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
        (
            "average",
            True,
            "bottom",
            True,
            [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
        ),
        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
        (
            "average",
            False,
            "bottom",
            True,
            [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
        ),
        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
        (
            "min",
            True,
            "bottom",
            True,
            [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
        ),
        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
        (
            "min",
            False,
            "bottom",
            True,
            [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
        ),
        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
        (
            "max",
            False,
            "bottom",
            True,
            [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
        ),
        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
        (
            "first",
            True,
            "bottom",
            True,
            [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
        ),
        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
        (
            "first",
            False,
            "bottom",
            True,
            [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
        ),
        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
    ],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    """Check grouped ``rank`` on data with missing values.

    Covers each ``method`` / ``ascending`` / ``pct`` combination for the
    ``"keep"`` and ``"bottom"`` ``na_option`` settings.  ``vals`` and
    ``exp`` are repeated once per group in ``grps``.
    """
    key = np.repeat(grps, len(vals))
    vals = vals * len(grps)
    df = DataFrame({"key": key, "val": vals})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    exp_df = DataFrame(exp * len(grps), columns=["val"])
    tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize(
    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
    """Check that ranks are computed per group, not across the whole frame.

    Two groups of five identical values must each produce the same ranks
    (``exp``), so the expected frame is ``exp`` repeated twice.
    """
    df = DataFrame(
        {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
    )
    result = df.groupby("key").rank(pct=pct)
    exp_df = DataFrame(exp * 2, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
def test_rank_avg_even_vals():
    """Check the default grouped rank of four tied values in one group.

    With the default arguments every row is expected to get rank 2.5.
    """
    df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
    result = df.groupby("key").rank()
    exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
    tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_raises(ties_method, ascending, na_option, pct, vals):
    """Check that grouped ``rank`` on string values raises ``TypeError``.

    The error message is expected to match ``"not callable"`` for every
    combination of the rank arguments.
    """
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    with pytest.raises(TypeError, match="not callable"):
        df.groupby("key").rank(
            method=ties_method, ascending=ascending, na_option=na_option, pct=pct
        )
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    """Check that an invalid ``na_option`` makes grouped ``rank`` raise.

    A ``ValueError`` naming the accepted options is expected for string and
    numeric data alike.
    """
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        df.groupby("key").rank(
            method=ties_method, ascending=ascending, na_option=na_option, pct=pct
        )
def test_rank_empty_group():
    """Check ``rank(pct=True)`` when one group holds only a NaN value.

    The group whose single ``B`` value is NaN must yield NaN, while the
    other group's percentages are unaffected.  Both the SeriesGroupBy and
    the DataFrameGroupBy code paths are exercised.
    """
    # see gh-22519
    column = "A"
    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})

    result = df.groupby(column).B.rank(pct=True)
    expected = Series([0.5, np.nan, 1.0], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby(column).rank(pct=True)
    expected = DataFrame({"B": [0.5, np.nan, 1.0]})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    """Check dense percentage ranks for small groups, including NaN values.

    The parametrized cases include single-element groups and groups whose
    values are partly or entirely NaN.
    """
    # GH 23666
    df = DataFrame({"A": input_key, "B": input_value})

    result = df.groupby("A").rank(method="dense", pct=True)
    expected = DataFrame({"B": output_value})
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,759 @@
""" test with the TimeGrouper / grouping with datetimes """
from datetime import datetime
from io import StringIO
import numpy as np
from numpy import nan
import pytest
import pytz
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestGroupBy:
    """Tests for grouping with ``pd.Grouper`` and datetime-like keys."""

    def test_groupby_with_timegrouper(self):
        """Grouping by a 5-day ``Grouper`` matches ``resample("5D").sum()``."""
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame(
            {
                "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by="Quantity")

        for df in [df_original, df_reordered]:
            df = df.set_index(["Date"])

            expected = DataFrame(
                {"Quantity": 0},
                index=date_range(
                    "20130901", "20131205", freq="5D", name="Date", closed="left"
                ),
            )
            expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64")

            result1 = df.resample("5D").sum()
            assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(pd.Grouper(freq="5D")).sum()
            assert_frame_equal(result2, expected)

            result3 = df.groupby(pd.Grouper(freq="5D")).sum()
            assert_frame_equal(result3, expected)

    @pytest.mark.parametrize("should_sort", [True, False])
    def test_groupby_with_timegrouper_methods(self, should_sort):
        """A frequency ``Grouper`` yields a ``BinGrouper`` with dict groups."""
        # GH 3881
        # make sure API of timegrouper conforms

        df = pd.DataFrame(
            {
                "Branch": "A A A A A B".split(),
                "Buyer": "Carl Mark Carl Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 8, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        )

        if should_sort:
            df = df.sort_values(by="Quantity", ascending=False)

        df = df.set_index("Date", drop=False)
        g = df.groupby(pd.Grouper(freq="6M"))
        assert g.group_keys

        assert isinstance(g.grouper, BinGrouper)
        groups = g.groups
        assert isinstance(groups, dict)
        assert len(groups) == 3

    def test_timegrouper_with_reg_groups(self):
        """Combine a frequency ``Grouper`` with a regular column key."""
        # GH 3794
        # allow combination of timegrouper/reg groups

        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)

        for df in [df_original, df_sorted]:
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([pd.Grouper(freq="A"), "Buyer"]).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame(
                {
                    "Buyer": "Carl Mark Carl Joe".split(),
                    "Quantity": [1, 3, 9, 18],
                    "Date": [
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            result = df.groupby([pd.Grouper(freq="6MS"), "Buyer"]).sum()
            assert_frame_equal(result, expected)

        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 10, 1, 13, 0),
                    datetime(2013, 10, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 2, 12, 0),
                    datetime(2013, 10, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)
        for df in [df_original, df_sorted]:

            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark Carl Joe".split(),
                    "Quantity": [6, 8, 3, 4, 10],
                    "Date": [
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([pd.Grouper(freq="1D"), "Buyer"]).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq="1M"), "Buyer"]).sum()
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
                df.groupby([pd.Grouper(freq="1M", key="foo"), "Buyer"]).sum()

            # passing the level
            df = df.set_index("Date")
            result = df.groupby([pd.Grouper(freq="1M", level="Date"), "Buyer"]).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq="1M", level=0), "Buyer"]).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq="1M", level="foo"), "Buyer"]).sum()

            # multi names
            df = df.copy()
            df["Date"] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum()
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with pytest.raises(ValueError):
                df.groupby(
                    [pd.Grouper(freq="1M", key="Date", level="Date"), "Buyer"]
                ).sum()

            # single groupers
            expected = DataFrame(
                {"Quantity": [31], "Date": [datetime(2013, 10, 31, 0, 0)]}
            ).set_index("Date")
            result = df.groupby(pd.Grouper(freq="1M")).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq="1M")]).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame(
                {"Quantity": [31], "Date": [datetime(2013, 11, 30, 0, 0)]}
            ).set_index("Date")
            result = df.groupby(pd.Grouper(freq="1M", key="Date")).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq="1M", key="Date")]).sum()
            assert_frame_equal(result, expected)

    @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"])
    def test_timegrouper_with_reg_groups_freq(self, freq):
        """``Grouper`` plus a column key matches a per-user resample sum."""
        # GH 6764 multiple grouping with/without sort
        df = DataFrame(
            {
                "date": pd.to_datetime(
                    [
                        "20121002",
                        "20121007",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20121002",
                        "20121207",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20130202",
                        "20130305",
                    ]
                ),
                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
                "whole_cost": [
                    1790,
                    364,
                    280,
                    259,
                    201,
                    623,
                    90,
                    312,
                    359,
                    301,
                    359,
                    801,
                ],
                "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
            }
        ).set_index("date")

        expected = (
            df.groupby("user_id")["whole_cost"]
            .resample(freq)
            .sum(min_count=1)  # XXX
            .dropna()
            .reorder_levels(["date", "user_id"])
            .sort_index()
            .astype("int64")
        )
        expected.name = "whole_cost"

        result1 = (
            df.sort_index()
            .groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"]
            .sum()
        )
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"].sum()
        assert_series_equal(result2, expected)

    def test_timegrouper_get_group(self):
        """``get_group`` works with a monthly ``Grouper`` (key or index)."""
        # GH 6914

        df_original = DataFrame(
            {
                "Buyer": "Carl Joe Joe Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )
        df_reordered = df_original.sort_values(by="Quantity")

        # single grouping
        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]
        dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq="M", key="Date"))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [
            df_original.iloc[[1]],
            df_original.iloc[[3]],
            df_original.iloc[[4]],
        ]
        g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(["Buyer", pd.Grouper(freq="M", key="Date")])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index("Date")
        df_reordered = df_original.sort_values(by="Quantity")

        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq="M"))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

    def test_timegrouper_apply_return_type_series(self):
        """``apply`` returning a Series gives the same frame for both groupers."""
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_series(x):
            return pd.Series([x["value"].sum()], ("sum",))

        expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_series)
        result = df_dt.groupby(pd.Grouper(freq="M", key="date")).apply(sumfunc_series)
        assert_frame_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_timegrouper_apply_return_type_value(self):
        """``apply`` returning a scalar gives the same Series for both groupers."""
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_value(x):
            return x.value.sum()

        expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_value)
        result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value)
        assert_series_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_groupby_groups_datetimeindex(self):
        """``groups`` and ``get_group`` work when grouping a DatetimeIndex."""
        # GH#1430
        periods = 1000
        ind = pd.date_range(start="2012/1/1", freq="5min", periods=periods)
        df = DataFrame(
            {"high": np.arange(periods), "low": np.arange(periods)}, index=ind
        )
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(list(groups.keys())[0], datetime)

        # GH#11442
        index = pd.date_range("2015/01/01", periods=5, name="date")
        df = pd.DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level="date").groups
        dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
        expected = {
            pd.Timestamp(date): pd.DatetimeIndex([date], name="date") for date in dates
        }
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level="date")
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, "A"], df.loc[date, "B"]]]
            expected_index = pd.DatetimeIndex([date], name="date")
            expected = pd.DataFrame(data, columns=list("AB"), index=expected_index)
            tm.assert_frame_equal(result, expected)

    def test_groupby_groups_datetimeindex_tz(self):
        """Grouping on tz-aware datetimes keeps the timezone in the result index."""
        # GH 3950
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "datetime": dates,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))

        exp_idx1 = pd.DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Pacific",
            name="datetime",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["datetime", "label"]).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.DatetimeIndex(dates, tz="Asia/Tokyo")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = pd.DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            tz="Asia/Tokyo",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)

    def test_frame_datetime64_handling_groupby(self):
        """``first()`` on a ``np.datetime64`` column returns the first Timestamp."""
        # it works!
        df = DataFrame(
            [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
            columns=["a", "date"],
        )
        result = df.groupby("a").first()
        assert result["date"][3] == Timestamp("2012-07-03")

    def test_groupby_multi_timezone(self):
        """Localizing each group to its own timezone gives an object Series."""
        # combining multiple / different timezones yields UTC

        data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

        df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"])
        result = df.groupby("tz").date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
        )

        expected = Series(
            [
                Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
                Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
            ],
            name="date",
            dtype=object,
        )
        assert_series_equal(result, expected)

        tz = "America/Chicago"
        res_values = df.groupby("tz").date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(
            ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
            index=[0, 1, 3],
            name="date",
        )
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        assert_series_equal(result, expected)

    def test_groupby_groups_periods(self):
        """Grouping on Period values keeps a ``PeriodIndex`` in the result."""
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "period": [pd.Period(d, freq="H") for d in dates],
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )

        exp_idx1 = pd.PeriodIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            freq="H",
            name="period",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["period", "label"]).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.PeriodIndex(dates, freq="H")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = pd.PeriodIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            freq="H",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)

    def test_groupby_first_datetime64(self):
        """``first()`` keeps a datetime64 dtype for frames and Series."""
        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
        df[1] = df[1].view("M8[ns]")

        assert issubclass(df[1].dtype.type, np.datetime64)

        result = df.groupby(level=0).first()
        got_dt = result[1].dtype
        assert issubclass(got_dt.type, np.datetime64)

        result = df[1].groupby(level=0).first()
        got_dt = result.dtype
        assert issubclass(got_dt.type, np.datetime64)

    def test_groupby_max_datetime64(self):
        """Grouped ``max`` on a datetime column matches ``apply`` of ``max``."""
        # GH 5869
        # datetimelike dtype conversion from int
        df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5)))
        expected = df.groupby("A")["A"].apply(lambda x: x.max())
        result = df.groupby("A")["A"].max()
        assert_series_equal(result, expected)

    def test_groupby_datetime64_32_bit(self):
        """``transform(min)`` on a Timestamp column returns the Timestamps."""
        # GH 6410 / numpy 4328
        # 32-bit under 1.9-dev indexing issue

        df = DataFrame({"A": range(2), "B": [pd.Timestamp("2000-01-1")] * 2})
        result = df.groupby("A")["B"].transform(min)
        expected = Series([pd.Timestamp("2000-01-1")] * 2, name="B")
        assert_series_equal(result, expected)

    def test_groupby_with_timezone_selection(self):
        """Selecting a tz-aware column before or after ``max`` gives equal output."""
        # GH 11616
        # Test that column selection returns output in correct timezone.
        np.random.seed(42)
        df = pd.DataFrame(
            {
                "factor": np.random.randint(0, 3, size=60),
                "time": pd.date_range(
                    "01/01/2000 00:00", periods=60, freq="s", tz="UTC"
                ),
            }
        )
        df1 = df.groupby("factor").max()["time"]
        df2 = df.groupby("factor")["time"].max()
        tm.assert_series_equal(df1, df2)

    def test_timezone_info(self):
        """A tz-aware scalar keeps its ``tzinfo`` when stored in a DataFrame."""
        # see gh-11682: Timezone info lost when broadcasting
        # scalar datetime to DataFrame

        df = pd.DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
        assert df["b"][0].tzinfo == pytz.utc
        df = pd.DataFrame({"a": [1, 2, 3]})
        df["b"] = datetime.now(pytz.utc)
        assert df["b"][0].tzinfo == pytz.utc

    def test_datetime_count(self):
        """Grouped ``count`` on a datetime column counts rows per key."""
        df = DataFrame(
            {"a": [1, 2, 3] * 2, "dates": pd.date_range("now", periods=6, freq="T")}
        )
        result = df.groupby("a").dates.count()
        expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
        tm.assert_series_equal(result, expected)

    def test_first_last_max_min_on_time_data(self):
        """``max``/``min``/``first``/``last`` ignore missing datetime/timedelta rows."""
        # GH 10295
        # Verify that NaT is not in the result of max, min, first and last on
        # Dataframe with datetime or timedelta values.
        from datetime import timedelta as td

        df_test = DataFrame(
            {
                "dt": [
                    nan,
                    "2015-07-24 10:10",
                    "2015-07-25 11:11",
                    "2015-07-23 12:12",
                    nan,
                ],
                "td": [nan, td(days=1), td(days=2), td(days=3), nan],
            }
        )
        df_test.dt = pd.to_datetime(df_test.dt)
        df_test["group"] = "A"
        # reference frame: the same data with the missing rows removed
        df_ref = df_test[df_test.dt.notna()]

        grouped_test = df_test.groupby("group")
        grouped_ref = df_ref.groupby("group")

        assert_frame_equal(grouped_ref.max(), grouped_test.max())
        assert_frame_equal(grouped_ref.min(), grouped_test.min())
        assert_frame_equal(grouped_ref.first(), grouped_test.first())
        assert_frame_equal(grouped_ref.last(), grouped_test.last())

    def test_nunique_with_timegrouper_and_nat(self):
        """``nunique`` with an hourly ``Grouper`` matches the NaT-free input."""
        # GH 17575
        test = pd.DataFrame(
            {
                "time": [
                    Timestamp("2016-06-28 09:35:35"),
                    pd.NaT,
                    Timestamp("2016-06-28 16:46:28"),
                ],
                "data": ["1", "2", "3"],
            }
        )

        grouper = pd.Grouper(key="time", freq="h")
        result = test.groupby(grouper)["data"].nunique()
        expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
        tm.assert_series_equal(result, expected)

    def test_scalar_call_versus_list_call(self):
        """A ``Grouper`` passed bare or in a one-element list gives equal counts."""
        # Issue: 17530
        data_frame = {
            "location": ["shanghai", "beijing", "shanghai"],
            "time": pd.Series(
                ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
                dtype="datetime64[ns]",
            ),
            "value": [1, 2, 3],
        }
        data_frame = pd.DataFrame(data_frame).set_index("time")
        grouper = pd.Grouper(freq="D")

        grouped = data_frame.groupby(grouper)
        result = grouped.count()
        grouped = data_frame.groupby([grouper])
        expected = grouped.count()

        assert_frame_equal(result, expected)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,80 @@
"""
these are systematically testing all of the args to value_counts
with different size combinations. This is to ensure stability of the sorting
and proper parameter handling
"""
from itertools import product
import numpy as np
import pytest
from pandas import DataFrame, MultiIndex, Series, date_range
from pandas.util import testing as tm
# our starting frame
def seed_df(seed_nans, n, m):
    """Build the deterministic input frame used by the value_counts tests.

    The NumPy global random state is reseeded with ``1234`` on every call,
    so equal arguments always give an equal frame.

    Parameters
    ----------
    seed_nans : bool
        When true, NaN is written into each column at fixed strides
        (``1::11`` for ``"1st"``, ``3::17`` for ``"2nd"`` and
        ``7::19``, ``8::19``, ``9::19`` for ``"3rd"``).
    n : int
        Number of rows.
    m : int
        Largest value (inclusive) drawn for the ``"3rd"`` column.

    Returns
    -------
    DataFrame
        Columns ``"1st"`` (letters drawn from ``"abcd"``), ``"2nd"`` (days
        drawn from ten consecutive dates starting 2015-08-24) and ``"3rd"``
        (integers in ``[1, m]``).
    """
    np.random.seed(1234)
    days = date_range("2015-08-24", periods=10)

    frame = DataFrame(
        {
            "1st": np.random.choice(list("abcd"), n),
            "2nd": np.random.choice(days, n),
            "3rd": np.random.randint(1, m + 1, n),
        }
    )

    if seed_nans:
        frame.loc[1::11, "1st"] = np.nan
        frame.loc[3::17, "2nd"] = np.nan
        frame.loc[7::19, "3rd"] = np.nan
        frame.loc[8::19, "3rd"] = np.nan
        frame.loc[9::19, "3rd"] = np.nan

    return frame
# create input df, keys, and the bins
# ``binned`` holds one (df, key, bins, n, m) tuple per parametrized case and
# ``ids`` the matching pytest id, built from the key, n and m.
binned = []
ids = []
for seed_nans in [True, False]:
    for n, m in product((100, 1000), (5, 20)):

        df = seed_df(seed_nans, n, m)
        # either no binning, or bin edges in steps of 2 covering the "3rd" column
        bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
        keys = "1st", "2nd", ["1st", "2nd"]
        for k, b in product(keys, bins):
            binned.append((df, k, b, n, m))
            ids.append("{}-{}-{}".format(k, n, m))
@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
def test_series_groupby_value_counts(df, keys, bins, n, m):
    """Compare ``SeriesGroupBy.value_counts`` with ``apply(Series.value_counts)``.

    Every True/False combination of groupby ``sort`` and of the
    ``normalize``, ``sort``, ``ascending`` and ``dropna`` arguments is
    checked for the given frame, grouping key(s) and bins.  ``n`` and ``m``
    are only part of the parametrization and are not used in the body.
    """

    def rebuild_index(df):
        # rebuild the MultiIndex from its level values, keeping the names
        arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
        df.index = MultiIndex.from_arrays(arr, names=df.index.names)
        return df

    for isort, normalize, sort, ascending, dropna in product((False, True), repeat=5):

        kwargs = dict(
            normalize=normalize,
            sort=sort,
            ascending=ascending,
            dropna=dropna,
            bins=bins,
        )

        gr = df.groupby(keys, sort=isort)
        left = gr["3rd"].value_counts(**kwargs)

        gr = df.groupby(keys, sort=isort)
        right = gr["3rd"].apply(Series.value_counts, **kwargs)
        right.index.names = right.index.names[:-1] + ["3rd"]

        # have to sort on index because of unstable sort on values
        left, right = map(rebuild_index, (left, right))  # xref GH9212
        tm.assert_series_equal(left.sort_index(), right.sort_index())

View File

@@ -0,0 +1,378 @@
"""
Test methods relating to generic function evaluation:
the so-called white/black lists.
"""
from string import ascii_lowercase
import numpy as np
import pytest
from pandas import DataFrame, Index, MultiIndex, Series, date_range
from pandas.util import testing as tm
# Reductions that test_regression_whitelist_methods is parametrized over.
AGG_FUNCTIONS = [
    "sum", "prod",
    "min", "max",
    "median", "mean",
    "skew", "mad",
    "std", "var", "sem",
]

# Subset of the above that is called with an explicit ``skipna`` argument.
AGG_FUNCTIONS_WITH_SKIPNA = [
    "skew", "mad",
]
# Attribute names compared against ``_apply_whitelist`` of a DataFrame
# groupby in test_groupby_whitelist; also the params of df_whitelist_fixture.
df_whitelist = [
    "quantile", "fillna", "mad", "take",
    "idxmax", "idxmin",
    "tshift", "skew",
    "plot", "hist",
    "dtypes",
    "corrwith", "corr", "cov",
    "diff",
]
@pytest.fixture(params=df_whitelist)
def df_whitelist_fixture(request):
    """Provide one entry of ``df_whitelist`` per parametrized run."""
    name = request.param
    return name
# Attribute names compared against ``_apply_whitelist`` of a Series groupby
# in test_groupby_whitelist; also the params of s_whitelist_fixture.
s_whitelist = [
    "quantile", "fillna", "mad", "take",
    "idxmax", "idxmin",
    "tshift", "skew",
    "plot", "hist",
    "dtype",
    "corr", "cov",
    "diff",
    "unique",
    "nlargest", "nsmallest",
    "is_monotonic_increasing", "is_monotonic_decreasing",
]
@pytest.fixture(params=s_whitelist)
def s_whitelist_fixture(request):
    """Provide one entry of ``s_whitelist`` per parametrized run."""
    name = request.param
    return name
@pytest.fixture
def mframe():
    """Return a 10x3 frame of random normals on a two-level MultiIndex.

    The index levels are named "first" and "second"; the columns are
    "A", "B" and "C".
    """
    levels = [["foo", "bar", "baz", "qux"], ["one", "two", "three"]]
    codes = [[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]
    index = MultiIndex(levels=levels, codes=codes, names=["first", "second"])
    values = np.random.randn(10, 3)
    return DataFrame(values, index=index, columns=["A", "B", "C"])
@pytest.fixture
def df():
    """Return an 8-row frame with string keys "A"/"B" and random "C"/"D"."""
    a_keys = ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"]
    b_keys = ["one", "one", "two", "three", "two", "two", "one", "three"]
    # "C" is drawn before "D", matching the column order
    c_values = np.random.randn(8)
    d_values = np.random.randn(8)
    return DataFrame({"A": a_keys, "B": b_keys, "C": c_values, "D": d_values})
@pytest.fixture
def df_letters():
    """Return a 10-row frame of random floats and random lowercase letters.

    Column "floats" holds uniform random numbers and column "letters" holds
    letters picked at random from the ASCII lowercase alphabet.
    """
    N = 10
    alphabet = np.array(list(ascii_lowercase))
    # the letters are drawn first, the floats second
    picked = alphabet.take(np.random.randint(0, 26, N))
    floats = N / 10 * Series(np.random.random(N))
    return DataFrame({"floats": floats, "letters": Series(picked)})
@pytest.mark.parametrize("whitelist", [df_whitelist, s_whitelist])
def test_groupby_whitelist(df_letters, whitelist):
    """The groupby ``_apply_whitelist`` must equal the expected name list."""
    # the frame list is checked on the whole frame, the other list on the
    # "floats" column
    obj = df_letters if whitelist == df_whitelist else df_letters["floats"]

    gb = obj.groupby(df_letters.letters)
    assert set(gb._apply_whitelist) == set(whitelist)
def check_whitelist(obj, df, m):
    """Check that attribute ``m`` of a groupby type is named after itself.

    ``obj`` is grouped by ``df.letters`` and ``m`` is looked up on the type
    of the resulting groupby object. If the attribute has a ``__name__`` it
    must equal ``m``; if it also has a ``__qualname__`` that must end with
    ``m``. An attribute lacking either one is not checked any further.
    """
    missing = object()
    attr = getattr(type(obj.groupby(df.letters)), m)

    # name
    name = getattr(attr, "__name__", missing)
    if name is missing:
        return
    assert name == m

    # qualname
    qualname = getattr(attr, "__qualname__", missing)
    if qualname is missing:
        return
    assert qualname.endswith(m)
def test_groupby_series_whitelist(df_letters, s_whitelist_fixture):
    """Run check_whitelist on the "letters" column for one Series name."""
    check_whitelist(df_letters.letters, df_letters, s_whitelist_fixture)
def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture):
    """Run check_whitelist on the whole frame for one DataFrame name."""
    check_whitelist(df_letters, df_letters, df_whitelist_fixture)
@pytest.fixture
def raw_frame():
    """Return a 10x3 random frame on a MultiIndex with a few NaN cells.

    The row index levels are named "first" and "second", and the column
    index is named "exp".
    """
    row_index = MultiIndex(
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    values = np.random.randn(10, 3)
    columns = Index(["A", "B", "C"], name="exp")
    frame = DataFrame(values, index=row_index, columns=columns)

    # blank out two cells in each of two rows
    for row, cols in ((1, [1, 2]), (7, [0, 1])):
        frame.iloc[row, cols] = np.nan
    return frame
@pytest.mark.parametrize("op", AGG_FUNCTIONS)
@pytest.mark.parametrize("level", [0, 1])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna, sort):
    """A groupby reduction must match the frame's level-wise reduction.

    The frame is transposed for ``axis=1`` so that the MultiIndex lies on
    the grouped axis.
    """
    # GH6944
    # GH 17537
    # explicitly test the whitelist methods
    frame = raw_frame if axis == 0 else raw_frame.T

    # only the ops listed in AGG_FUNCTIONS_WITH_SKIPNA are passed skipna
    extra = {"skipna": skipna} if op in AGG_FUNCTIONS_WITH_SKIPNA else {}

    grouped = frame.groupby(level=level, axis=axis, sort=sort)
    result = getattr(grouped, op)(**extra)
    expected = getattr(frame, op)(level=level, axis=axis, **extra)
    if sort:
        expected = expected.sort_index(axis=axis, level=level)
    tm.assert_frame_equal(result, expected)
def test_groupby_blacklist(df_letters):
    """Blacklisted attributes must raise AttributeError on groupby objects.

    Every name is looked up on a groupby of the frame and on a groupby of
    its "floats" column, and the error text has to match one of the two
    expected message patterns.
    """
    df = df_letters
    s = df_letters.floats

    blacklist = [
        "eval", "query", "abs", "where", "mask", "align", "groupby",
        "clip", "astype", "at", "combine", "consolidate", "convert_objects",
    ]
    # every ``to_*`` attribute of the frame is checked as well
    blacklist += [name for name in dir(df) if name.startswith("to_")]

    patterns = [
        # defined but not allowed, e.g., to_csv
        "(?:^Cannot.+{0!r}.+{1!r}.+try using the 'apply' method$)",
        # not defined, e.g., query, eval
        "(?:^{1!r} object has no attribute {0!r}$)",
    ]
    fmt = "|".join(patterns)

    for bl in blacklist:
        for obj in (df, s):
            gb = obj.groupby(df.letters)
            expected_msg = fmt.format(bl, type(gb).__name__)
            with pytest.raises(AttributeError, match=expected_msg):
                getattr(gb, bl)
def test_tab_completion(mframe):
grp = mframe.groupby(level="second")
results = {v for v in dir(grp) if not v.startswith("_")}
expected = {
"A",
"B",
"C",
"agg",
"aggregate",
"apply",
"boxplot",
"filter",
"first",
"get_group",
"groups",
"hist",
"indices",
"last",
"max",
"mean",
"median",
"min",
"ngroups",
"nth",
"ohlc",
"plot",
"prod",
"size",
"std",
"sum",
"transform",
"var",
"sem",
"count",
"nunique",
"head",
"describe",
"cummax",
"quantile",
"rank",
"cumprod",
"tail",
"resample",
"cummin",
"fillna",
"cumsum",
"cumcount",
"ngroup",
"all",
"shift",
"skew",
"take",
"tshift",
"pct_change",
"any",
"mad",
"corr",
"corrwith",
"cov",
"dtypes",
"ndim",
"diff",
"idxmax",
"idxmin",
"ffill",
"bfill",
"pad",
"backfill",
"rolling",
"expanding",
"pipe",
}
assert results == expected
def test_groupby_function_rename(mframe):
grp = mframe.groupby(level="second")
for name in ["sum", "prod", "min", "max", "first", "last"]:
f = getattr(grp, name)
assert f.__name__ == name
def test_groupby_selection_with_methods(df):
    """Selecting a column after groupby must match grouping the selection.

    ``df.groupby(["A"])[["C"]]`` is compared with
    ``df[["C"]].groupby(df["A"])`` across a range of groupby methods.
    """
    # some methods which require DatetimeIndex
    df.index = date_range("2014", periods=len(df))

    g = df.groupby(["A"])[["C"]]
    g_exp = df[["C"]].groupby(df["A"])
    # TODO check groupby with > 1 col ?

    # methods which are called as .foo()
    methods = [
        "count", "corr",
        "cummax", "cummin", "cumprod",
        "describe", "rank", "quantile",
        "diff", "shift",
        "all", "any",
        "idxmin", "idxmax",
        "ffill", "bfill",
        "pct_change", "tshift",
    ]
    for m in methods:
        # should always be frames!
        tm.assert_frame_equal(getattr(g, m)(), getattr(g_exp, m)())

    # methods which aren't just .foo()
    calls = [
        lambda gb: gb.fillna(0),
        lambda gb: gb.dtypes,
        lambda gb: gb.apply(lambda x: x.sum()),
        lambda gb: gb.resample("D").mean(),
        lambda gb: gb.resample("D").ohlc(),
        lambda gb: gb.filter(lambda x: len(x) == 3),
    ]
    for call in calls:
        tm.assert_frame_equal(call(g), call(g_exp))