8th day of Python challenges 111-117

abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions


@@ -0,0 +1,862 @@
import numpy as np
from numpy.random import randn
import pytest
from pandas._libs import join as libjoin
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, concat, merge
from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal
a_ = np.array
class TestJoin:
def setup_method(self, method):
# aggregate multiple columns
self.df = DataFrame(
{
"key1": get_test_data(),
"key2": get_test_data(),
"data1": np.random.randn(N),
"data2": np.random.randn(N),
}
)
# exclude a couple keys for fun
self.df = self.df[self.df["key2"] > 1]
self.df2 = DataFrame(
{
"key1": get_test_data(n=N // 5),
"key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5),
"value": np.random.randn(N // 5),
}
)
index, data = tm.getMixedTypeDict()
self.target = DataFrame(data, index=index)
# Join on string value
self.source = DataFrame(
{"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"]
)
def test_cython_left_outer_join(self):
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
max_group = 5
ls, rs = libjoin.left_outer_join(left, right, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
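        # Expected take-indexers into the mergesort-sorted left/right values;
        # -1 in the right indexer marks left keys (the two 3s) that have no
        # match on the right and therefore join against a missing row.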
exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])
exp_ls = exp_ls.take(exp_li)
exp_ls[exp_li == -1] = -1
exp_rs = exp_rs.take(exp_ri)
exp_rs[exp_ri == -1] = -1
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_cython_right_outer_join(self):
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
max_group = 5
rs, ls = libjoin.left_outer_join(right, left, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
# 0 1 1 1
exp_li = a_(
[
0,
1,
2,
3,
4,
5,
3,
4,
5,
3,
4,
5,
# 2 2 4
6,
7,
8,
6,
7,
8,
-1,
]
)
exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])
exp_ls = exp_ls.take(exp_li)
exp_ls[exp_li == -1] = -1
exp_rs = exp_rs.take(exp_ri)
exp_rs[exp_ri == -1] = -1
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_cython_inner_join(self):
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
max_group = 5
ls, rs = libjoin.inner_join(left, right, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])
exp_ls = exp_ls.take(exp_li)
exp_ls[exp_li == -1] = -1
exp_rs = exp_rs.take(exp_ri)
exp_rs[exp_ri == -1] = -1
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_left_outer_join(self):
joined_key2 = merge(self.df, self.df2, on="key2")
_check_join(self.df, self.df2, joined_key2, ["key2"], how="left")
joined_both = merge(self.df, self.df2)
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left")
def test_right_outer_join(self):
joined_key2 = merge(self.df, self.df2, on="key2", how="right")
_check_join(self.df, self.df2, joined_key2, ["key2"], how="right")
joined_both = merge(self.df, self.df2, how="right")
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right")
def test_full_outer_join(self):
joined_key2 = merge(self.df, self.df2, on="key2", how="outer")
_check_join(self.df, self.df2, joined_key2, ["key2"], how="outer")
joined_both = merge(self.df, self.df2, how="outer")
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer")
def test_inner_join(self):
joined_key2 = merge(self.df, self.df2, on="key2", how="inner")
_check_join(self.df, self.df2, joined_key2, ["key2"], how="inner")
joined_both = merge(self.df, self.df2, how="inner")
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner")
def test_handle_overlap(self):
joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"])
assert "key1.foo" in joined
assert "key1.bar" in joined
def test_handle_overlap_arbitrary_key(self):
joined = merge(
self.df,
self.df2,
left_on="key2",
right_on="key1",
suffixes=[".foo", ".bar"],
)
assert "key1.foo" in joined
assert "key2.bar" in joined
def test_join_on(self):
target = self.target
source = self.source
merged = target.join(source, on="C")
tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False)
# join with duplicates (fix regression from DataFrame/Matrix merge)
df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
joined = df.join(df2, on="key")
expected = DataFrame(
{"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]}
)
assert_frame_equal(joined, expected)
# Test when some are missing
df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
joined = df_a.join(df_b, on="one")
joined = joined.join(df_c, on="one")
assert np.isnan(joined["two"]["c"])
assert np.isnan(joined["three"]["c"])
        # merge column not present
with pytest.raises(KeyError, match="^'E'$"):
target.join(source, on="E")
# overlap
source_copy = source.copy()
source_copy["A"] = 0
msg = (
"You are trying to merge on float64 and object columns. If"
" you wish to proceed you should use pd.concat"
)
with pytest.raises(ValueError, match=msg):
target.join(source_copy, on="A")
def test_join_on_fails_with_different_right_index(self):
df = DataFrame(
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
)
df2 = DataFrame(
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
index=tm.makeCustomIndex(10, 2),
)
msg = (
r"len\(left_on\) must equal the number of levels in the index" ' of "right"'
)
with pytest.raises(ValueError, match=msg):
merge(df, df2, left_on="a", right_index=True)
def test_join_on_fails_with_different_left_index(self):
df = DataFrame(
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)},
index=tm.makeCustomIndex(3, 2),
)
df2 = DataFrame(
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}
)
msg = (
r"len\(right_on\) must equal the number of levels in the index" ' of "left"'
)
with pytest.raises(ValueError, match=msg):
merge(df, df2, right_on="b", left_index=True)
def test_join_on_fails_with_different_column_counts(self):
df = DataFrame(
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
)
df2 = DataFrame(
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
index=tm.makeCustomIndex(10, 2),
)
msg = r"len\(right_on\) must equal len\(left_on\)"
with pytest.raises(ValueError, match=msg):
merge(df, df2, right_on="a", left_on=["a", "b"])
@pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
def test_join_on_fails_with_wrong_object_type(self, wrong_type):
# GH12081 - original issue
# GH21220 - merging of Series and DataFrame is now allowed
# Edited test to remove the Series object from test parameters
df = DataFrame({"a": [1, 1]})
msg = "Can only merge Series or DataFrame objects, a {} was passed".format(
str(type(wrong_type))
)
with pytest.raises(TypeError, match=msg):
merge(wrong_type, df, left_on="a", right_on="a")
with pytest.raises(TypeError, match=msg):
merge(df, wrong_type, left_on="a", right_on="a")
def test_join_on_pass_vector(self):
expected = self.target.join(self.source, on="C")
del expected["C"]
join_col = self.target.pop("C")
result = self.target.join(self.source, on=join_col)
assert_frame_equal(result, expected)
def test_join_with_len0(self):
# nothing to merge
merged = self.target.join(self.source.reindex([]), on="C")
for col in self.source:
assert col in merged
assert merged[col].isna().all()
merged2 = self.target.join(self.source.reindex([]), on="C", how="inner")
tm.assert_index_equal(merged2.columns, merged.columns)
assert len(merged2) == 0
def test_join_on_inner(self):
df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])
joined = df.join(df2, on="key", how="inner")
expected = df.join(df2, on="key")
expected = expected[expected["value"].notna()]
tm.assert_series_equal(joined["key"], expected["key"], check_dtype=False)
tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
tm.assert_index_equal(joined.index, expected.index)
def test_join_on_singlekey_list(self):
df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
# corner cases
joined = df.join(df2, on=["key"])
expected = df.join(df2, on="key")
assert_frame_equal(joined, expected)
def test_join_on_series(self):
result = self.target.join(self.source["MergedA"], on="C")
expected = self.target.join(self.source[["MergedA"]], on="C")
assert_frame_equal(result, expected)
def test_join_on_series_buglet(self):
# GH #638
df = DataFrame({"a": [1, 1]})
ds = Series([2], index=[1], name="b")
result = df.join(ds, on="a")
expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
tm.assert_frame_equal(result, expected)
def test_join_index_mixed(self, join_type):
# no overlapping blocks
df1 = DataFrame(index=np.arange(10))
df1["bool"] = True
df1["string"] = "foo"
df2 = DataFrame(index=np.arange(5, 15))
df2["int"] = 1
df2["float"] = 1.0
joined = df1.join(df2, how=join_type)
expected = _join_by_hand(df1, df2, how=join_type)
assert_frame_equal(joined, expected)
joined = df2.join(df1, how=join_type)
expected = _join_by_hand(df2, df1, how=join_type)
assert_frame_equal(joined, expected)
def test_join_index_mixed_overlap(self):
df1 = DataFrame(
{"A": 1.0, "B": 2, "C": "foo", "D": True},
index=np.arange(10),
columns=["A", "B", "C", "D"],
)
assert df1["B"].dtype == np.int64
assert df1["D"].dtype == np.bool_
df2 = DataFrame(
{"A": 1.0, "B": 2, "C": "foo", "D": True},
index=np.arange(0, 10, 2),
columns=["A", "B", "C", "D"],
)
# overlap
joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
expected_columns = [
"A_one",
"B_one",
"C_one",
"D_one",
"A_two",
"B_two",
"C_two",
"D_two",
]
df1.columns = expected_columns[:4]
df2.columns = expected_columns[4:]
expected = _join_by_hand(df1, df2)
assert_frame_equal(joined, expected)
def test_join_empty_bug(self):
# generated an exception in 0.4.3
x = DataFrame()
x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")
def test_join_unconsolidated(self):
# GH #331
a = DataFrame(randn(30, 2), columns=["a", "b"])
c = Series(randn(30))
a["c"] = c
d = DataFrame(randn(30, 1), columns=["q"])
# it works!
a.join(d)
d.join(a)
def test_join_multiindex(self):
index1 = MultiIndex.from_arrays(
[["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
names=["first", "second"],
)
index2 = MultiIndex.from_arrays(
[["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
names=["first", "second"],
)
df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])
df1 = df1.sort_index(level=0)
df2 = df2.sort_index(level=0)
joined = df1.join(df2, how="outer")
ex_index = Index(index1.values).union(Index(index2.values))
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
expected.index.names = index1.names
assert_frame_equal(joined, expected)
assert joined.index.names == index1.names
df1 = df1.sort_index(level=1)
df2 = df2.sort_index(level=1)
joined = df1.join(df2, how="outer").sort_index(level=0)
ex_index = Index(index1.values).union(Index(index2.values))
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
expected.index.names = index1.names
assert_frame_equal(joined, expected)
assert joined.index.names == index1.names
def test_join_inner_multiindex(self):
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
key2 = [
"two",
"one",
"three",
"one",
"two",
"one",
"two",
"two",
"three",
"one",
]
data = np.random.randn(len(key1))
data = DataFrame({"key1": key1, "key2": key2, "data": data})
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["first", "second"],
)
to_join = DataFrame(
np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
)
joined = data.join(to_join, on=["key1", "key2"], how="inner")
expected = merge(
data,
to_join.reset_index(),
left_on=["key1", "key2"],
right_on=["first", "second"],
how="inner",
sort=False,
)
expected2 = merge(
to_join,
data,
right_on=["key1", "key2"],
left_index=True,
how="inner",
sort=False,
)
assert_frame_equal(joined, expected2.reindex_like(joined))
expected2 = merge(
to_join,
data,
right_on=["key1", "key2"],
left_index=True,
how="inner",
sort=False,
)
expected = expected.drop(["first", "second"], axis=1)
expected.index = joined.index
assert joined.index.is_monotonic
assert_frame_equal(joined, expected)
# _assert_same_contents(expected, expected2.loc[:, expected.columns])
def test_join_hierarchical_mixed(self):
# GH 2024
df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
other_df.set_index("a", inplace=True)
# GH 9455, 12219
with tm.assert_produces_warning(UserWarning):
result = merge(new_df, other_df, left_index=True, right_index=True)
assert ("b", "mean") in result
assert "b" in result
def test_join_float64_float32(self):
a = DataFrame(randn(10, 2), columns=["a", "b"], dtype=np.float64)
b = DataFrame(randn(10, 1), columns=["c"], dtype=np.float32)
joined = a.join(b)
assert joined.dtypes["a"] == "float64"
assert joined.dtypes["b"] == "float64"
assert joined.dtypes["c"] == "float32"
a = np.random.randint(0, 5, 100).astype("int64")
b = np.random.random(100).astype("float64")
c = np.random.random(100).astype("float32")
df = DataFrame({"a": a, "b": b, "c": c})
xpdf = DataFrame({"a": a, "b": b, "c": c})
s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
rs = df.merge(s, left_on="a", right_index=True)
assert rs.dtypes["a"] == "int64"
assert rs.dtypes["b"] == "float64"
assert rs.dtypes["c"] == "float32"
assert rs.dtypes["md"] == "float32"
xp = xpdf.merge(s, left_on="a", right_index=True)
assert_frame_equal(rs, xp)
def test_join_many_non_unique_index(self):
df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
idf1 = df1.set_index(["a", "b"])
idf2 = df2.set_index(["a", "b"])
idf3 = df3.set_index(["a", "b"])
result = idf1.join([idf2, idf3], how="outer")
df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")
result = result.reset_index()
expected = expected[result.columns]
expected["a"] = expected.a.astype("int64")
expected["b"] = expected.b.astype("int64")
assert_frame_equal(result, expected)
df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
idf1 = df1.set_index(["a", "b"])
idf2 = df2.set_index(["a", "b"])
idf3 = df3.set_index(["a", "b"])
result = idf1.join([idf2, idf3], how="inner")
df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")
result = result.reset_index()
assert_frame_equal(result, expected.loc[:, result.columns])
# GH 11519
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.randn(8),
"D": np.random.randn(8),
}
)
s = Series(
np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST"
)
inner = df.join(s, how="inner")
outer = df.join(s, how="outer")
left = df.join(s, how="left")
right = df.join(s, how="right")
assert_frame_equal(inner, outer)
assert_frame_equal(inner, left)
assert_frame_equal(inner, right)
def test_join_sort(self):
left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
joined = left.join(right, on="key", sort=True)
expected = DataFrame(
{
"key": ["bar", "baz", "foo", "foo"],
"value": [2, 3, 1, 4],
"value2": ["a", "b", "c", "c"],
},
index=[1, 2, 0, 3],
)
assert_frame_equal(joined, expected)
# smoke test
joined = left.join(right, on="key", sort=False)
tm.assert_index_equal(joined.index, pd.Index(list(range(4))))
def test_join_mixed_non_unique_index(self):
# GH 12814, unorderable types in py3 with a non-unique index
df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
result = df1.join(df2)
expected = DataFrame(
{"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
index=[1, 2, 3, 3, "a"],
)
tm.assert_frame_equal(result, expected)
df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
result = df3.join(df4)
expected = DataFrame(
{"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"]
)
tm.assert_frame_equal(result, expected)
def test_join_non_unique_period_index(self):
# GH #16871
index = pd.period_range("2016-01-01", periods=16, freq="M")
df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"])
df2 = concat([df, df])
result = df.join(df2, how="inner", rsuffix="_df2")
expected = DataFrame(
np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
columns=["pnum", "pnum_df2"],
index=df2.sort_index().index,
)
tm.assert_frame_equal(result, expected)
def test_mixed_type_join_with_suffix(self):
# GH #916
df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
df.insert(0, "id", 0)
df.insert(5, "dt", "foo")
grouped = df.groupby("id")
mn = grouped.mean()
cn = grouped.count()
# it works!
mn.join(cn, rsuffix="_right")
def test_join_many(self):
df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]
joined = df_list[0].join(df_list[1:])
tm.assert_frame_equal(joined, df)
df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]]
def _check_diff_index(df_list, result, exp_index):
reindexed = [x.reindex(exp_index) for x in df_list]
expected = reindexed[0].join(reindexed[1:])
tm.assert_frame_equal(result, expected)
# different join types
joined = df_list[0].join(df_list[1:], how="outer")
_check_diff_index(df_list, joined, df.index)
joined = df_list[0].join(df_list[1:])
_check_diff_index(df_list, joined, df_list[0].index)
joined = df_list[0].join(df_list[1:], how="inner")
_check_diff_index(df_list, joined, df.index[2:8])
msg = "Joining multiple DataFrames only supported for joining on index"
with pytest.raises(ValueError, match=msg):
df_list[0].join(df_list[1:], on="a")
def test_join_many_mixed(self):
df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
df["key"] = ["foo", "bar"] * 4
df1 = df.loc[:, ["A", "B"]]
df2 = df.loc[:, ["C", "D"]]
df3 = df.loc[:, ["key"]]
result = df1.join([df2, df3])
assert_frame_equal(result, df)
def test_join_dups(self):
# joining dups
df = concat(
[
DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
DataFrame(
np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
),
],
axis=1,
)
expected = concat([df, df], axis=1)
result = df.join(df, rsuffix="_2")
result.columns = expected.columns
assert_frame_equal(result, expected)
# GH 4975, invalid join on dups
w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
dta = x.merge(y, left_index=True, right_index=True).merge(
z, left_index=True, right_index=True, how="outer"
)
dta = dta.merge(w, left_index=True, right_index=True)
expected = concat([x, y, z, w], axis=1)
expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
assert_frame_equal(dta, expected)
def test_join_multi_to_multi(self, join_type):
# GH 20475
leftindex = MultiIndex.from_product(
[list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]
)
left = DataFrame({"v1": range(12)}, index=leftindex)
rightindex = MultiIndex.from_product(
[list("abc"), list("xy")], names=["abc", "xy"]
)
right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)
result = left.join(right, on=["abc", "xy"], how=join_type)
expected = (
left.reset_index()
.merge(right.reset_index(), on=["abc", "xy"], how=join_type)
.set_index(["abc", "xy", "num"])
)
assert_frame_equal(expected, result)
msg = (
r"len\(left_on\) must equal the number of levels in the index" ' of "right"'
)
with pytest.raises(ValueError, match=msg):
left.join(right, on="xy", how=join_type)
with pytest.raises(ValueError, match=msg):
right.join(left, on=["abc", "xy"], how=join_type)
def test_join_on_tz_aware_datetimeindex(self):
# GH 23931, 26335
df1 = pd.DataFrame(
{
"date": pd.date_range(
start="2018-01-01", periods=5, tz="America/Chicago"
),
"vals": list("abcde"),
}
)
df2 = pd.DataFrame(
{
"date": pd.date_range(
start="2018-01-03", periods=5, tz="America/Chicago"
),
"vals_2": list("tuvwx"),
}
)
result = df1.join(df2.set_index("date"), on="date")
expected = df1.copy()
expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object)
assert_frame_equal(result, expected)
def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"):
# some smoke tests
for c in join_col:
assert result[c].notna().all()
left_grouped = left.groupby(join_col)
right_grouped = right.groupby(join_col)
for group_key, group in result.groupby(join_col):
l_joined = _restrict_to_columns(group, left.columns, lsuffix)
r_joined = _restrict_to_columns(group, right.columns, rsuffix)
try:
lgroup = left_grouped.get_group(group_key)
except KeyError:
if how in ("left", "inner"):
raise AssertionError(
"key {group_key!s} should not have been in the join".format(
group_key=group_key
)
)
_assert_all_na(l_joined, left.columns, join_col)
else:
_assert_same_contents(l_joined, lgroup)
try:
rgroup = right_grouped.get_group(group_key)
except KeyError:
if how in ("right", "inner"):
raise AssertionError(
"key {group_key!s} should not have been in the join".format(
group_key=group_key
)
)
_assert_all_na(r_joined, right.columns, join_col)
else:
_assert_same_contents(r_joined, rgroup)
def _restrict_to_columns(group, columns, suffix):
found = [
c for c in group.columns if c in columns or c.replace(suffix, "") in columns
]
# filter
group = group.loc[:, found]
# get rid of suffixes, if any
group = group.rename(columns=lambda x: x.replace(suffix, ""))
# put in the right order...
group = group.loc[:, columns]
return group
def _assert_same_contents(join_chunk, source):
NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
rows = {tuple(row) for row in jvalues}
assert len(rows) == len(source)
assert all(tuple(row) in rows for row in svalues)
def _assert_all_na(join_chunk, source_columns, join_col):
for c in source_columns:
if c in join_col:
continue
assert join_chunk[c].isna().all()
def _join_by_hand(a, b, how="left"):
join_index = a.index.join(b.index, how=how)
a_re = a.reindex(join_index)
b_re = b.reindex(join_index)
result_columns = a.columns.append(b.columns)
for col, s in b_re.items():
a_re[col] = s
return a_re.reindex(columns=result_columns)
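# A minimal sketch (not part of the upstream test suite) of how _join_by_hand
# mirrors DataFrame.join; the tiny frames here are hypothetical.
def _join_by_hand_demo():
    a = DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])
    b = DataFrame({"y": [10.0, 30.0]}, index=["a", "c"])
    by_hand = _join_by_hand(a, b, how="left")
    built_in = a.join(b, how="left")
    # DataFrame.equals treats aligned NaNs as equal, so the NaN produced for
    # index "b" in column "y" does not break the comparison.
    assert by_hand.equals(built_in)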

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,188 @@
import numpy as np
import pytest
from pandas import DataFrame
from pandas.util.testing import assert_frame_equal
@pytest.fixture
def df1():
return DataFrame(
dict(
outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
v1=np.linspace(0, 1, 11),
)
)
@pytest.fixture
def df2():
return DataFrame(
dict(
outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
v2=np.linspace(10, 11, 12),
)
)
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def left_df(request, df1):
""" Construct left test DataFrame with specified levels
(any of 'outer', 'inner', and 'v1')"""
levels = request.param
if levels:
df1 = df1.set_index(levels)
return df1
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def right_df(request, df2):
""" Construct right test DataFrame with specified levels
(any of 'outer', 'inner', and 'v2')"""
levels = request.param
if levels:
df2 = df2.set_index(levels)
return df2
def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
"""
Compute the expected merge result for the test case.
This method computes the expected result of merging two DataFrames on
a combination of their columns and index levels. It does so by
explicitly dropping/resetting their named index levels, performing a
merge on their columns, and then finally restoring the appropriate
index in the result.
Parameters
----------
df_left : DataFrame
The left DataFrame (may have zero or more named index levels)
df_right : DataFrame
The right DataFrame (may have zero or more named index levels)
on : list of str
The on parameter to the merge operation
left_on : list of str
The left_on parameter to the merge operation
right_on : list of str
The right_on parameter to the merge operation
how : str
The how parameter to the merge operation
Returns
-------
DataFrame
The expected merge result
"""
# Handle on param if specified
if on is not None:
left_on, right_on = on, on
# Compute input named index levels
left_levels = [n for n in df_left.index.names if n is not None]
right_levels = [n for n in df_right.index.names if n is not None]
# Compute output named index levels
output_levels = [i for i in left_on if i in right_levels and i in left_levels]
# Drop index levels that aren't involved in the merge
drop_left = [n for n in left_levels if n not in left_on]
if drop_left:
df_left = df_left.reset_index(drop_left, drop=True)
drop_right = [n for n in right_levels if n not in right_on]
if drop_right:
df_right = df_right.reset_index(drop_right, drop=True)
# Convert remaining index levels to columns
reset_left = [n for n in left_levels if n in left_on]
if reset_left:
df_left = df_left.reset_index(level=reset_left)
reset_right = [n for n in right_levels if n in right_on]
if reset_right:
df_right = df_right.reset_index(level=reset_right)
# Perform merge
expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)
# Restore index levels
if output_levels:
expected = expected.set_index(output_levels)
return expected
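# A minimal illustration (hypothetical, not part of the upstream suite) of
# what compute_expected does: with "outer" as an index level on the left only,
# it resets that level to a column, merges on columns, and leaves "outer" as a
# column because it is not an index level on both sides.
def _compute_expected_demo():
    left = DataFrame(dict(outer=[1, 1, 2], v1=[0.0, 0.5, 1.0])).set_index("outer")
    right = DataFrame(dict(outer=[1, 2, 3], v2=[10.0, 10.5, 11.0]))
    expected = compute_expected(left, right, on=["outer"], how="inner")
    result = left.merge(right, on=["outer"], how="inner")
    assert_frame_equal(result, expected, check_like=True)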
@pytest.mark.parametrize(
"on,how",
[
(["outer"], "inner"),
(["inner"], "left"),
(["outer", "inner"], "right"),
(["inner", "outer"], "outer"),
],
)
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
# Construct expected result
expected = compute_expected(left_df, right_df, on=on, how=how)
# Perform merge
result = left_df.merge(right_df, on=on, how=how)
assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize(
"left_on,right_on,how",
[
(["outer"], ["outer"], "inner"),
(["inner"], ["inner"], "right"),
(["outer", "inner"], ["outer", "inner"], "left"),
(["inner", "outer"], ["inner", "outer"], "outer"),
],
)
def test_merge_indexes_and_columns_lefton_righton(
left_df, right_df, left_on, right_on, how
):
# Construct expected result
expected = compute_expected(
left_df, right_df, left_on=left_on, right_on=right_on, how=how
)
# Perform merge
result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
# Construct left_df
left_df = df1.set_index(left_index)
# Construct right_df
right_df = df2.set_index(["outer", "inner"])
# Result
expected = (
left_df.reset_index()
.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)
.set_index(left_index)
)
# Perform join
result = left_df.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)
assert_frame_equal(result, expected, check_like=True)


@@ -0,0 +1,117 @@
from numpy import nan
import pytest
import pandas as pd
from pandas import DataFrame, merge_ordered
from pandas.util.testing import assert_frame_equal
class TestMergeOrdered:
def setup_method(self, method):
self.left = DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})
self.right = DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})
def test_basic(self):
result = merge_ordered(self.left, self.right, on="key")
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"],
"lvalue": [1, nan, 2, nan, 3, nan],
"rvalue": [nan, 1, 2, 3, nan, 4],
}
)
assert_frame_equal(result, expected)
def test_ffill(self):
result = merge_ordered(self.left, self.right, on="key", fill_method="ffill")
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"],
"lvalue": [1.0, 1, 2, 2, 3, 3.0],
"rvalue": [nan, 1, 2, 3, 3, 4],
}
)
assert_frame_equal(result, expected)
def test_multigroup(self):
left = pd.concat([self.left, self.left], ignore_index=True)
left["group"] = ["a"] * 3 + ["b"] * 3
result = merge_ordered(
left, self.right, on="key", left_by="group", fill_method="ffill"
)
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"] * 2,
"lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
"rvalue": [nan, 1, 2, 3, 3, 4] * 2,
}
)
expected["group"] = ["a"] * 6 + ["b"] * 6
assert_frame_equal(result, expected.loc[:, result.columns])
result2 = merge_ordered(
self.right, left, on="key", right_by="group", fill_method="ffill"
)
assert_frame_equal(result, result2.loc[:, result.columns])
result = merge_ordered(left, self.right, on="key", left_by="group")
assert result["group"].notna().all()
def test_merge_type(self):
class NotADataFrame(DataFrame):
@property
def _constructor(self):
return NotADataFrame
nad = NotADataFrame(self.left)
result = nad.merge(self.right, on="key")
assert isinstance(result, NotADataFrame)
def test_empty_sequence_concat(self):
# GH 9157
empty_pat = "[Nn]o objects"
none_pat = "objects.*None"
test_cases = [
((), empty_pat),
([], empty_pat),
({}, empty_pat),
([None], none_pat),
([None, None], none_pat),
]
for df_seq, pattern in test_cases:
with pytest.raises(ValueError, match=pattern):
pd.concat(df_seq)
pd.concat([pd.DataFrame()])
pd.concat([None, pd.DataFrame()])
pd.concat([pd.DataFrame(), None])
def test_doc_example(self):
left = DataFrame(
{
"group": list("aaabbb"),
"key": ["a", "c", "e", "a", "c", "e"],
"lvalue": [1, 2, 3] * 2,
}
)
right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
result = merge_ordered(left, right, fill_method="ffill", left_by="group")
expected = DataFrame(
{
"group": list("aaaaabbbbb"),
"key": ["a", "b", "c", "d", "e"] * 2,
"lvalue": [1, 1, 2, 2, 3] * 2,
"rvalue": [nan, 1, 2, 3, 3] * 2,
}
)
assert_frame_equal(result, expected)


@@ -0,0 +1,810 @@
from collections import OrderedDict
import numpy as np
from numpy import nan
from numpy.random import randn
import pytest
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge
import pandas.util.testing as tm
@pytest.fixture
def left():
"""left dataframe (not multi-indexed) for multi-index join tests"""
# a little relevant example with NAs
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]
data = np.random.randn(len(key1))
return DataFrame({"key1": key1, "key2": key2, "data": data})
@pytest.fixture
def right():
"""right dataframe (multi-indexed) for multi-index join tests"""
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["key1", "key2"],
)
return DataFrame(
np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
)
@pytest.fixture
def left_multi():
return DataFrame(
dict(
Origin=["A", "A", "B", "B", "C"],
Destination=["A", "B", "A", "C", "A"],
Period=["AM", "AM", "IP", "AM", "OP"],
TripPurp=["hbw", "nhb", "hbo", "nhb", "hbw"],
Trips=[1987, 3647, 2470, 4296, 4444],
),
columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
).set_index(["Origin", "Destination", "Period", "TripPurp"])
@pytest.fixture
def right_multi():
return DataFrame(
dict(
Origin=["A", "A", "B", "B", "C", "C", "E"],
Destination=["A", "B", "A", "B", "A", "B", "F"],
Period=["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
LinkType=["a", "b", "c", "b", "a", "b", "a"],
Distance=[100, 80, 90, 80, 75, 35, 55],
),
columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
).set_index(["Origin", "Destination", "Period", "LinkType"])
@pytest.fixture
def on_cols_multi():
return ["Origin", "Destination", "Period"]
@pytest.fixture
def idx_cols_multi():
return ["Origin", "Destination", "Period", "TripPurp", "LinkType"]
class TestMergeMulti:
def setup_method(self):
self.index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["first", "second"],
)
self.to_join = DataFrame(
np.random.randn(10, 3),
index=self.index,
columns=["j_one", "j_two", "j_three"],
)
# a little relevant example with NAs
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
key2 = [
"two",
"one",
"three",
"one",
"two",
"one",
"two",
"two",
"three",
"one",
]
data = np.random.randn(len(key1))
self.data = DataFrame({"key1": key1, "key2": key2, "data": data})
def test_merge_on_multikey(self, left, right, join_type):
on_cols = ["key1", "key2"]
result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)
expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type)
tm.assert_frame_equal(result, expected)
result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
drop=True
)
expected = pd.merge(
left, right.reset_index(), on=on_cols, how=join_type, sort=True
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("sort", [False, True])
def test_left_join_multi_index(self, left, right, sort):
icols = ["1st", "2nd", "3rd"]
        def bind_cols(df):
            # NaN is the only value for which a != a, so NaNs encode as 0
            iord = lambda a: 0 if a != a else ord(a)
            f = lambda ts: ts.map(iord) - ord("a")
            return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4
def run_asserts(left, right, sort):
res = left.join(right, on=icols, how="left", sort=sort)
assert len(left) < len(res) + 1
assert not res["4th"].isna().any()
assert not res["5th"].isna().any()
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
result = bind_cols(res.iloc[:, :-2])
tm.assert_series_equal(res["4th"], result, check_names=False)
assert result.name is None
if sort:
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
res.index = np.arange(len(res))
tm.assert_frame_equal(out, res)
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"])
left.insert(1, "2nd", np.random.randint(0, 1000, len(left)))
i = np.random.permutation(len(left))
right = left.iloc[i].copy()
left["4th"] = bind_cols(left)
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
# inject some nulls
left.loc[1::23, "1st"] = np.nan
left.loc[2::37, "2nd"] = np.nan
left.loc[3::43, "3rd"] = np.nan
left["4th"] = bind_cols(left)
i = np.random.permutation(len(left))
right = left.iloc[i, :-1]
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
@pytest.mark.parametrize("sort", [False, True])
def test_merge_right_vs_left(self, left, right, sort):
# compare left vs right merge with multikey
on_cols = ["key1", "key2"]
merged_left_right = left.merge(
right, left_on=on_cols, right_index=True, how="left", sort=sort
)
merge_right_left = right.merge(
left, right_on=on_cols, left_index=True, how="right", sort=sort
)
# Reorder columns
merge_right_left = merge_right_left[merged_left_right.columns]
tm.assert_frame_equal(merged_left_right, merge_right_left)
def test_compress_group_combinations(self):
# ~ 40000000 possible unique groups
key1 = tm.rands_array(10, 10000)
key1 = np.tile(key1, 2)
key2 = key1[::-1]
df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)})
df2 = DataFrame(
{"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)}
)
# just to hit the label compression code path
merge(df, df2, how="outer")
def test_left_join_index_preserve_order(self):
on_cols = ["k1", "k2"]
left = DataFrame(
{
"k1": [0, 1, 2] * 8,
"k2": ["foo", "bar"] * 12,
"v": np.array(np.arange(24), dtype=np.int64),
}
)
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
right = DataFrame({"v2": [5, 7]}, index=index)
result = left.join(right, on=on_cols)
expected = left.copy()
expected["v2"] = np.nan
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
tm.assert_frame_equal(result, expected)
result.sort_values(on_cols, kind="mergesort", inplace=True)
expected = left.join(right, on=on_cols, sort=True)
tm.assert_frame_equal(result, expected)
# test join with multi dtypes blocks
left = DataFrame(
{
"k1": [0, 1, 2] * 8,
"k2": ["foo", "bar"] * 12,
"k3": np.array([0, 1, 2] * 8, dtype=np.float32),
"v": np.array(np.arange(24), dtype=np.int32),
}
)
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
right = DataFrame({"v2": [5, 7]}, index=index)
result = left.join(right, on=on_cols)
expected = left.copy()
expected["v2"] = np.nan
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
tm.assert_frame_equal(result, expected)
result = result.sort_values(on_cols, kind="mergesort")
expected = left.join(right, on=on_cols, sort=True)
tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match_multiindex(self):
left = DataFrame(
[
["X", "Y", "C", "a"],
["W", "Y", "C", "e"],
["V", "Q", "A", "h"],
["V", "R", "D", "i"],
["X", "Y", "D", "b"],
["X", "Y", "A", "c"],
["W", "Q", "B", "f"],
["W", "R", "C", "g"],
["V", "Y", "C", "j"],
["X", "Y", "B", "d"],
],
columns=["cola", "colb", "colc", "tag"],
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
)
right = DataFrame(
[
["W", "R", "C", 0],
["W", "Q", "B", 3],
["W", "Q", "B", 8],
["X", "Y", "A", 1],
["X", "Y", "A", 4],
["X", "Y", "B", 5],
["X", "Y", "C", 6],
["X", "Y", "C", 9],
["X", "Q", "C", -6],
["X", "R", "C", -9],
["V", "Y", "C", 7],
["V", "R", "D", 2],
["V", "R", "D", -1],
["V", "Q", "A", -3],
],
columns=["col1", "col2", "col3", "val"],
).set_index(["col1", "col2", "col3"])
result = left.join(right, on=["cola", "colb", "colc"], how="left")
expected = DataFrame(
[
["X", "Y", "C", "a", 6],
["X", "Y", "C", "a", 9],
["W", "Y", "C", "e", nan],
["V", "Q", "A", "h", -3],
["V", "R", "D", "i", 2],
["V", "R", "D", "i", -1],
["X", "Y", "D", "b", nan],
["X", "Y", "A", "c", 1],
["X", "Y", "A", "c", 4],
["W", "Q", "B", "f", 3],
["W", "Q", "B", "f", 8],
["W", "R", "C", "g", 0],
["V", "Y", "C", "j", 7],
["X", "Y", "B", "d", 5],
],
columns=["cola", "colb", "colc", "tag", "val"],
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
)
tm.assert_frame_equal(result, expected)
result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)
expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")
tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match(self):
left = DataFrame(
[["c", 0], ["b", 1], ["a", 2], ["b", 3]],
columns=["tag", "val"],
index=[2, 0, 1, 3],
)
right = DataFrame(
[
["a", "v"],
["c", "w"],
["c", "x"],
["d", "y"],
["a", "z"],
["c", "r"],
["e", "q"],
["c", "s"],
],
columns=["tag", "char"],
).set_index("tag")
result = left.join(right, on="tag", how="left")
expected = DataFrame(
[
["c", 0, "w"],
["c", 0, "x"],
["c", 0, "r"],
["c", 0, "s"],
["b", 1, nan],
["a", 2, "v"],
["a", 2, "z"],
["b", 3, nan],
],
columns=["tag", "val", "char"],
index=[2, 2, 2, 2, 0, 1, 1, 3],
)
tm.assert_frame_equal(result, expected)
result = left.join(right, on="tag", how="left", sort=True)
expected2 = expected.sort_values("tag", kind="mergesort")
tm.assert_frame_equal(result, expected2)
# GH7331 - maintain left frame order in left merge
result = merge(left, right.reset_index(), how="left", on="tag")
expected.index = np.arange(len(expected))
tm.assert_frame_equal(result, expected)
def test_left_merge_na_buglet(self):
left = DataFrame(
{
"id": list("abcde"),
"v1": randn(5),
"v2": randn(5),
"dummy": list("abcde"),
"v3": randn(5),
},
columns=["id", "v1", "v2", "dummy", "v3"],
)
right = DataFrame(
{
"id": ["a", "b", np.nan, np.nan, np.nan],
"sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
}
)
result = merge(left, right, on="id", how="left")
rdf = right.drop(["id"], axis=1)
expected = left.join(rdf)
tm.assert_frame_equal(result, expected)
def test_merge_na_keys(self):
data = [
[1950, "A", 1.5],
[1950, "B", 1.5],
[1955, "B", 1.5],
[1960, "B", np.nan],
[1970, "B", 4.0],
[1950, "C", 4.0],
[1960, "C", np.nan],
[1965, "C", 3.0],
[1970, "C", 4.0],
]
frame = DataFrame(data, columns=["year", "panel", "data"])
other_data = [
[1960, "A", np.nan],
[1970, "A", np.nan],
[1955, "A", np.nan],
[1965, "A", np.nan],
[1965, "B", np.nan],
[1955, "C", np.nan],
]
other = DataFrame(other_data, columns=["year", "panel", "data"])
result = frame.merge(other, how="outer")
expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
expected = expected.replace(-999, np.nan)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, klass):
# see gh-19038
df = DataFrame(
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
)
df.index = pd.to_datetime(df.index)
on_vector = df.index.year
if klass is not None:
on_vector = klass(on_vector)
expected = DataFrame(
OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])])
)
result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)
expected = DataFrame(
OrderedDict(
[("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])]
)
)
result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)
def test_join_multi_levels(self):
# GH 3662
# merge multi-levels
household = DataFrame(
dict(
household_id=[1, 2, 3],
male=[0, 1, 0],
wealth=[196087.3, 316478.7, 294750],
),
columns=["household_id", "male", "wealth"],
).set_index("household_id")
portfolio = DataFrame(
dict(
household_id=[1, 2, 2, 3, 3, 3, 4],
asset_id=[
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
np.nan,
],
name=[
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
"Royal Dutch Shell",
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
np.nan,
],
share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
),
columns=["household_id", "asset_id", "name", "share"],
).set_index(["household_id", "asset_id"])
result = household.join(portfolio, how="inner")
expected = (
DataFrame(
dict(
male=[0, 1, 1, 0, 0, 0],
wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0],
name=[
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
"Royal Dutch Shell",
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
],
share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
household_id=[1, 2, 2, 3, 3, 3],
asset_id=[
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
],
)
)
.set_index(["household_id", "asset_id"])
.reindex(columns=["male", "wealth", "name", "share"])
)
tm.assert_frame_equal(result, expected)
# equivalency
result = merge(
household.reset_index(),
portfolio.reset_index(),
on=["household_id"],
how="inner",
).set_index(["household_id", "asset_id"])
tm.assert_frame_equal(result, expected)
result = household.join(portfolio, how="outer")
expected = concat(
[
expected,
(
DataFrame(
dict(share=[1.00]),
index=MultiIndex.from_tuples(
[(4, np.nan)], names=["household_id", "asset_id"]
),
)
),
],
axis=0,
sort=True,
).reindex(columns=expected.columns)
tm.assert_frame_equal(result, expected)
# invalid cases
household.index.name = "foo"
with pytest.raises(ValueError):
household.join(portfolio, how="inner")
portfolio2 = portfolio.copy()
portfolio2.index.set_names(["household_id", "foo"])
with pytest.raises(ValueError):
portfolio2.join(portfolio, how="inner")
def test_join_multi_levels2(self):
# some more advanced merges
# GH6360
household = DataFrame(
dict(
household_id=[1, 2, 2, 3, 3, 3, 4],
asset_id=[
"nl0000301109",
"nl0000301109",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
np.nan,
],
share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
),
columns=["household_id", "asset_id", "share"],
).set_index(["household_id", "asset_id"])
log_return = DataFrame(
dict(
asset_id=[
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
t=[233, 234, 235, 180, 181],
log_return=[0.09604978, -0.06524096, 0.03532373, 0.03025441, 0.036997],
)
).set_index(["asset_id", "t"])
expected = (
DataFrame(
dict(
household_id=[2, 2, 2, 3, 3, 3, 3, 3],
asset_id=[
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
t=[233, 234, 235, 233, 234, 235, 180, 181],
share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
log_return=[
0.09604978,
-0.06524096,
0.03532373,
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
],
)
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
)
# this is the equivalency
result = merge(
household.reset_index(),
log_return.reset_index(),
on=["asset_id"],
how="inner",
).set_index(["household_id", "asset_id", "t"])
tm.assert_frame_equal(result, expected)
expected = (
DataFrame(
dict(
household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
asset_id=[
"nl0000301109",
"nl0000301109",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
"nl0000289965",
None,
],
t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None],
share=[
1.0,
0.4,
0.6,
0.6,
0.6,
0.15,
0.15,
0.15,
0.6,
0.6,
0.25,
1.0,
],
log_return=[
None,
None,
0.09604978,
-0.06524096,
0.03532373,
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
None,
None,
],
)
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
)
result = merge(
household.reset_index(),
log_return.reset_index(),
on=["asset_id"],
how="outer",
).set_index(["household_id", "asset_id", "t"])
tm.assert_frame_equal(result, expected)
class TestJoinMultiMulti:
def test_join_multi_multi(
self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
):
# Multi-index join tests
expected = (
pd.merge(
left_multi.reset_index(),
right_multi.reset_index(),
how=join_type,
on=on_cols_multi,
)
.set_index(idx_cols_multi)
.sort_index()
)
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)
def test_join_multi_empty_frames(
self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
):
left_multi = left_multi.drop(columns=left_multi.columns)
right_multi = right_multi.drop(columns=right_multi.columns)
expected = (
pd.merge(
left_multi.reset_index(),
right_multi.reset_index(),
how=join_type,
on=on_cols_multi,
)
.set_index(idx_cols_multi)
.sort_index()
)
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, box):
# see gh-19038
df = DataFrame(
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
)
df.index = pd.to_datetime(df.index)
on_vector = df.index.year
if box is not None:
on_vector = box(on_vector)
expected = DataFrame(
OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])])
)
result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)
expected = DataFrame(
OrderedDict(
[("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])]
)
)
result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)
def test_single_common_level(self):
index_left = pd.MultiIndex.from_tuples(
[("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
)
left = pd.DataFrame(
{"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left
)
index_right = pd.MultiIndex.from_tuples(
[("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
)
right = pd.DataFrame(
{"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
index=index_right,
)
result = left.join(right)
expected = pd.merge(
left.reset_index(), right.reset_index(), on=["key"], how="inner"
).set_index(["key", "X", "Y"])
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large


@@ -0,0 +1,587 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
Series,
TimedeltaIndex,
Timestamp,
cut,
date_range,
isna,
qcut,
timedelta_range,
to_datetime,
)
from pandas.api.types import CategoricalDtype as CDT
import pandas.core.reshape.tile as tmod
import pandas.util.testing as tm
def test_simple():
data = np.ones(5, dtype="int64")
result = cut(data, 4, labels=False)
expected = np.array([1, 1, 1, 1, 1])
tm.assert_numpy_array_equal(result, expected, check_dtype=False)
def test_bins():
data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1])
result, bins = cut(data, 3, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
intervals = intervals.take([0, 0, 0, 1, 2, 0])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))
def test_right():
data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=True, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
expected = Categorical(intervals, ordered=True)
expected = expected.take([0, 0, 0, 2, 3, 0, 0])
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))
def test_no_right():
data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=False, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
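# The edge values asserted above follow from cut() widening the binned range
# by 0.1% so the extreme data points fall inside a bin: the data span
# 0.2..9.7, so the adjustment is 0.001 * (9.7 - 0.2) = 0.0095, giving a first
# edge of 0.2 - 0.0095 = 0.1905 when right=True and a last edge of
# 9.7 + 0.0095 = 9.7095 when right=False.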
def test_array_like():
data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
result, bins = cut(data, 3, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
intervals = intervals.take([0, 0, 0, 1, 2, 0])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))
def test_bins_from_interval_index():
c = cut(range(5), 3)
expected = c
result = cut(range(5), bins=expected.categories)
tm.assert_categorical_equal(result, expected)
expected = Categorical.from_codes(
np.append(c.codes, -1), categories=c.categories, ordered=True
)
result = cut(range(6), bins=expected.categories)
tm.assert_categorical_equal(result, expected)
def test_bins_from_interval_index_doc_example():
# Make sure we preserve the bins.
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
c = cut(ages, bins=[0, 18, 35, 70])
expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
tm.assert_index_equal(c.categories, expected)
result = cut([25, 20, 50], bins=c.categories)
tm.assert_index_equal(result.categories, expected)
tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8"))
def test_bins_not_overlapping_from_interval_index():
# see gh-23980
msg = "Overlapping IntervalIndex is not accepted"
ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])
with pytest.raises(ValueError, match=msg):
cut([5, 6], bins=ii)
def test_bins_not_monotonic():
msg = "bins must increase monotonically"
data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
with pytest.raises(ValueError, match=msg):
cut(data, [0.1, 1.5, 1, 10])
@pytest.mark.parametrize(
"x, bins, expected",
[
(
date_range("2017-12-31", periods=3),
[Timestamp.min, Timestamp("2018-01-01"), Timestamp.max],
IntervalIndex.from_tuples(
[
(Timestamp.min, Timestamp("2018-01-01")),
(Timestamp("2018-01-01"), Timestamp.max),
]
),
),
(
[-1, 0, 1],
np.array(
[np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"
),
IntervalIndex.from_tuples(
[(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)]
),
),
(
[np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)],
np.array(
[
np.timedelta64(-np.iinfo(np.int64).max),
np.timedelta64(0),
np.timedelta64(np.iinfo(np.int64).max),
]
),
IntervalIndex.from_tuples(
[
(np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)),
(np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max)),
]
),
),
],
)
def test_bins_monotonic_not_overflowing(x, bins, expected):
# GH 26045
result = cut(x, bins)
tm.assert_index_equal(result.categories, expected)
def test_wrong_num_labels():
msg = "Bin labels must be one fewer than the number of bin edges"
data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
with pytest.raises(ValueError, match=msg):
cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])
@pytest.mark.parametrize(
"x,bins,msg",
[
([], 2, "Cannot cut empty array"),
([1, 2, 3], 0.5, "`bins` should be a positive integer"),
],
)
def test_cut_corner(x, bins, msg):
with pytest.raises(ValueError, match=msg):
cut(x, bins)
@pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize("cut_func", [cut, qcut])
def test_cut_not_1d_arg(arg, cut_func):
msg = "Input array must be 1 dimensional"
with pytest.raises(ValueError, match=msg):
cut_func(arg, 2)
@pytest.mark.parametrize(
"data",
[
[0, 1, 2, 3, 4, np.inf],
[-np.inf, 0, 1, 2, 3, 4],
[-np.inf, 0, 1, 2, 3, 4, np.inf],
],
)
def test_int_bins_with_inf(data):
# GH 24314
msg = "cannot specify integer `bins` when input data contains infinity"
with pytest.raises(ValueError, match=msg):
cut(data, bins=3)
def test_cut_out_of_range_more():
# see gh-1511
name = "x"
ser = Series([0, -1, 0, 1, -3], name=name)
ind = cut(ser, [0, 1], labels=False)
exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
tm.assert_series_equal(ind, exp)
@pytest.mark.parametrize(
"right,breaks,closed",
[
(True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
(False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"),
],
)
def test_labels(right, breaks, closed):
arr = np.tile(np.arange(0, 1.01, 0.1), 4)
result, bins = cut(arr, 4, retbins=True, right=right)
ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
tm.assert_index_equal(result.categories, ex_levels)
def test_cut_pass_series_name_to_factor():
name = "foo"
ser = Series(np.random.randn(100), name=name)
factor = cut(ser, 4)
assert factor.name == name
def test_label_precision():
arr = np.arange(0, 0.73, 0.01)
result = cut(arr, 4, precision=2)
ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
tm.assert_index_equal(result.categories, ex_levels)
@pytest.mark.parametrize("labels", [None, False])
def test_na_handling(labels):
arr = np.arange(0, 0.75, 0.01)
arr[::3] = np.nan
result = cut(arr, 4, labels=labels)
result = np.asarray(result)
expected = np.where(isna(arr), np.nan, result)
tm.assert_almost_equal(result, expected)
def test_inf_handling():
data = np.arange(6)
data_ser = Series(data, dtype="int64")
bins = [-np.inf, 2, 4, np.inf]
result = cut(data, bins)
result_ser = cut(data_ser, bins)
ex_uniques = IntervalIndex.from_breaks(bins)
tm.assert_index_equal(result.categories, ex_uniques)
assert result[5] == Interval(4, np.inf)
assert result[0] == Interval(-np.inf, 2)
assert result_ser[5] == Interval(4, np.inf)
assert result_ser[0] == Interval(-np.inf, 2)
def test_cut_out_of_bounds():
arr = np.random.randn(100)
result = cut(arr, [-1, 0, 1])
mask = isna(result)
ex_mask = (arr < -1) | (arr > 1)
tm.assert_numpy_array_equal(mask, ex_mask)
@pytest.mark.parametrize(
"get_labels,get_expected",
[
(
lambda labels: labels,
lambda labels: Categorical(
["Medium"] + 4 * ["Small"] + ["Medium", "Large"],
categories=labels,
ordered=True,
),
),
(
lambda labels: Categorical.from_codes([0, 1, 2], labels),
lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels),
),
],
)
def test_cut_pass_labels(get_labels, get_expected):
bins = [0, 25, 50, 100]
arr = [50, 5, 10, 15, 20, 30, 70]
labels = ["Small", "Medium", "Large"]
result = cut(arr, bins, labels=get_labels(labels))
tm.assert_categorical_equal(result, get_expected(labels))
def test_cut_pass_labels_compat():
# see gh-16459
arr = [50, 5, 10, 15, 20, 30, 70]
labels = ["Good", "Medium", "Bad"]
result = cut(arr, 3, labels=labels)
exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True))
tm.assert_categorical_equal(result, exp)
@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10])
def test_round_frac_just_works(x):
# It works.
cut(x, 2)
@pytest.mark.parametrize(
"val,precision,expected",
[
(-117.9998, 3, -118),
(117.9998, 3, 118),
(117.9998, 2, 118),
(0.000123456, 2, 0.00012),
],
)
def test_round_frac(val, precision, expected):
# see gh-1979
result = tmod._round_frac(val, precision=precision)
assert result == expected
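# Note (assumption, since this file's import block is not shown here):
# ``tmod`` is the conventional alias for pandas.core.reshape.tile, and
# _round_frac is its private helper that rounds a bin edge to
# ``precision`` significant fractional digits, as the cases above show.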
def test_cut_return_intervals():
ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
result = cut(ser, 3)
exp_bins = np.linspace(0, 8, num=4).round(3)
exp_bins[0] -= 0.008
expected = Series(
IntervalIndex.from_breaks(exp_bins, closed="right").take(
[0, 0, 0, 1, 1, 1, 2, 2, 2]
)
).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
def test_series_ret_bins():
# see gh-8589
ser = Series(np.arange(4))
result, bins = cut(ser, 2, retbins=True)
expected = Series(
IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2)
).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,msg",
[
(dict(duplicates="drop"), None),
(dict(), "Bin edges must be unique"),
(dict(duplicates="raise"), "Bin edges must be unique"),
(dict(duplicates="foo"), "invalid value for 'duplicates' parameter"),
],
)
def test_cut_duplicates_bin(kwargs, msg):
# see gh-20947
bins = [0, 2, 4, 6, 10, 10]
values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])
if msg is not None:
with pytest.raises(ValueError, match=msg):
cut(values, bins, **kwargs)
else:
result = cut(values, bins, **kwargs)
expected = cut(values, pd.unique(bins))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
@pytest.mark.parametrize("length", [1, 2])
def test_single_bin(data, length):
# see gh-14652, gh-15428
ser = Series([data] * length)
result = cut(ser, 1, labels=False)
expected = Series([0] * length)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)]
)
def test_cut_read_only(array_1_writeable, array_2_writeable):
# issue 18773
array_1 = np.arange(0, 100, 10)
array_1.flags.writeable = array_1_writeable
array_2 = np.arange(0, 100, 10)
array_2.flags.writeable = array_2_writeable
hundred_elements = np.arange(100)
tm.assert_categorical_equal(
cut(hundred_elements, array_1), cut(hundred_elements, array_2)
)
@pytest.mark.parametrize(
"conv",
[
lambda v: Timestamp(v),
lambda v: to_datetime(v),
lambda v: np.datetime64(v),
lambda v: Timestamp(v).to_pydatetime(),
],
)
def test_datetime_bin(conv):
data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]
expected = Series(
IntervalIndex(
[
Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
]
)
).astype(CDT(ordered=True))
bins = [conv(v) for v in bin_data]
result = Series(cut(data, bins=bins))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
[
np.datetime64("2013-01-01"),
np.datetime64("2013-01-02"),
np.datetime64("2013-01-03"),
],
np.array(
[
np.datetime64("2013-01-01"),
np.datetime64("2013-01-02"),
np.datetime64("2013-01-03"),
]
),
DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]),
],
)
def test_datetime_cut(data):
# see gh-14714
#
# Testing time data when it comes in various collection types.
result, _ = cut(data, 3, retbins=True)
expected = Series(
IntervalIndex(
[
Interval(
Timestamp("2012-12-31 23:57:07.200000"),
Timestamp("2013-01-01 16:00:00"),
),
Interval(
Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00")
),
Interval(
Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00")
),
]
)
).astype(CDT(ordered=True))
tm.assert_series_equal(Series(result), expected)
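# Note: the 23:57:07.200000 left edge is the data minimum pushed back
# by 0.1% of the full range (0.1% of two days is 172.8 seconds), which
# is how cut with integer bins keeps the smallest value inside the
# first interval.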
@pytest.mark.parametrize(
"bins",
[
3,
[
Timestamp("2013-01-01 04:57:07.200000"),
Timestamp("2013-01-01 21:00:00"),
Timestamp("2013-01-02 13:00:00"),
Timestamp("2013-01-03 05:00:00"),
],
],
)
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut(bins, box):
# see gh-19872
tz = "US/Eastern"
s = Series(date_range("20130101", periods=3, tz=tz))
if not isinstance(bins, int):
bins = box(bins)
result = cut(s, bins)
expected = Series(
IntervalIndex(
[
Interval(
Timestamp("2012-12-31 23:57:07.200000", tz=tz),
Timestamp("2013-01-01 16:00:00", tz=tz),
),
Interval(
Timestamp("2013-01-01 16:00:00", tz=tz),
Timestamp("2013-01-02 08:00:00", tz=tz),
),
Interval(
Timestamp("2013-01-02 08:00:00", tz=tz),
Timestamp("2013-01-03 00:00:00", tz=tz),
),
]
)
).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
def test_datetime_nan_error():
msg = "bins must be of datetime64 dtype"
with pytest.raises(ValueError, match=msg):
cut(date_range("20130101", periods=3), bins=[0, 2, 4])
def test_datetime_nan_mask():
result = cut(
date_range("20130102", periods=5), bins=date_range("20130101", periods=2)
)
mask = result.categories.isna()
tm.assert_numpy_array_equal(mask, np.array([False]))
mask = result.isna()
tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True]))
@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_datetime_cut_roundtrip(tz):
# see gh-19891
ser = Series(date_range("20180101", periods=3, tz=tz))
result, result_bins = cut(ser, 2, retbins=True)
expected = cut(ser, result_bins)
tm.assert_series_equal(result, expected)
expected_bins = DatetimeIndex(
["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"]
)
expected_bins = expected_bins.tz_localize(tz)
tm.assert_index_equal(result_bins, expected_bins)
def test_timedelta_cut_roundtrip():
# see gh-19891
ser = Series(timedelta_range("1day", periods=3))
result, result_bins = cut(ser, 2, retbins=True)
expected = cut(ser, result_bins)
tm.assert_series_equal(result, expected)
expected_bins = TimedeltaIndex(
["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"]
)
tm.assert_index_equal(result_bins, expected_bins)


@@ -0,0 +1,964 @@
import numpy as np
from numpy import nan
import pytest
import pandas as pd
from pandas import DataFrame, lreshape, melt, wide_to_long
import pandas.util.testing as tm
class TestMelt:
def setup_method(self, method):
self.df = tm.makeTimeDataFrame()[:10]
self.df["id1"] = (self.df["A"] > 0).astype(np.int64)
self.df["id2"] = (self.df["B"] > 0).astype(np.int64)
self.var_name = "var"
self.value_name = "val"
self.df1 = pd.DataFrame(
[
[1.067683, -1.110463, 0.20867],
[-1.321405, 0.368915, -1.055342],
[-0.807333, 0.08298, -0.873361],
]
)
self.df1.columns = [list("ABC"), list("abc")]
self.df1.columns.names = ["CAP", "low"]
def test_top_level_method(self):
result = melt(self.df)
assert result.columns.tolist() == ["variable", "value"]
def test_method_signatures(self):
tm.assert_frame_equal(self.df.melt(), melt(self.df))
tm.assert_frame_equal(
self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]),
melt(self.df, id_vars=["id1", "id2"], value_vars=["A", "B"]),
)
tm.assert_frame_equal(
self.df.melt(var_name=self.var_name, value_name=self.value_name),
melt(self.df, var_name=self.var_name, value_name=self.value_name),
)
tm.assert_frame_equal(self.df1.melt(col_level=0), melt(self.df1, col_level=0))
def test_default_col_names(self):
result = self.df.melt()
assert result.columns.tolist() == ["variable", "value"]
result1 = self.df.melt(id_vars=["id1"])
assert result1.columns.tolist() == ["id1", "variable", "value"]
result2 = self.df.melt(id_vars=["id1", "id2"])
assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
def test_value_vars(self):
result3 = self.df.melt(id_vars=["id1", "id2"], value_vars="A")
assert len(result3) == 10
result4 = self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"])
expected4 = DataFrame(
{
"id1": self.df["id1"].tolist() * 2,
"id2": self.df["id2"].tolist() * 2,
"variable": ["A"] * 10 + ["B"] * 10,
"value": (self.df["A"].tolist() + self.df["B"].tolist()),
},
columns=["id1", "id2", "variable", "value"],
)
tm.assert_frame_equal(result4, expected4)
def test_value_vars_types(self):
# GH 15348
expected = DataFrame(
{
"id1": self.df["id1"].tolist() * 2,
"id2": self.df["id2"].tolist() * 2,
"variable": ["A"] * 10 + ["B"] * 10,
"value": (self.df["A"].tolist() + self.df["B"].tolist()),
},
columns=["id1", "id2", "variable", "value"],
)
for type_ in (tuple, list, np.array):
result = self.df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B")))
tm.assert_frame_equal(result, expected)
def test_vars_work_with_multiindex(self):
expected = DataFrame(
{
("A", "a"): self.df1[("A", "a")],
"CAP": ["B"] * len(self.df1),
"low": ["b"] * len(self.df1),
"value": self.df1[("B", "b")],
},
columns=[("A", "a"), "CAP", "low", "value"],
)
result = self.df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")])
tm.assert_frame_equal(result, expected)
def test_single_vars_work_with_multiindex(self):
expected = DataFrame(
{
"A": {0: 1.067683, 1: -1.321405, 2: -0.807333},
"CAP": {0: "B", 1: "B", 2: "B"},
"value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
}
)
result = self.df1.melt(["A"], ["B"], col_level=0)
tm.assert_frame_equal(result, expected)
def test_tuple_vars_fail_with_multiindex(self):
# melt should fail with an informative error message if
# the columns have a MultiIndex and a tuple is passed
# for id_vars or value_vars.
tuple_a = ("A", "a")
list_a = [tuple_a]
tuple_b = ("B", "b")
list_b = [tuple_b]
msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex"
for id_vars, value_vars in (
(tuple_a, list_b),
(list_a, tuple_b),
(tuple_a, tuple_b),
):
with pytest.raises(ValueError, match=msg):
self.df1.melt(id_vars=id_vars, value_vars=value_vars)
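# Illustrative sketch (not an original test): wrapping each tuple in a
# list is the accepted form that the check above points to.
def example_multiindex_melt_accepts_lists():
    df = pd.DataFrame(
        [[1, 2]], columns=pd.MultiIndex.from_tuples([("A", "a"), ("B", "b")])
    )
    melted = df.melt(id_vars=[("A", "a")], value_vars=[("B", "b")])
    assert list(melted["value"]) == [2]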
def test_custom_var_name(self):
result5 = self.df.melt(var_name=self.var_name)
assert result5.columns.tolist() == ["var", "value"]
result6 = self.df.melt(id_vars=["id1"], var_name=self.var_name)
assert result6.columns.tolist() == ["id1", "var", "value"]
result7 = self.df.melt(id_vars=["id1", "id2"], var_name=self.var_name)
assert result7.columns.tolist() == ["id1", "id2", "var", "value"]
result8 = self.df.melt(
id_vars=["id1", "id2"], value_vars="A", var_name=self.var_name
)
assert result8.columns.tolist() == ["id1", "id2", "var", "value"]
result9 = self.df.melt(
id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=self.var_name
)
expected9 = DataFrame(
{
"id1": self.df["id1"].tolist() * 2,
"id2": self.df["id2"].tolist() * 2,
self.var_name: ["A"] * 10 + ["B"] * 10,
"value": (self.df["A"].tolist() + self.df["B"].tolist()),
},
columns=["id1", "id2", self.var_name, "value"],
)
tm.assert_frame_equal(result9, expected9)
def test_custom_value_name(self):
result10 = self.df.melt(value_name=self.value_name)
assert result10.columns.tolist() == ["variable", "val"]
result11 = self.df.melt(id_vars=["id1"], value_name=self.value_name)
assert result11.columns.tolist() == ["id1", "variable", "val"]
result12 = self.df.melt(id_vars=["id1", "id2"], value_name=self.value_name)
assert result12.columns.tolist() == ["id1", "id2", "variable", "val"]
result13 = self.df.melt(
id_vars=["id1", "id2"], value_vars="A", value_name=self.value_name
)
assert result13.columns.tolist() == ["id1", "id2", "variable", "val"]
result14 = self.df.melt(
id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=self.value_name
)
expected14 = DataFrame(
{
"id1": self.df["id1"].tolist() * 2,
"id2": self.df["id2"].tolist() * 2,
"variable": ["A"] * 10 + ["B"] * 10,
self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()),
},
columns=["id1", "id2", "variable", self.value_name],
)
tm.assert_frame_equal(result14, expected14)
def test_custom_var_and_value_name(self):
result15 = self.df.melt(var_name=self.var_name, value_name=self.value_name)
assert result15.columns.tolist() == ["var", "val"]
result16 = self.df.melt(
id_vars=["id1"], var_name=self.var_name, value_name=self.value_name
)
assert result16.columns.tolist() == ["id1", "var", "val"]
result17 = self.df.melt(
id_vars=["id1", "id2"], var_name=self.var_name, value_name=self.value_name
)
assert result17.columns.tolist() == ["id1", "id2", "var", "val"]
result18 = self.df.melt(
id_vars=["id1", "id2"],
value_vars="A",
var_name=self.var_name,
value_name=self.value_name,
)
assert result18.columns.tolist() == ["id1", "id2", "var", "val"]
result19 = self.df.melt(
id_vars=["id1", "id2"],
value_vars=["A", "B"],
var_name=self.var_name,
value_name=self.value_name,
)
expected19 = DataFrame(
{
"id1": self.df["id1"].tolist() * 2,
"id2": self.df["id2"].tolist() * 2,
self.var_name: ["A"] * 10 + ["B"] * 10,
self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()),
},
columns=["id1", "id2", self.var_name, self.value_name],
)
tm.assert_frame_equal(result19, expected19)
df20 = self.df.copy()
df20.columns.name = "foo"
result20 = df20.melt()
assert result20.columns.tolist() == ["foo", "value"]
def test_col_level(self):
res1 = self.df1.melt(col_level=0)
res2 = self.df1.melt(col_level="CAP")
assert res1.columns.tolist() == ["CAP", "value"]
assert res2.columns.tolist() == ["CAP", "value"]
def test_multiindex(self):
res = self.df1.melt()
assert res.columns.tolist() == ["CAP", "low", "value"]
@pytest.mark.parametrize(
"col",
[
pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")),
pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
pd.Series([0, 1, 0, 0, 0]),
],
)
def test_pandas_dtypes(self, col):
# GH 15785
df = DataFrame(
{"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col}
)
expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True)
result = melt(
df, id_vars=["klass", "col"], var_name="attribute", value_name="value"
)
expected = DataFrame(
{
0: list(range(5)) * 2,
1: pd.concat([col] * 2, ignore_index=True),
2: ["attr1"] * 5 + ["attr2"] * 5,
3: expected_value,
}
)
expected.columns = ["klass", "col", "attribute", "value"]
tm.assert_frame_equal(result, expected)
def test_melt_missing_columns_raises(self):
# GH-23575
# This test is to ensure that pandas raises an error if melting is
# attempted with column names absent from the dataframe
# Generate data
df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd"))
# Try to melt with missing `value_vars` column name
msg = "The following '{Var}' are not present in the DataFrame: {Col}"
with pytest.raises(
KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]")
):
df.melt(["a", "b"], ["C", "d"])
# Try to melt with missing `id_vars` column name
with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")):
df.melt(["A", "b"], ["c", "d"])
# Multiple missing
with pytest.raises(
KeyError,
match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"),
):
df.melt(["a", "b", "not_here", "or_there"], ["c", "d"])
# Multiindex melt fails if column is missing from multilevel melt
multi = df.copy()
multi.columns = [list("ABCD"), list("abcd")]
with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")):
multi.melt([("E", "a")], [("B", "b")])
# Multiindex fails if column is missing from single level melt
with pytest.raises(
KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]")
):
multi.melt(["A"], ["F"], col_level=0)
class TestLreshape:
def test_pairs(self):
data = {
"birthdt": [
"08jan2009",
"20dec2008",
"30dec2008",
"21dec2008",
"11jan2009",
],
"birthwt": [1766, 3301, 1454, 3139, 4133],
"id": [101, 102, 103, 104, 105],
"sex": ["Male", "Female", "Female", "Female", "Female"],
"visitdt1": [
"11jan2009",
"22dec2008",
"04jan2009",
"29dec2008",
"20jan2009",
],
"visitdt2": ["21jan2009", nan, "22jan2009", "31dec2008", "03feb2009"],
"visitdt3": ["05feb2009", nan, nan, "02jan2009", "15feb2009"],
"wt1": [1823, 3338, 1549, 3298, 4306],
"wt2": [2011.0, nan, 1892.0, 3338.0, 4575.0],
"wt3": [2293.0, nan, nan, 3377.0, 4805.0],
}
df = DataFrame(data)
spec = {
"visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 4)],
"wt": ["wt{i:d}".format(i=i) for i in range(1, 4)],
}
result = lreshape(df, spec)
exp_data = {
"birthdt": [
"08jan2009",
"20dec2008",
"30dec2008",
"21dec2008",
"11jan2009",
"08jan2009",
"30dec2008",
"21dec2008",
"11jan2009",
"08jan2009",
"21dec2008",
"11jan2009",
],
"birthwt": [
1766,
3301,
1454,
3139,
4133,
1766,
1454,
3139,
4133,
1766,
3139,
4133,
],
"id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
"sex": [
"Male",
"Female",
"Female",
"Female",
"Female",
"Male",
"Female",
"Female",
"Female",
"Male",
"Female",
"Female",
],
"visitdt": [
"11jan2009",
"22dec2008",
"04jan2009",
"29dec2008",
"20jan2009",
"21jan2009",
"22jan2009",
"31dec2008",
"03feb2009",
"05feb2009",
"02jan2009",
"15feb2009",
],
"wt": [
1823.0,
3338.0,
1549.0,
3298.0,
4306.0,
2011.0,
1892.0,
3338.0,
4575.0,
2293.0,
3377.0,
4805.0,
],
}
exp = DataFrame(exp_data, columns=result.columns)
tm.assert_frame_equal(result, exp)
result = lreshape(df, spec, dropna=False)
exp_data = {
"birthdt": [
"08jan2009",
"20dec2008",
"30dec2008",
"21dec2008",
"11jan2009",
"08jan2009",
"20dec2008",
"30dec2008",
"21dec2008",
"11jan2009",
"08jan2009",
"20dec2008",
"30dec2008",
"21dec2008",
"11jan2009",
],
"birthwt": [
1766,
3301,
1454,
3139,
4133,
1766,
3301,
1454,
3139,
4133,
1766,
3301,
1454,
3139,
4133,
],
"id": [
101,
102,
103,
104,
105,
101,
102,
103,
104,
105,
101,
102,
103,
104,
105,
],
"sex": [
"Male",
"Female",
"Female",
"Female",
"Female",
"Male",
"Female",
"Female",
"Female",
"Female",
"Male",
"Female",
"Female",
"Female",
"Female",
],
"visitdt": [
"11jan2009",
"22dec2008",
"04jan2009",
"29dec2008",
"20jan2009",
"21jan2009",
nan,
"22jan2009",
"31dec2008",
"03feb2009",
"05feb2009",
nan,
nan,
"02jan2009",
"15feb2009",
],
"wt": [
1823.0,
3338.0,
1549.0,
3298.0,
4306.0,
2011.0,
nan,
1892.0,
3338.0,
4575.0,
2293.0,
nan,
nan,
3377.0,
4805.0,
],
}
exp = DataFrame(exp_data, columns=result.columns)
tm.assert_frame_equal(result, exp)
spec = {
"visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 3)],
"wt": ["wt{i:d}".format(i=i) for i in range(1, 4)],
}
msg = "All column lists must be same length"
with pytest.raises(ValueError, match=msg):
lreshape(df, spec)
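# Illustrative sketch (not an original test) of the spec contract
# exercised above: each key becomes one long-format column, and every
# value list must name the same number of wide columns.
def example_lreshape_spec():
    df = DataFrame({"id": [1, 2], "wt1": [10, 20], "wt2": [11, 21]})
    long_df = lreshape(df, {"wt": ["wt1", "wt2"]})
    assert len(long_df) == 4  # two stacked frames of two rows each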
class TestWideToLong:
def test_simple(self):
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame(
{
"A1970": {0: "a", 1: "b", 2: "c"},
"A1980": {0: "d", 1: "e", 2: "f"},
"B1970": {0: 2.5, 1: 1.2, 2: 0.7},
"B1980": {0: 3.2, 1: 1.3, 2: 0.1},
"X": dict(zip(range(3), x)),
}
)
df["id"] = df.index
exp_data = {
"X": x.tolist() + x.tolist(),
"A": ["a", "b", "c", "d", "e", "f"],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2],
}
expected = DataFrame(exp_data)
expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(result, expected)
def test_stubs(self):
# GH9204
df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
df.columns = ["id", "inc1", "inc2", "edu1", "edu2"]
stubs = ["inc", "edu"]
# TODO: unused?
df_long = pd.wide_to_long(df, stubs, i="id", j="age") # noqa
assert stubs == ["inc", "edu"]
def test_separating_character(self):
# GH14779
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame(
{
"A.1970": {0: "a", 1: "b", 2: "c"},
"A.1980": {0: "d", 1: "e", 2: "f"},
"B.1970": {0: 2.5, 1: 1.2, 2: 0.7},
"B.1980": {0: 3.2, 1: 1.3, 2: 0.1},
"X": dict(zip(range(3), x)),
}
)
df["id"] = df.index
exp_data = {
"X": x.tolist() + x.tolist(),
"A": ["a", "b", "c", "d", "e", "f"],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2],
}
expected = DataFrame(exp_data)
expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
tm.assert_frame_equal(result, expected)
def test_escapable_characters(self):
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame(
{
"A(quarterly)1970": {0: "a", 1: "b", 2: "c"},
"A(quarterly)1980": {0: "d", 1: "e", 2: "f"},
"B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7},
"B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1},
"X": dict(zip(range(3), x)),
}
)
df["id"] = df.index
exp_data = {
"X": x.tolist() + x.tolist(),
"A(quarterly)": ["a", "b", "c", "d", "e", "f"],
"B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2],
}
expected = DataFrame(exp_data)
expected = expected.set_index(["id", "year"])[
["X", "A(quarterly)", "B(quarterly)"]
]
result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year")
tm.assert_frame_equal(result, expected)
def test_unbalanced(self):
# test that we can have a varying number of time variables
df = pd.DataFrame(
{
"A2010": [1.0, 2.0],
"A2011": [3.0, 4.0],
"B2010": [5.0, 6.0],
"X": ["X1", "X2"],
}
)
df["id"] = df.index
exp_data = {
"X": ["X1", "X1", "X2", "X2"],
"A": [1.0, 3.0, 2.0, 4.0],
"B": [5.0, np.nan, 6.0, np.nan],
"id": [0, 0, 1, 1],
"year": [2010, 2011, 2010, 2011],
}
expected = pd.DataFrame(exp_data)
expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(result, expected)
def test_character_overlap(self):
# Test we handle overlapping characters in both id_vars and value_vars
df = pd.DataFrame(
{
"A11": ["a11", "a22", "a33"],
"A12": ["a21", "a22", "a23"],
"B11": ["b11", "b12", "b13"],
"B12": ["b21", "b22", "b23"],
"BB11": [1, 2, 3],
"BB12": [4, 5, 6],
"BBBX": [91, 92, 93],
"BBBZ": [91, 92, 93],
}
)
df["id"] = df.index
expected = pd.DataFrame(
{
"BBBX": [91, 92, 93, 91, 92, 93],
"BBBZ": [91, 92, 93, 91, 92, 93],
"A": ["a11", "a22", "a33", "a21", "a22", "a23"],
"B": ["b11", "b12", "b13", "b21", "b22", "b23"],
"BB": [1, 2, 3, 4, 5, 6],
"id": [0, 1, 2, 0, 1, 2],
"year": [11, 11, 11, 12, 12, 12],
}
)
expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]]
result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
def test_invalid_separator(self):
# if an invalid separator is supplied, an empty data frame is returned
sep = "nope!"
df = pd.DataFrame(
{
"A2010": [1.0, 2.0],
"A2011": [3.0, 4.0],
"B2010": [5.0, 6.0],
"X": ["X1", "X2"],
}
)
df["id"] = df.index
exp_data = {
"X": "",
"A2010": [],
"A2011": [],
"B2010": [],
"id": [],
"year": [],
"A": [],
"B": [],
}
expected = pd.DataFrame(exp_data).astype({"year": "int"})
expected = expected.set_index(["id", "year"])[
["X", "A2010", "A2011", "B2010", "A", "B"]
]
expected.index.set_levels([0, 1], level=0, inplace=True)
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep)
tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
def test_num_string_disambiguation(self):
# Test that we can disambiguate numeric value_vars from
# string value_vars
df = pd.DataFrame(
{
"A11": ["a11", "a22", "a33"],
"A12": ["a21", "a22", "a23"],
"B11": ["b11", "b12", "b13"],
"B12": ["b21", "b22", "b23"],
"BB11": [1, 2, 3],
"BB12": [4, 5, 6],
"Arating": [91, 92, 93],
"Arating_old": [91, 92, 93],
}
)
df["id"] = df.index
expected = pd.DataFrame(
{
"Arating": [91, 92, 93, 91, 92, 93],
"Arating_old": [91, 92, 93, 91, 92, 93],
"A": ["a11", "a22", "a33", "a21", "a22", "a23"],
"B": ["b11", "b12", "b13", "b21", "b22", "b23"],
"BB": [1, 2, 3, 4, 5, 6],
"id": [0, 1, 2, 0, 1, 2],
"year": [11, 11, 11, 12, 12, 12],
}
)
expected = expected.set_index(["id", "year"])[
["Arating", "Arating_old", "A", "B", "BB"]
]
result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
def test_invalid_suffixtype(self):
# If all stub names end with a string but a numeric suffix is
# assumed, an empty data frame is returned
df = pd.DataFrame(
{
"Aone": [1.0, 2.0],
"Atwo": [3.0, 4.0],
"Bone": [5.0, 6.0],
"X": ["X1", "X2"],
}
)
df["id"] = df.index
exp_data = {
"X": "",
"Aone": [],
"Atwo": [],
"Bone": [],
"id": [],
"year": [],
"A": [],
"B": [],
}
expected = pd.DataFrame(exp_data).astype({"year": "int"})
expected = expected.set_index(["id", "year"])
expected.index.set_levels([0, 1], level=0, inplace=True)
result = wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
def test_multiple_id_columns(self):
# Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
df = pd.DataFrame(
{
"famid": [1, 1, 1, 2, 2, 2, 3, 3, 3],
"birth": [1, 2, 3, 1, 2, 3, 1, 2, 3],
"ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
"ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9],
}
)
expected = pd.DataFrame(
{
"ht": [
2.8,
3.4,
2.9,
3.8,
2.2,
2.9,
2.0,
3.2,
1.8,
2.8,
1.9,
2.4,
2.2,
3.3,
2.3,
3.4,
2.1,
2.9,
],
"famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
"birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
"age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
}
)
expected = expected.set_index(["famid", "birth", "age"])[["ht"]]
result = wide_to_long(df, "ht", i=["famid", "birth"], j="age")
tm.assert_frame_equal(result, expected)
def test_non_unique_idvars(self):
# GH16382
# Raise an error if non-unique id vars (i) are passed
df = pd.DataFrame(
{"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]}
)
msg = "the id variables need to uniquely identify each row"
with pytest.raises(ValueError, match=msg):
wide_to_long(df, ["A_A", "B_B"], i="x", j="colname")
def test_cast_j_int(self):
df = pd.DataFrame(
{
"actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"],
"actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"],
"actor_fb_likes_1": [1000.0, 40000.0, 11000.0],
"actor_fb_likes_2": [936.0, 5000.0, 393.0],
"title": ["Avatar", "Pirates of the Caribbean", "Spectre"],
}
)
expected = pd.DataFrame(
{
"actor": [
"CCH Pounder",
"Johnny Depp",
"Christoph Waltz",
"Joel David Moore",
"Orlando Bloom",
"Rory Kinnear",
],
"actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
"num": [1, 1, 1, 2, 2, 2],
"title": [
"Avatar",
"Pirates of the Caribbean",
"Spectre",
"Avatar",
"Pirates of the Caribbean",
"Spectre",
],
}
).set_index(["title", "num"])
result = wide_to_long(
df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_"
)
tm.assert_frame_equal(result, expected)
def test_identical_stubnames(self):
df = pd.DataFrame(
{
"A2010": [1.0, 2.0],
"A2011": [3.0, 4.0],
"B2010": [5.0, 6.0],
"A": ["X1", "X2"],
}
)
msg = "stubname can't be identical to a column name"
with pytest.raises(ValueError, match=msg):
wide_to_long(df, ["A", "B"], i="A", j="colname")
def test_nonnumeric_suffix(self):
df = pd.DataFrame(
{
"treatment_placebo": [1.0, 2.0],
"treatment_test": [3.0, 4.0],
"result_placebo": [5.0, 6.0],
"A": ["X1", "X2"],
}
)
expected = pd.DataFrame(
{
"A": ["X1", "X1", "X2", "X2"],
"colname": ["placebo", "test", "placebo", "test"],
"result": [5.0, np.nan, 6.0, np.nan],
"treatment": [1.0, 3.0, 2.0, 4.0],
}
)
expected = expected.set_index(["A", "colname"])
result = wide_to_long(
df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_"
)
tm.assert_frame_equal(result, expected)
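# Illustrative sketch (not an original test): the ``suffix`` regex
# controls what may follow a stub name; the default matches digits
# only, which is why the non-numeric columns above need "[a-z]+".
def example_suffix_regex():
    df = pd.DataFrame({"x_a": [1], "x_b": [2], "id": [0]})
    long_df = wide_to_long(df, ["x"], i="id", j="kind", sep="_", suffix="[a-z]+")
    assert sorted(long_df["x"]) == [1, 2]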
def test_mixed_type_suffix(self):
df = pd.DataFrame(
{
"A": ["X1", "X2"],
"result_1": [0, 9],
"result_foo": [5.0, 6.0],
"treatment_1": [1.0, 2.0],
"treatment_foo": [3.0, 4.0],
}
)
expected = pd.DataFrame(
{
"A": ["X1", "X2", "X1", "X2"],
"colname": ["1", "1", "foo", "foo"],
"result": [0.0, 9.0, 5.0, 6.0],
"treatment": [1.0, 2.0, 3.0, 4.0],
}
).set_index(["A", "colname"])
result = wide_to_long(
df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_"
)
tm.assert_frame_equal(result, expected)
def test_float_suffix(self):
df = pd.DataFrame(
{
"treatment_1.1": [1.0, 2.0],
"treatment_2.1": [3.0, 4.0],
"result_1.2": [5.0, 6.0],
"result_1": [0, 9],
"A": ["X1", "X2"],
}
)
expected = pd.DataFrame(
{
"A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"],
"colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
"result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
"treatment": [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0],
}
)
expected = expected.set_index(["A", "colname"])
result = wide_to_long(
df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_"
)
tm.assert_frame_equal(result, expected)
def test_col_substring_of_stubname(self):
# GH22468
# Don't raise ValueError when a column name is a substring
# of a stubname that's been passed as a string
wide_data = {
"node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
"A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
"PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
"PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
"PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67},
}
wide_df = pd.DataFrame.from_dict(wide_data)
expected = pd.wide_to_long(
wide_df, stubnames=["PA"], i=["node_id", "A"], j="time"
)
result = pd.wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time")
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large


@@ -0,0 +1,238 @@
import os
import numpy as np
import pytest
from pandas import (
Categorical,
DatetimeIndex,
Interval,
IntervalIndex,
NaT,
Series,
TimedeltaIndex,
Timestamp,
cut,
date_range,
isna,
qcut,
timedelta_range,
)
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.algorithms import quantile
import pandas.util.testing as tm
from pandas.tseries.offsets import Day, Nano
def test_qcut():
arr = np.random.randn(1000)
# We store the bins as an Index that has been
# rounded, so comparisons are a bit tricky.
labels, bins = qcut(arr, 4, retbins=True)
ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
result = labels.categories.left.values
assert np.allclose(result, ex_bins[:-1], atol=1e-2)
result = labels.categories.right.values
assert np.allclose(result, ex_bins[1:], atol=1e-2)
ex_levels = cut(arr, ex_bins, include_lowest=True)
tm.assert_categorical_equal(labels, ex_levels)
def test_qcut_bounds():
arr = np.random.randn(1000)
factor = qcut(arr, 10, labels=False)
assert len(np.unique(factor)) == 10
def test_qcut_specify_quantiles():
arr = np.random.randn(100)
factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0])
expected = qcut(arr, 4)
tm.assert_categorical_equal(factor, expected)
def test_qcut_all_bins_same():
with pytest.raises(ValueError, match="edges.*unique"):
qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
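# Illustrative sketch (not an original test): duplicates="drop"
# collapses the repeated edges instead of raising, as
# test_qcut_duplicates_bin below exercises in full.
def example_qcut_drop_duplicate_edges():
    result = qcut([0, 0, 0, 0, 1, 2, 3], 3, duplicates="drop")
    assert len(result.categories) == 2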
def test_qcut_include_lowest():
values = np.arange(10)
ii = qcut(values, 4)
ex_levels = IntervalIndex(
[
Interval(-0.001, 2.25),
Interval(2.25, 4.5),
Interval(4.5, 6.75),
Interval(6.75, 9),
]
)
tm.assert_index_equal(ii.categories, ex_levels)
def test_qcut_nas():
arr = np.random.randn(100)
arr[:20] = np.nan
result = qcut(arr, 4)
assert isna(result[:20]).all()
def test_qcut_index():
result = qcut([0, 2], 2)
intervals = [Interval(-0.001, 1), Interval(1, 2)]
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
def test_qcut_binning_issues(datapath):
# see gh-1978, gh-1979
cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
arr = np.loadtxt(cut_file)
result = qcut(arr, 20)
starts = []
ends = []
for lev in np.unique(result):
s = lev.left
e = lev.right
assert s != e
starts.append(float(s))
ends.append(float(e))
for (sp, sn), (ep, en) in zip(
zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:])
):
assert sp < sn
assert ep < en
assert ep <= sn
def test_qcut_return_intervals():
ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
res = qcut(ser, [0, 0.333, 0.666, 1])
exp_levels = np.array(
[Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
)
exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
tm.assert_series_equal(res, exp)
@pytest.mark.parametrize(
"kwargs,msg",
[
(dict(duplicates="drop"), None),
(dict(), "Bin edges must be unique"),
(dict(duplicates="raise"), "Bin edges must be unique"),
(dict(duplicates="foo"), "invalid value for 'duplicates' parameter"),
],
)
def test_qcut_duplicates_bin(kwargs, msg):
# see gh-7751
values = [0, 0, 0, 0, 1, 2, 3]
if msg is not None:
with pytest.raises(ValueError, match=msg):
qcut(values, 3, **kwargs)
else:
result = qcut(values, 3, **kwargs)
expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
tm.assert_index_equal(result.categories, expected)
@pytest.mark.parametrize(
"data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
)
@pytest.mark.parametrize("length", [1, 2])
@pytest.mark.parametrize("labels", [None, False])
def test_single_quantile(data, start, end, length, labels):
# see gh-15431
ser = Series([data] * length)
result = qcut(ser, 1, labels=labels)
if labels is None:
intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
expected = Series(intervals).astype(CDT(ordered=True))
else:
expected = Series([0] * length)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"ser",
[
Series(DatetimeIndex(["20180101", NaT, "20180103"])),
Series(TimedeltaIndex(["0 days", NaT, "2 days"])),
],
ids=lambda x: str(x.dtype),
)
def test_qcut_nat(ser):
# see gh-19768
intervals = IntervalIndex.from_tuples(
[(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])]
)
expected = Series(Categorical(intervals, ordered=True))
result = qcut(ser, 2)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
def test_datetime_tz_qcut(bins):
# see gh-19872
tz = "US/Eastern"
ser = Series(date_range("20130101", periods=3, tz=tz))
result = qcut(ser, bins)
expected = Series(
IntervalIndex(
[
Interval(
Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
Timestamp("2013-01-01 16:00:00", tz=tz),
),
Interval(
Timestamp("2013-01-01 16:00:00", tz=tz),
Timestamp("2013-01-02 08:00:00", tz=tz),
),
Interval(
Timestamp("2013-01-02 08:00:00", tz=tz),
Timestamp("2013-01-03 00:00:00", tz=tz),
),
]
)
).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"arg,expected_bins",
[
[
timedelta_range("1day", periods=3),
TimedeltaIndex(["1 days", "2 days", "3 days"]),
],
[
date_range("20180101", periods=3),
DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
],
],
)
def test_date_like_qcut_bins(arg, expected_bins):
# see gh-19891
ser = Series(arg)
result, result_bins = qcut(ser, 2, retbins=True)
tm.assert_index_equal(result_bins, expected_bins)


@@ -0,0 +1,653 @@
from collections import OrderedDict
import numpy as np
from numpy import nan
import pytest
from pandas.core.dtypes.common import is_integer_dtype
import pandas as pd
from pandas import Categorical, DataFrame, Index, Series, get_dummies
from pandas.core.sparse.api import SparseArray, SparseDtype
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
class TestGetDummies:
@pytest.fixture
def df(self):
return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]})
@pytest.fixture(params=["uint8", "i8", np.float64, bool, None])
def dtype(self, request):
return np.dtype(request.param)
@pytest.fixture(params=["dense", "sparse"])
def sparse(self, request):
# params are strings to simplify reading test results,
# e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
return request.param == "sparse"
def effective_dtype(self, dtype):
if dtype is None:
return np.uint8
return dtype
def test_raises_on_dtype_object(self, df):
with pytest.raises(ValueError):
get_dummies(df, dtype="object")
def test_basic(self, sparse, dtype):
s_list = list("abc")
s_series = Series(s_list)
s_series_index = Series(s_list, list("ABC"))
expected = DataFrame(
{"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
dtype=self.effective_dtype(dtype),
)
if sparse:
expected = expected.apply(pd.SparseArray, fill_value=0.0)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
assert_frame_equal(result, expected)
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
assert_frame_equal(result, expected)
expected.index = list("ABC")
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
assert_frame_equal(result, expected)
def test_basic_types(self, sparse, dtype):
# GH 10531
s_list = list("abc")
s_series = Series(s_list)
s_df = DataFrame(
{"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]}
)
expected = DataFrame(
{"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
dtype=self.effective_dtype(dtype),
columns=list("abc"),
)
if sparse:
if is_integer_dtype(dtype):
fill_value = 0
elif dtype == bool:
fill_value = False
else:
fill_value = 0.0
expected = expected.apply(SparseArray, fill_value=fill_value)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
tm.assert_frame_equal(result, expected)
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
tm.assert_frame_equal(result, expected)
result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype)
if sparse:
dtype_name = "Sparse[{}, {}]".format(
self.effective_dtype(dtype).name, fill_value
)
else:
dtype_name = self.effective_dtype(dtype).name
expected = Series({dtype_name: 8})
result = result.dtypes.value_counts()
result.index = [str(i) for i in result.index]
tm.assert_series_equal(result, expected)
result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype)
expected_counts = {"int64": 1, "object": 1}
expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
expected = Series(expected_counts).sort_index()
result = result.dtypes.value_counts()
result.index = [str(i) for i in result.index]
result = result.sort_index()
tm.assert_series_equal(result, expected)
def test_just_na(self, sparse):
just_na_list = [np.nan]
just_na_series = Series(just_na_list)
just_na_series_index = Series(just_na_list, index=["A"])
res_list = get_dummies(just_na_list, sparse=sparse)
res_series = get_dummies(just_na_series, sparse=sparse)
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
assert res_list.empty
assert res_series.empty
assert res_series_index.empty
assert res_list.index.tolist() == [0]
assert res_series.index.tolist() == [0]
assert res_series_index.index.tolist() == ["A"]
def test_include_na(self, sparse, dtype):
s = ["a", "b", np.nan]
res = get_dummies(s, sparse=sparse, dtype=dtype)
exp = DataFrame(
{"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
)
if sparse:
exp = exp.apply(pd.SparseArray, fill_value=0.0)
assert_frame_equal(res, exp)
# Sparse dataframes do not allow NaN-labelled columns; see GH 8822
res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
exp_na = DataFrame(
{nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]},
dtype=self.effective_dtype(dtype),
)
exp_na = exp_na.reindex(["a", "b", nan], axis=1)
# hack (NaN handling in assert_index_equal)
exp_na.columns = res_na.columns
if sparse:
exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
assert_frame_equal(res_na, exp_na)
res_just_na = get_dummies([nan], dummy_na=True, sparse=sparse, dtype=dtype)
exp_just_na = DataFrame(
Series(1, index=[0]), columns=[nan], dtype=self.effective_dtype(dtype)
)
tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
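# Illustrative sketch (not an original test): dummy_na=True appends one
# extra indicator column for missing values, while the default drops
# NaN rows to all zeros as asserted above.
def example_dummy_na_column(self):
    res = get_dummies(["a", np.nan], dummy_na=True)
    assert res.shape == (2, 2)  # one column for "a", one for NaN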
def test_unicode(self, sparse):
# See GH 6885 - get_dummies chokes on unicode values
import unicodedata
e = "e"
eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
s = [e, eacute, eacute]
res = get_dummies(s, prefix="letter", sparse=sparse)
exp = DataFrame(
{"letter_e": [1, 0, 0], "letter_{eacute}".format(eacute=eacute): [0, 1, 1]},
dtype=np.uint8,
)
if sparse:
exp = exp.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(res, exp)
def test_dataframe_dummies_all_obj(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, sparse=sparse)
expected = DataFrame(
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
dtype=np.uint8,
)
if sparse:
expected = pd.DataFrame(
{
"A_a": pd.SparseArray([1, 0, 1], dtype="uint8"),
"A_b": pd.SparseArray([0, 1, 0], dtype="uint8"),
"B_b": pd.SparseArray([1, 1, 0], dtype="uint8"),
"B_c": pd.SparseArray([0, 0, 1], dtype="uint8"),
}
)
assert_frame_equal(result, expected)
def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
result = get_dummies(df, sparse=sparse, dtype=dtype)
if sparse:
arr = SparseArray
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
expected = DataFrame(
{
"C": [1, 2, 3],
"A_a": arr([1, 0, 1], dtype=typ),
"A_b": arr([0, 1, 0], dtype=typ),
"B_b": arr([1, 1, 0], dtype=typ),
"B_c": arr([0, 0, 1], dtype=typ),
}
)
expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_list(self, df, sparse):
prefixes = ["from_A", "from_B"]
result = get_dummies(df, prefix=prefixes, sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
"from_A_a": [1, 0, 1],
"from_A_b": [0, 1, 0],
"from_B_b": [1, 1, 0],
"from_B_c": [0, 0, 1],
},
dtype=np.uint8,
)
expected[["C"]] = df[["C"]]
cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
expected = expected[["C"] + cols]
typ = pd.SparseArray if sparse else pd.Series
expected[cols] = expected[cols].apply(lambda x: typ(x))
assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_str(self, df, sparse):
# not that you should do this...
result = get_dummies(df, prefix="bad", sparse=sparse)
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
columns=["C"] + bad_columns,
dtype=np.uint8,
)
expected = expected.astype({"C": np.int64})
if sparse:
# work around astyping & assigning with duplicate columns
# https://github.com/pandas-dev/pandas/issues/14427
expected = pd.concat(
[
pd.Series([1, 2, 3], name="C"),
pd.Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"),
pd.Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
pd.Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
pd.Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"),
],
axis=1,
)
assert_frame_equal(result, expected)
def test_dataframe_dummies_subset(self, df, sparse):
result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
expected = DataFrame(
{
"B": ["b", "b", "c"],
"C": [1, 2, 3],
"from_A_a": [1, 0, 1],
"from_A_b": [0, 1, 0],
},
dtype=np.uint8,
)
expected[["C"]] = df[["C"]]
if sparse:
cols = ["from_A_a", "from_A_b"]
expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_sep(self, df, sparse):
result = get_dummies(df, prefix_sep="..", sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
"A..a": [1, 0, 1],
"A..b": [0, 1, 0],
"B..b": [1, 1, 0],
"B..c": [0, 0, 1],
},
dtype=np.uint8,
)
expected[["C"]] = df[["C"]]
expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
if sparse:
cols = ["A..a", "A..b", "B..b", "B..c"]
expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
assert_frame_equal(result, expected)
result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
assert_frame_equal(result, expected)
result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
assert_frame_equal(result, expected)
def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
with pytest.raises(ValueError):
get_dummies(df, prefix=["too few"], sparse=sparse)
def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
with pytest.raises(ValueError):
get_dummies(df, prefix_sep=["bad"], sparse=sparse)
def test_dataframe_dummies_prefix_dict(self, sparse):
prefixes = {"A": "from_A", "B": "from_B"}
df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
result = get_dummies(df, prefix=prefixes, sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
"from_A_a": [1, 0, 1],
"from_A_b": [0, 1, 0],
"from_B_b": [1, 1, 0],
"from_B_c": [0, 0, 1],
}
)
columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
expected[columns] = expected[columns].astype(np.uint8)
if sparse:
expected[columns] = expected[columns].apply(lambda x: pd.SparseSeries(x))
assert_frame_equal(result, expected)
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(
axis=1
)
if sparse:
arr = SparseArray
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
expected = DataFrame(
{
"C": [1, 2, 3, np.nan],
"A_a": arr([1, 0, 1, 0], dtype=typ),
"A_b": arr([0, 1, 0, 0], dtype=typ),
"A_nan": arr([0, 0, 0, 1], dtype=typ),
"B_b": arr([1, 1, 0, 0], dtype=typ),
"B_c": arr([0, 0, 1, 0], dtype=typ),
"B_nan": arr([0, 0, 0, 1], dtype=typ),
}
).sort_index(axis=1)
assert_frame_equal(result, expected)
result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
assert_frame_equal(result, expected)
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
df["cat"] = pd.Categorical(["x", "y", "y"])
result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
if sparse:
arr = SparseArray
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
expected = DataFrame(
{
"C": [1, 2, 3],
"A_a": arr([1, 0, 1], dtype=typ),
"A_b": arr([0, 1, 0], dtype=typ),
"B_b": arr([1, 1, 0], dtype=typ),
"B_c": arr([0, 0, 1], dtype=typ),
"cat_x": arr([1, 0, 0], dtype=typ),
"cat_y": arr([0, 1, 1], dtype=typ),
}
).sort_index(axis=1)
assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"get_dummies_kwargs,expected",
[
(
{"data": pd.DataFrame(({"ä": ["a"]}))},
pd.DataFrame({"ä_a": [1]}, dtype=np.uint8),
),
(
{"data": pd.DataFrame({"x": ["ä"]})},
pd.DataFrame({"x_ä": [1]}, dtype=np.uint8),
),
(
{"data": pd.DataFrame({"x": ["a"]}), "prefix": "ä"},
pd.DataFrame({"ä_a": [1]}, dtype=np.uint8),
),
(
{"data": pd.DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
pd.DataFrame({"xäa": [1]}, dtype=np.uint8),
),
],
)
def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
# GH22084 pd.get_dummies incorrectly encodes unicode characters
# in dataframe column names
result = get_dummies(**get_dummies_kwargs)
assert_frame_equal(result, expected)
def test_basic_drop_first(self, sparse):
# GH12402 Add a new parameter `drop_first` to avoid collinearity
# Basic case
s_list = list("abc")
s_series = Series(s_list)
s_series_index = Series(s_list, list("ABC"))
expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)
result = get_dummies(s_list, drop_first=True, sparse=sparse)
if sparse:
expected = expected.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(result, expected)
result = get_dummies(s_series, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
expected.index = list("ABC")
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
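# Illustrative sketch (not an original test): drop_first=True emits
# k-1 indicator columns for k levels, the usual way to avoid perfect
# collinearity in a regression design matrix.
def example_drop_first_shape(self):
    result = get_dummies(list("abc"), drop_first=True)
    assert result.shape == (3, 2)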
def test_basic_drop_first_one_level(self, sparse):
# Test the case where the categorical variable has only one level.
s_list = list("aaa")
s_series = Series(s_list)
s_series_index = Series(s_list, list("ABC"))
expected = DataFrame(index=np.arange(3))
result = get_dummies(s_list, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
result = get_dummies(s_series, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
expected = DataFrame(index=list("ABC"))
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
assert_frame_equal(result, expected)
def test_basic_drop_first_NA(self, sparse):
# Test NA handling together with drop_first
s_NA = ["a", "b", np.nan]
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
if sparse:
exp = exp.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(res, exp)
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
exp_na = DataFrame({"b": [0, 1, 0], nan: [0, 0, 1]}, dtype=np.uint8).reindex(
["b", nan], axis=1
)
if sparse:
exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(res_na, exp_na)
res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, sparse=sparse)
exp_just_na = DataFrame(index=np.arange(1))
assert_frame_equal(res_just_na, exp_just_na)
def test_dataframe_dummies_drop_first(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, drop_first=True, sparse=sparse)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
if sparse:
expected = expected.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(result, expected)
def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
df["cat"] = pd.Categorical(["x", "y", "y"])
result = get_dummies(df, drop_first=True, sparse=sparse)
expected = DataFrame(
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
)
cols = ["A_b", "B_c", "cat_y"]
expected[cols] = expected[cols].astype(np.uint8)
expected = expected[["C", "A_b", "B_c", "cat_y"]]
if sparse:
for col in cols:
expected[col] = pd.SparseSeries(expected[col])
assert_frame_equal(result, expected)
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(
df, dummy_na=True, drop_first=True, sparse=sparse
).sort_index(axis=1)
expected = DataFrame(
{
"C": [1, 2, 3, np.nan],
"A_b": [0, 1, 0, 0],
"A_nan": [0, 0, 0, 1],
"B_c": [0, 0, 1, 0],
"B_nan": [0, 0, 0, 1],
}
)
cols = ["A_b", "A_nan", "B_c", "B_nan"]
expected[cols] = expected[cols].astype(np.uint8)
expected = expected.sort_index(axis=1)
if sparse:
for col in cols:
expected[col] = pd.SparseSeries(expected[col])
assert_frame_equal(result, expected)
result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
expected = expected[["C", "A_b", "B_c"]]
assert_frame_equal(result, expected)
def test_int_int(self):
data = Series([1, 2, 1])
result = pd.get_dummies(data)
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
tm.assert_frame_equal(result, expected)
data = Series(pd.Categorical(["a", "b", "a"]))
result = pd.get_dummies(data)
expected = DataFrame(
[[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(["a", "b"]), dtype=np.uint8
)
tm.assert_frame_equal(result, expected)
def test_int_df(self, dtype):
data = DataFrame(
{
"A": [1, 2, 1],
"B": pd.Categorical(["a", "b", "a"]),
"C": [1, 2, 1],
"D": [1.0, 2.0, 1.0],
}
)
columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"]
expected = DataFrame(
[[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]],
columns=columns,
)
expected[columns[2:]] = expected[columns[2:]].astype(dtype)
result = pd.get_dummies(data, columns=["A", "B"], dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
# GH13854
for ordered in [False, True]:
cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered)
result = get_dummies(cat, dtype=dtype)
data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype))
cols = pd.CategoricalIndex(
cat.categories, categories=cat.categories, ordered=ordered
)
expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("sparse", [True, False])
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
# GH18914
df = DataFrame.from_dict(
OrderedDict([("GDP", [1, 2]), ("Nation", ["AB", "CD"])])
)
df = get_dummies(df, columns=["Nation"], sparse=sparse)
df2 = df.reindex(columns=["GDP"])
tm.assert_frame_equal(df[["GDP"]], df2)
def test_get_dummies_duplicate_columns(self, df):
# GH20839
df.columns = ["A", "A", "A"]
result = get_dummies(df).sort_index(axis=1)
expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
columns=["A", "A_a", "A_b", "A_b", "A_c"],
dtype=np.uint8,
).sort_index(axis=1)
expected = expected.astype({"A": np.int64})
tm.assert_frame_equal(result, expected)
def test_get_dummies_all_sparse(self):
df = pd.DataFrame({"A": [1, 2]})
result = pd.get_dummies(df, columns=["A"], sparse=True)
dtype = SparseDtype("uint8", 0)
expected = pd.DataFrame(
{
"A_1": SparseArray([1, 0], dtype=dtype),
"A_2": SparseArray([0, 1], dtype=dtype),
}
)
tm.assert_frame_equal(result, expected)
class TestCategoricalReshape:
def test_reshaping_multi_index_categorical(self):
cols = ["ItemA", "ItemB", "ItemC"]
data = {c: tm.makeTimeDataFrame() for c in cols}
df = pd.concat({c: data[c].stack() for c in data}, axis="columns")
df.index.names = ["major", "minor"]
df["str"] = "foo"
dti = df.index.levels[0]
df["category"] = df["str"].astype("category")
result = df["category"].unstack()
c = Categorical(["foo"] * len(dti))
expected = DataFrame(
{"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()},
columns=Index(list("ABCD"), name="minor"),
index=dti,
)
tm.assert_frame_equal(result, expected)
class TestMakeAxisDummies:
def test_preserve_categorical_dtype(self):
# GH13854
for ordered in [False, True]:
cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered)
midx = pd.MultiIndex(levels=[["a"], cidx], codes=[[0, 0], [0, 1]])
df = DataFrame([[10, 11]], index=midx)
expected = DataFrame(
[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], index=midx, columns=cidx
)
from pandas.core.reshape.reshape import make_axis_dummies
result = make_axis_dummies(df)
tm.assert_frame_equal(result, expected)
result = make_axis_dummies(df, transform=lambda x: x)
tm.assert_frame_equal(result, expected)


@@ -0,0 +1,348 @@
import numpy as np
import pytest
from pandas.core.dtypes.concat import union_categoricals
import pandas as pd
from pandas import Categorical, CategoricalIndex, Series
from pandas.util import testing as tm
class TestUnionCategoricals:
def test_union_categorical(self):
# GH 13361
data = [
(list("abc"), list("abd"), list("abcabd")),
([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
(
["b", "b", np.nan, "a"],
["a", np.nan, "c"],
["b", "b", np.nan, "a", "a", np.nan, "c"],
),
(
pd.date_range("2014-01-01", "2014-01-05"),
pd.date_range("2014-01-06", "2014-01-07"),
pd.date_range("2014-01-01", "2014-01-07"),
),
(
pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
),
(
pd.period_range("2014-01-01", "2014-01-05"),
pd.period_range("2014-01-06", "2014-01-07"),
pd.period_range("2014-01-01", "2014-01-07"),
),
]
for a, b, combined in data:
for box in [Categorical, CategoricalIndex, Series]:
result = union_categoricals([box(Categorical(a)), box(Categorical(b))])
expected = Categorical(combined)
tm.assert_categorical_equal(result, expected, check_category_order=True)
# new categories ordered by appearance
s = Categorical(["x", "y", "z"])
s2 = Categorical(["a", "b", "c"])
result = union_categoricals([s, s2])
expected = Categorical(
["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
)
tm.assert_categorical_equal(result, expected)
s = Categorical([0, 1.2, 2], ordered=True)
s2 = Categorical([0, 1.2, 2], ordered=True)
result = union_categoricals([s, s2])
expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
tm.assert_categorical_equal(result, expected)
# must exactly match types
s = Categorical([0, 1.2, 2])
s2 = Categorical([2, 3, 4])
msg = "dtype of categories must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([s, s2])
msg = "No Categoricals to union"
with pytest.raises(ValueError, match=msg):
union_categoricals([])
def test_union_categoricals_nan(self):
# GH 13759
res = union_categoricals(
[pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])]
)
exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
tm.assert_categorical_equal(res, exp)
res = union_categoricals(
[pd.Categorical(["A", "B"]), pd.Categorical(["B", "B", np.nan])]
)
exp = Categorical(["A", "B", "B", "B", np.nan])
tm.assert_categorical_equal(res, exp)
val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT]
val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")]
res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
exp = Categorical(
val1 + val2,
categories=[
pd.Timestamp("2011-01-01"),
pd.Timestamp("2011-03-01"),
pd.Timestamp("2011-02-01"),
],
)
tm.assert_categorical_equal(res, exp)

        # all NaN
res = union_categoricals(
[
pd.Categorical(np.array([np.nan, np.nan], dtype=object)),
pd.Categorical(["X"]),
]
)
exp = Categorical([np.nan, np.nan, "X"])
tm.assert_categorical_equal(res, exp)
res = union_categoricals(
[pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])]
)
exp = Categorical([np.nan, np.nan, np.nan, np.nan])
tm.assert_categorical_equal(res, exp)
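
    def test_union_categoricals_nan_codes_sketch(self):
        # Illustrative sketch, not part of the original suite: NaN is stored
        # as code -1 and never becomes a category, which is why the unions
        # above keep NaN out of the result's categories.
        c = Categorical(["a", np.nan])
        assert c.codes[-1] == -1
        assert not c.categories.isna().any()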

    def test_union_categoricals_empty(self):
# GH 13759
res = union_categoricals([pd.Categorical([]), pd.Categorical([])])
exp = Categorical([])
tm.assert_categorical_equal(res, exp)
res = union_categoricals([Categorical([]), Categorical(["1"])])
exp = Categorical(["1"])
tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_category(self):
# check fastpath
c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
res = union_categoricals([c1, c2])
exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4])
tm.assert_categorical_equal(res, exp)
c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"])
res = union_categoricals([c1, c2])
exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"])
tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_categories_different_order(self):
# https://github.com/pandas-dev/pandas/issues/19096
c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"])
c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"])
result = union_categoricals([c1, c2])
expected = Categorical(
["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]
)
tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_ordered(self):
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([1, 2, 3], ordered=False)
msg = "Categorical.ordered must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2])
res = union_categoricals([c1, c1])
exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3, np.nan], ordered=True)
c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
res = union_categoricals([c1, c2])
exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
msg = "to union ordered Categoricals, all categories must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2])
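
    def test_union_categoricals_ordered_same_categories_sketch(self):
        # Illustrative sketch, not part of the original suite: ordered
        # categoricals combine without error as long as their categories are
        # identical and in the same order, and the result stays ordered.
        c1 = Categorical(["lo", "hi"], categories=["lo", "hi"], ordered=True)
        c2 = Categorical(["hi"], categories=["lo", "hi"], ordered=True)
        res = union_categoricals([c1, c2])
        assert res.ordered
        tm.assert_index_equal(res.categories, pd.Index(["lo", "hi"]))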

    def test_union_categoricals_ignore_order(self):
# GH 15219
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([1, 2, 3], ordered=False)
res = union_categoricals([c1, c2], ignore_order=True)
exp = Categorical([1, 2, 3, 1, 2, 3])
tm.assert_categorical_equal(res, exp)
msg = "Categorical.ordered must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2], ignore_order=False)
res = union_categoricals([c1, c1], ignore_order=True)
exp = Categorical([1, 2, 3, 1, 2, 3])
tm.assert_categorical_equal(res, exp)
res = union_categoricals([c1, c1], ignore_order=False)
exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3, np.nan], ordered=True)
c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
res = union_categoricals([c1, c2], ignore_order=True)
exp = Categorical([1, 2, 3, np.nan, 3, 2])
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
res = union_categoricals([c1, c2], ignore_order=True)
exp = Categorical([1, 2, 3, 1, 2, 3])
tm.assert_categorical_equal(res, exp)
res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True)
exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
tm.assert_categorical_equal(res, exp)
c1 = Categorical([1, 2, 3], ordered=True)
c2 = Categorical([4, 5, 6], ordered=True)
result = union_categoricals([c1, c2], ignore_order=True)
expected = Categorical([1, 2, 3, 4, 5, 6])
tm.assert_categorical_equal(result, expected)
msg = "to union ordered Categoricals, all categories must be the same"
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2], ignore_order=False)
with pytest.raises(TypeError, match=msg):
union_categoricals([c1, c2])

    def test_union_categoricals_sort(self):
# GH 13846
c1 = Categorical(["x", "y", "z"])
c2 = Categorical(["a", "b", "c"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(
["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]
)
tm.assert_categorical_equal(result, expected)

        # fastpath
c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(["a", "b"], categories=["c", "a", "b"])
c2 = Categorical(["b", "c"], categories=["c", "a", "b"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(["x", np.nan])
c2 = Categorical([np.nan, "b"])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"])
tm.assert_categorical_equal(result, expected)
c1 = Categorical([np.nan])
c2 = Categorical([np.nan])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical([np.nan, np.nan])
tm.assert_categorical_equal(result, expected)
c1 = Categorical([])
c2 = Categorical([])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical([])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        msg = "Cannot use sort_categories=True with ordered Categoricals"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], sort_categories=True)

    def test_union_categoricals_sort_false(self):
# GH 13846
c1 = Categorical(["x", "y", "z"])
c2 = Categorical(["a", "b", "c"])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(
["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
)
tm.assert_categorical_equal(result, expected)

        # fastpath
c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"])
tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(["x", np.nan])
c2 = Categorical([np.nan, "b"])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"])
tm.assert_categorical_equal(result, expected)
c1 = Categorical([np.nan])
c2 = Categorical([np.nan])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical([np.nan, np.nan])
tm.assert_categorical_equal(result, expected)
c1 = Categorical([])
c2 = Categorical([])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical([])
tm.assert_categorical_equal(result, expected)
c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical(
["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
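
    def test_union_categoricals_sort_contrast_sketch(self):
        # Illustrative sketch, not part of the original suite: the only
        # effect of sort_categories is the category order; the concatenated
        # values are identical either way.
        c1 = Categorical(["y"])
        c2 = Categorical(["x"])
        res_sorted = union_categoricals([c1, c2], sort_categories=True)
        res_unsorted = union_categoricals([c1, c2], sort_categories=False)
        tm.assert_index_equal(res_sorted.categories, pd.Index(["x", "y"]))
        tm.assert_index_equal(res_unsorted.categories, pd.Index(["y", "x"]))
        assert list(res_sorted) == list(res_unsorted) == ["y", "x"]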

    def test_union_categorical_unwrap(self):
# GH 14173
c1 = Categorical(["a", "b"])
c2 = pd.Series(["b", "c"], dtype="category")
result = union_categoricals([c1, c2])
expected = Categorical(["a", "b", "b", "c"])
tm.assert_categorical_equal(result, expected)
c2 = CategoricalIndex(c2)
result = union_categoricals([c1, c2])
tm.assert_categorical_equal(result, expected)
c1 = Series(c1)
result = union_categoricals([c1, c2])
tm.assert_categorical_equal(result, expected)

        msg = "all components to combine must be Categorical"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, ["a", "b", "c"]])

View File

@@ -0,0 +1,51 @@
import numpy as np
import pytest

from pandas import Index, date_range
from pandas.core.reshape.util import cartesian_product
import pandas.util.testing as tm


class TestCartesianProduct:
def test_simple(self):
x, y = list("ABC"), [1, 22]
result1, result2 = cartesian_product([x, y])
expected1 = np.array(["A", "A", "B", "B", "C", "C"])
expected2 = np.array([1, 22, 1, 22, 1, 22])
tm.assert_numpy_array_equal(result1, expected1)
tm.assert_numpy_array_equal(result2, expected2)

    def test_datetimeindex(self):
# regression test for GitHub issue #6439
# make sure that the ordering on datetimeindex is consistent
x = date_range("2000-01-01", periods=2)
result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
expected1 = Index([1, 1, 2, 2])
expected2 = Index([1, 2, 1, 2])
tm.assert_index_equal(result1, expected1)
tm.assert_index_equal(result2, expected2)

    def test_empty(self):
# product of empty factors
X = [[], [0, 1], []]
Y = [[], [], ["a", "b", "c"]]
for x, y in zip(X, Y):
expected1 = np.array([], dtype=np.asarray(x).dtype)
expected2 = np.array([], dtype=np.asarray(y).dtype)
result1, result2 = cartesian_product([x, y])
tm.assert_numpy_array_equal(result1, expected1)
tm.assert_numpy_array_equal(result2, expected2)

        # empty product (empty input):
result = cartesian_product([])
expected = []
assert result == expected

    @pytest.mark.parametrize(
"X", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]]
)
def test_invalid_input(self, X):
msg = "Input must be a list-like of list-likes"
with pytest.raises(TypeError, match=msg):
cartesian_product(X=X)
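
    def test_repeat_tile_equivalence_sketch(self):
        # Illustrative sketch, not part of the original suite: the output of
        # cartesian_product matches np.repeat on the first factor and np.tile
        # on the second, which is the ordering the datetimeindex test above
        # relies on.
        x = np.array([1, 2])
        y = np.array(["a", "b", "c"])
        result1, result2 = cartesian_product([x, y])
        tm.assert_numpy_array_equal(result1, np.repeat(x, len(y)), check_dtype=False)
        tm.assert_numpy_array_equal(result2, np.tile(y, len(x)), check_dtype=False)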