8th day of Python challenges 111-117
@@ -0,0 +1,862 @@
import numpy as np
from numpy.random import randn
import pytest

from pandas._libs import join as libjoin

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, concat, merge
from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal

a_ = np.array


class TestJoin:
    def setup_method(self, method):
        # aggregate multiple columns
        self.df = DataFrame(
            {
                "key1": get_test_data(),
                "key2": get_test_data(),
                "data1": np.random.randn(N),
                "data2": np.random.randn(N),
            }
        )

        # exclude a couple keys for fun
        self.df = self.df[self.df["key2"] > 1]

        self.df2 = DataFrame(
            {
                "key1": get_test_data(n=N // 5),
                "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5),
                "value": np.random.randn(N // 5),
            }
        )

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame(
            {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"]
        )

    def test_cython_left_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        # 0 1 1 1
        exp_li = a_(
            [
                0,
                1,
                2,
                3,
                4,
                5,
                3,
                4,
                5,
                3,
                4,
                5,
                # 2 2 4
                6,
                7,
                8,
                6,
                7,
                8,
                -1,
            ]
        )
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
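
    # A rough sketch of the indexer contract exercised above (an illustrative
    # note, inferred from how these tests use libjoin): left_outer_join and
    # inner_join return a pair of integer indexers aligned to the joined
    # result, with -1 marking unmatched rows, so left.take(ls) and
    # right.take(rs) recover the joined key columns. For example:
    #
    #   left = a_([0, 1], dtype=np.int64)
    #   right = a_([1, 1], dtype=np.int64)
    #   ls, rs = libjoin.left_outer_join(left, right, 2)
    #   # ls -> [0, 1, 1]; rs -> [-1, 0, 1]: key 0 has no right match,
    #   # key 1 matches both right rows.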

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="left")

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left")

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="right")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="right")

        joined_both = merge(self.df, self.df2, how="right")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right")

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="outer")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer")

        joined_both = merge(self.df, self.df2, how="outer")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer")

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="inner")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner")

        joined_both = merge(self.df, self.df2, how="inner")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner")

    def test_handle_overlap(self):
        joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"])

        assert "key1.foo" in joined
        assert "key1.bar" in joined

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(
            self.df,
            self.df2,
            left_on="key2",
            right_on="key1",
            suffixes=[".foo", ".bar"],
        )
        assert "key1.foo" in joined
        assert "key2.bar" in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on="C")
        tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
        tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame(
            {"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]}
        )
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        assert np.isnan(joined["two"]["c"])
        assert np.isnan(joined["three"]["c"])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap
        source_copy = source.copy()
        source_copy["A"] = 0
        msg = (
            "You are trying to merge on float64 and object columns. If"
            " you wish to proceed you should use pd.concat"
        )
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
            index=tm.makeCustomIndex(10, 2),
        )
        msg = (
            r"len\(left_on\) must equal the number of levels in the index" ' of "right"'
        )
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)},
            index=tm.makeCustomIndex(3, 2),
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}
        )
        msg = (
            r"len\(right_on\) must equal the number of levels in the index" ' of "left"'
        )
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({"a": [1, 1]})
        msg = "Can only merge Series or DataFrame objects, a {} was passed".format(
            str(type(wrong_type))
        )
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on="a", right_on="a")
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on="a", right_on="a")

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on="C")
        del expected["C"]

        join_col = self.target.pop("C")
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on="C")
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]), on="C", how="inner")
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notna()]
        tm.assert_series_equal(joined["key"], expected["key"], check_dtype=False)
        tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source["MergedA"], on="C")
        expected = self.target.join(self.source[["MergedA"]], on="C")
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({"a": [1, 1]})
        ds = Series([2], index=[1], name="b")
        result = df.join(ds, on="a")
        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True},
            index=np.arange(10),
            columns=["A", "B", "C", "D"],
        )
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True},
            index=np.arange(0, 10, 2),
            columns=["A", "B", "C", "D"],
        )

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = [
            "A_one",
            "B_one",
            "C_one",
            "D_one",
            "A_two",
            "B_two",
            "C_two",
            "D_two",
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)
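
    # Unlike merge(), join() raises on overlapping column names unless
    # lsuffix/rsuffix are supplied; a minimal sketch (illustrative only):
    #
    #   df1.join(df2)                                  # ValueError: overlap
    #   df1.join(df2, lsuffix="_one", rsuffix="_two")  # disambiguated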

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=["a", "b"])
        c = Series(randn(30))
        a["c"] = c
        d = DataFrame(randn(30, 1), columns=["q"])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how="outer")
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(
            np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
        )

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
        other_df.set_index("a", inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ("b", "mean") in result
        assert "b" in result

    def test_join_float64_float32(self):

        a = DataFrame(randn(10, 2), columns=["a", "b"], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=["c"], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes["a"] == "float64"
        assert joined.dtypes["b"] == "float64"
        assert joined.dtypes["c"] == "float32"

        a = np.random.randint(0, 5, 100).astype("int64")
        b = np.random.random(100).astype("float64")
        c = np.random.random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
        rs = df.merge(s, left_on="a", right_index=True)
        assert rs.dtypes["a"] == "int64"
        assert rs.dtypes["b"] == "float64"
        assert rs.dtypes["c"] == "float32"
        assert rs.dtypes["md"] == "float32"

        xp = xpdf.merge(s, left_on="a", right_index=True)
        assert_frame_equal(rs, xp)
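
        # join()/merge() align rows but keep each column's own dtype, so the
        # float32 columns above survive next to float64 ones; only row
        # reindexing that introduces missing values can force an upcast
        # (e.g. int64 to float64).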

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
            }
        )
        s = Series(
            np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST"
        )
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
        right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])

        joined = left.join(right, on="key", sort=True)
        expected = DataFrame(
            {
                "key": ["bar", "baz", "foo", "foo"],
                "value": [2, 3, 1, 4],
                "value2": ["a", "b", "c", "c"],
            },
            index=[1, 2, 0, 3],
        )
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on="key", sort=False)
        tm.assert_index_equal(joined.index, pd.Index(list(range(4))))
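
        # sort=True orders the result by the join key ("bar" < "baz" < "foo"),
        # carrying the original integer index along, hence index [1, 2, 0, 3]
        # in the expected frame above; sort=False keeps left-frame order.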

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame(
            {"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"]
        )
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range("2016-01-01", periods=16, freq="M")
        df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"])
        df2 = concat([df, df])
        result = df.join(df2, how="inner", rsuffix="_df2")
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=["pnum", "pnum_df2"],
            index=df2.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
        df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how="outer")
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how="inner")
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on="a")

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
        df["key"] = ["foo", "bar"] * 4
        df1 = df.loc[:, ["A", "B"]]
        df2 = df.loc[:, ["C", "D"]]
        df3 = df.loc[:, ["key"]]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups
        df = concat(
            [
                DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
                DataFrame(
                    np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
                ),
            ],
            axis=1,
        )

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer"
        )
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
        assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]
        )
        left = DataFrame({"v1": range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list("abc"), list("xy")], names=["abc", "xy"]
        )
        right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)

        result = left.join(right, on=["abc", "xy"], how=join_type)
        expected = (
            left.reset_index()
            .merge(right.reset_index(), on=["abc", "xy"], how=join_type)
            .set_index(["abc", "xy", "num"])
        )
        assert_frame_equal(expected, result)

        msg = (
            r"len\(left_on\) must equal the number of levels in the index" ' of "right"'
        )
        with pytest.raises(ValueError, match=msg):
            left.join(right, on="xy", how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=["abc", "xy"], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931, 26335
        df1 = pd.DataFrame(
            {
                "date": pd.date_range(
                    start="2018-01-01", periods=5, tz="America/Chicago"
                ),
                "vals": list("abcde"),
            }
        )

        df2 = pd.DataFrame(
            {
                "date": pd.date_range(
                    start="2018-01-03", periods=5, tz="America/Chicago"
                ),
                "vals_2": list("tuvwx"),
            }
        )
        result = df1.join(df2.set_index("date"), on="date")
        expected = df1.copy()
        expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object)
        assert_frame_equal(result, expected)


def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"):

    # some smoke tests
    for c in join_col:
        assert result[c].notna().all()

    left_grouped = left.groupby(join_col)
    right_grouped = right.groupby(join_col)

    for group_key, group in result.groupby(join_col):
        l_joined = _restrict_to_columns(group, left.columns, lsuffix)
        r_joined = _restrict_to_columns(group, right.columns, rsuffix)

        try:
            lgroup = left_grouped.get_group(group_key)
        except KeyError:
            if how in ("left", "inner"):
                raise AssertionError(
                    "key {group_key!s} should not have been in the join".format(
                        group_key=group_key
                    )
                )

            _assert_all_na(l_joined, left.columns, join_col)
        else:
            _assert_same_contents(l_joined, lgroup)

        try:
            rgroup = right_grouped.get_group(group_key)
        except KeyError:
            if how in ("right", "inner"):
                raise AssertionError(
                    "key {group_key!s} should not have been in the join".format(
                        group_key=group_key
                    )
                )

            _assert_all_na(r_joined, right.columns, join_col)
        else:
            _assert_same_contents(r_joined, rgroup)


def _restrict_to_columns(group, columns, suffix):
    found = [
        c for c in group.columns if c in columns or c.replace(suffix, "") in columns
    ]

    # filter
    group = group.loc[:, found]

    # get rid of suffixes, if any
    group = group.rename(columns=lambda x: x.replace(suffix, ""))

    # put in the right order...
    group = group.loc[:, columns]

    return group


def _assert_same_contents(join_chunk, source):
    NA_SENTINEL = -1234567  # drop_duplicates not so NA-friendly...

    jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
    svalues = source.fillna(NA_SENTINEL).drop_duplicates().values

    rows = {tuple(row) for row in jvalues}
    assert len(rows) == len(source)
    assert all(tuple(row) in rows for row in svalues)


def _assert_all_na(join_chunk, source_columns, join_col):
    for c in source_columns:
        if c in join_col:
            continue
        assert join_chunk[c].isna().all()


def _join_by_hand(a, b, how="left"):
    join_index = a.index.join(b.index, how=how)

    a_re = a.reindex(join_index)
    b_re = b.reindex(join_index)

    result_columns = a.columns.append(b.columns)

    for col, s in b_re.items():
        a_re[col] = s
    return a_re.reindex(columns=result_columns)
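

# Rough usage sketch for _join_by_hand (illustrative frames, not part of the
# test data above): align both frames on the joined index, then stack the
# right frame's columns onto the left; assumes non-overlapping column names.
#
#   a = DataFrame({"x": [1, 2]}, index=[0, 1])
#   b = DataFrame({"y": [3.0]}, index=[1])
#   _join_by_hand(a, b)  # same rows/columns as a.join(b, how="left")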
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,188 @@
import numpy as np
import pytest

from pandas import DataFrame
from pandas.util.testing import assert_frame_equal


@pytest.fixture
def df1():
    return DataFrame(
        dict(
            outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
            inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
            v1=np.linspace(0, 1, 11),
        )
    )


@pytest.fixture
def df2():
    return DataFrame(
        dict(
            outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
            inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
            v2=np.linspace(10, 11, 12),
        )
    )


@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def left_df(request, df1):
    """Construct left test DataFrame with specified levels
    (any of 'outer', 'inner', and 'v1')"""
    levels = request.param
    if levels:
        df1 = df1.set_index(levels)

    return df1


@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def right_df(request, df2):
    """Construct right test DataFrame with specified levels
    (any of 'outer', 'inner', and 'v2')"""
    levels = request.param

    if levels:
        df2 = df2.set_index(levels)

    return df2


def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
    """
    Compute the expected merge result for the test case.

    This method computes the expected result of merging two DataFrames on
    a combination of their columns and index levels. It does so by
    explicitly dropping/resetting their named index levels, performing a
    merge on their columns, and then finally restoring the appropriate
    index in the result.

    Parameters
    ----------
    df_left : DataFrame
        The left DataFrame (may have zero or more named index levels)
    df_right : DataFrame
        The right DataFrame (may have zero or more named index levels)
    on : list of str
        The on parameter to the merge operation
    left_on : list of str
        The left_on parameter to the merge operation
    right_on : list of str
        The right_on parameter to the merge operation
    how : str
        The how parameter to the merge operation

    Returns
    -------
    DataFrame
        The expected merge result
    """

    # Handle on param if specified
    if on is not None:
        left_on, right_on = on, on

    # Compute input named index levels
    left_levels = [n for n in df_left.index.names if n is not None]
    right_levels = [n for n in df_right.index.names if n is not None]

    # Compute output named index levels
    output_levels = [i for i in left_on if i in right_levels and i in left_levels]

    # Drop index levels that aren't involved in the merge
    drop_left = [n for n in left_levels if n not in left_on]
    if drop_left:
        df_left = df_left.reset_index(drop_left, drop=True)

    drop_right = [n for n in right_levels if n not in right_on]
    if drop_right:
        df_right = df_right.reset_index(drop_right, drop=True)

    # Convert remaining index levels to columns
    reset_left = [n for n in left_levels if n in left_on]
    if reset_left:
        df_left = df_left.reset_index(level=reset_left)

    reset_right = [n for n in right_levels if n in right_on]
    if reset_right:
        df_right = df_right.reset_index(level=reset_right)

    # Perform merge
    expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)

    # Restore index levels
    if output_levels:
        expected = expected.set_index(output_levels)

    return expected
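

# Rough usage sketch (hypothetical frames): for a left frame indexed by
# "outer" and a right frame carrying "outer" as a plain column,
#
#   compute_expected(df1.set_index("outer"), df2, on=["outer"], how="inner")
#
# resets "outer" to a column on the left, merges on the columns, and then
# restores "outer" as the result's index, mirroring what merge() itself is
# expected to do with index levels.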


@pytest.mark.parametrize(
    "on,how",
    [
        (["outer"], "inner"),
        (["inner"], "left"),
        (["outer", "inner"], "right"),
        (["inner", "outer"], "outer"),
    ],
)
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):

    # Construct expected result
    expected = compute_expected(left_df, right_df, on=on, how=how)

    # Perform merge
    result = left_df.merge(right_df, on=on, how=how)
    assert_frame_equal(result, expected, check_like=True)


@pytest.mark.parametrize(
    "left_on,right_on,how",
    [
        (["outer"], ["outer"], "inner"),
        (["inner"], ["inner"], "right"),
        (["outer", "inner"], ["outer", "inner"], "left"),
        (["inner", "outer"], ["inner", "outer"], "outer"),
    ],
)
def test_merge_indexes_and_columns_lefton_righton(
    left_df, right_df, left_on, right_on, how
):

    # Construct expected result
    expected = compute_expected(
        left_df, right_df, left_on=left_on, right_on=right_on, how=how
    )

    # Perform merge
    result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
    assert_frame_equal(result, expected, check_like=True)


@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):

    # Construct left_df
    left_df = df1.set_index(left_index)

    # Construct right_df
    right_df = df2.set_index(["outer", "inner"])

    # Result
    expected = (
        left_df.reset_index()
        .join(
            right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
        )
        .set_index(left_index)
    )

    # Perform join
    result = left_df.join(
        right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
    )

    assert_frame_equal(result, expected, check_like=True)
@@ -0,0 +1,117 @@
from numpy import nan
import pytest

import pandas as pd
from pandas import DataFrame, merge_ordered
from pandas.util.testing import assert_frame_equal


class TestMergeOrdered:
    def setup_method(self, method):
        self.left = DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})

        self.right = DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})

    def test_basic(self):
        result = merge_ordered(self.left, self.right, on="key")
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"],
                "lvalue": [1, nan, 2, nan, 3, nan],
                "rvalue": [nan, 1, 2, 3, nan, 4],
            }
        )

        assert_frame_equal(result, expected)
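
        # merge_ordered performs an ordered, outer-style merge: the result
        # holds the sorted union of keys from both frames with NaN on the
        # missing side, which is why "b", "d" and "f" carry NaN lvalues above.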

    def test_ffill(self):
        result = merge_ordered(self.left, self.right, on="key", fill_method="ffill")
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"],
                "lvalue": [1.0, 1, 2, 2, 3, 3.0],
                "rvalue": [nan, 1, 2, 3, 3, 4],
            }
        )
        assert_frame_equal(result, expected)

    def test_multigroup(self):
        left = pd.concat([self.left, self.left], ignore_index=True)

        left["group"] = ["a"] * 3 + ["b"] * 3

        result = merge_ordered(
            left, self.right, on="key", left_by="group", fill_method="ffill"
        )
        expected = DataFrame(
            {
                "key": ["a", "b", "c", "d", "e", "f"] * 2,
                "lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
                "rvalue": [nan, 1, 2, 3, 3, 4] * 2,
            }
        )
        expected["group"] = ["a"] * 6 + ["b"] * 6

        assert_frame_equal(result, expected.loc[:, result.columns])

        result2 = merge_ordered(
            self.right, left, on="key", right_by="group", fill_method="ffill"
        )
        assert_frame_equal(result, result2.loc[:, result.columns])

        result = merge_ordered(left, self.right, on="key", left_by="group")
        assert result["group"].notna().all()

    def test_merge_type(self):
        class NotADataFrame(DataFrame):
            @property
            def _constructor(self):
                return NotADataFrame

        nad = NotADataFrame(self.left)
        result = nad.merge(self.right, on="key")

        assert isinstance(result, NotADataFrame)

    def test_empty_sequence_concat(self):
        # GH 9157
        empty_pat = "[Nn]o objects"
        none_pat = "objects.*None"
        test_cases = [
            ((), empty_pat),
            ([], empty_pat),
            ({}, empty_pat),
            ([None], none_pat),
            ([None, None], none_pat),
        ]
        for df_seq, pattern in test_cases:
            with pytest.raises(ValueError, match=pattern):
                pd.concat(df_seq)

        pd.concat([pd.DataFrame()])
        pd.concat([None, pd.DataFrame()])
        pd.concat([pd.DataFrame(), None])

    def test_doc_example(self):
        left = DataFrame(
            {
                "group": list("aaabbb"),
                "key": ["a", "c", "e", "a", "c", "e"],
                "lvalue": [1, 2, 3] * 2,
            }
        )

        right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})

        result = merge_ordered(left, right, fill_method="ffill", left_by="group")

        expected = DataFrame(
            {
                "group": list("aaaaabbbbb"),
                "key": ["a", "b", "c", "d", "e"] * 2,
                "lvalue": [1, 1, 2, 2, 3] * 2,
                "rvalue": [nan, 1, 2, 3, 3] * 2,
            }
        )

        assert_frame_equal(result, expected)
@@ -0,0 +1,810 @@
from collections import OrderedDict

import numpy as np
from numpy import nan
from numpy.random import randn
import pytest

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge
import pandas.util.testing as tm


@pytest.fixture
def left():
    """left dataframe (not multi-indexed) for multi-index join tests"""
    # a little relevant example with NAs
    key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
    key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]

    data = np.random.randn(len(key1))
    return DataFrame({"key1": key1, "key2": key2, "data": data})


@pytest.fixture
def right():
    """right dataframe (multi-indexed) for multi-index join tests"""
    index = MultiIndex(
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["key1", "key2"],
    )

    return DataFrame(
        np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
    )


@pytest.fixture
def left_multi():
    return DataFrame(
        dict(
            Origin=["A", "A", "B", "B", "C"],
            Destination=["A", "B", "A", "C", "A"],
            Period=["AM", "AM", "IP", "AM", "OP"],
            TripPurp=["hbw", "nhb", "hbo", "nhb", "hbw"],
            Trips=[1987, 3647, 2470, 4296, 4444],
        ),
        columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
    ).set_index(["Origin", "Destination", "Period", "TripPurp"])


@pytest.fixture
def right_multi():
    return DataFrame(
        dict(
            Origin=["A", "A", "B", "B", "C", "C", "E"],
            Destination=["A", "B", "A", "B", "A", "B", "F"],
            Period=["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
            LinkType=["a", "b", "c", "b", "a", "b", "a"],
            Distance=[100, 80, 90, 80, 75, 35, 55],
        ),
        columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
    ).set_index(["Origin", "Destination", "Period", "LinkType"])


@pytest.fixture
def on_cols_multi():
    return ["Origin", "Destination", "Period"]


@pytest.fixture
def idx_cols_multi():
    return ["Origin", "Destination", "Period", "TripPurp", "LinkType"]


class TestMergeMulti:
    def setup_method(self):
        self.index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        self.to_join = DataFrame(
            np.random.randn(10, 3),
            index=self.index,
            columns=["j_one", "j_two", "j_three"],
        )

        # a little relevant example with NAs
        key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        self.data = DataFrame({"key1": key1, "key2": key2, "data": data})

    def test_merge_on_multikey(self, left, right, join_type):
        on_cols = ["key1", "key2"]
        result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)

        expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type)

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
            drop=True
        )

        expected = pd.merge(
            left, right.reset_index(), on=on_cols, how=join_type, sort=True
        )

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("sort", [False, True])
    def test_left_join_multi_index(self, left, right, sort):
        icols = ["1st", "2nd", "3rd"]

        def bind_cols(df):
            iord = lambda a: 0 if a != a else ord(a)
            f = lambda ts: ts.map(iord) - ord("a")
            return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4

        def run_asserts(left, right, sort):
            res = left.join(right, on=icols, how="left", sort=sort)

            assert len(left) < len(res) + 1
            assert not res["4th"].isna().any()
            assert not res["5th"].isna().any()

            tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
            result = bind_cols(res.iloc[:, :-2])
            tm.assert_series_equal(res["4th"], result, check_names=False)
            assert result.name is None

            if sort:
                tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))

            out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")

            res.index = np.arange(len(res))
            tm.assert_frame_equal(out, res)

        lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
        left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"])
        left.insert(1, "2nd", np.random.randint(0, 1000, len(left)))

        i = np.random.permutation(len(left))
        right = left.iloc[i].copy()

        left["4th"] = bind_cols(left)
        right["5th"] = -bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)

        # inject some nulls
        left.loc[1::23, "1st"] = np.nan
        left.loc[2::37, "2nd"] = np.nan
        left.loc[3::43, "3rd"] = np.nan
        left["4th"] = bind_cols(left)

        i = np.random.permutation(len(left))
        right = left.iloc[i, :-1]
        right["5th"] = -bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)

    @pytest.mark.parametrize("sort", [False, True])
    def test_merge_right_vs_left(self, left, right, sort):
        # compare left vs right merge with multikey
        on_cols = ["key1", "key2"]
        merged_left_right = left.merge(
            right, left_on=on_cols, right_index=True, how="left", sort=sort
        )

        merge_right_left = right.merge(
            left, right_on=on_cols, left_index=True, how="right", sort=sort
        )

        # Reorder columns
        merge_right_left = merge_right_left[merged_left_right.columns]

        tm.assert_frame_equal(merged_left_right, merge_right_left)

    def test_compress_group_combinations(self):

        # ~ 40000000 possible unique groups
        key1 = tm.rands_array(10, 10000)
        key1 = np.tile(key1, 2)
        key2 = key1[::-1]

        df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)})

        df2 = DataFrame(
            {"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)}
        )

        # just to hit the label compression code path
        merge(df, df2, how="outer")

    def test_left_join_index_preserve_order(self):

        on_cols = ["k1", "k2"]
        left = DataFrame(
            {
                "k1": [0, 1, 2] * 8,
                "k2": ["foo", "bar"] * 12,
                "v": np.array(np.arange(24), dtype=np.int64),
            }
        )

        index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
        right = DataFrame({"v2": [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected["v2"] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7

        tm.assert_frame_equal(result, expected)

        result.sort_values(on_cols, kind="mergesort", inplace=True)
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

        # test join with multi dtypes blocks
        left = DataFrame(
            {
                "k1": [0, 1, 2] * 8,
                "k2": ["foo", "bar"] * 12,
                "k3": np.array([0, 1, 2] * 8, dtype=np.float32),
                "v": np.array(np.arange(24), dtype=np.int32),
            }
        )

        index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
        right = DataFrame({"v2": [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected["v2"] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7

        tm.assert_frame_equal(result, expected)

        result = result.sort_values(on_cols, kind="mergesort")
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

    def test_left_join_index_multi_match_multiindex(self):
        left = DataFrame(
            [
                ["X", "Y", "C", "a"],
                ["W", "Y", "C", "e"],
                ["V", "Q", "A", "h"],
                ["V", "R", "D", "i"],
                ["X", "Y", "D", "b"],
                ["X", "Y", "A", "c"],
                ["W", "Q", "B", "f"],
                ["W", "R", "C", "g"],
                ["V", "Y", "C", "j"],
                ["X", "Y", "B", "d"],
            ],
            columns=["cola", "colb", "colc", "tag"],
            index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
        )

        right = DataFrame(
            [
                ["W", "R", "C", 0],
                ["W", "Q", "B", 3],
                ["W", "Q", "B", 8],
                ["X", "Y", "A", 1],
                ["X", "Y", "A", 4],
                ["X", "Y", "B", 5],
                ["X", "Y", "C", 6],
                ["X", "Y", "C", 9],
                ["X", "Q", "C", -6],
                ["X", "R", "C", -9],
                ["V", "Y", "C", 7],
                ["V", "R", "D", 2],
                ["V", "R", "D", -1],
                ["V", "Q", "A", -3],
            ],
            columns=["col1", "col2", "col3", "val"],
        ).set_index(["col1", "col2", "col3"])

        result = left.join(right, on=["cola", "colb", "colc"], how="left")

        expected = DataFrame(
            [
                ["X", "Y", "C", "a", 6],
                ["X", "Y", "C", "a", 9],
                ["W", "Y", "C", "e", nan],
                ["V", "Q", "A", "h", -3],
                ["V", "R", "D", "i", 2],
                ["V", "R", "D", "i", -1],
                ["X", "Y", "D", "b", nan],
                ["X", "Y", "A", "c", 1],
                ["X", "Y", "A", "c", 4],
                ["W", "Q", "B", "f", 3],
                ["W", "Q", "B", "f", 8],
                ["W", "R", "C", "g", 0],
                ["V", "Y", "C", "j", 7],
                ["X", "Y", "B", "d", 5],
            ],
            columns=["cola", "colb", "colc", "tag", "val"],
            index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
        )

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)

        expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")

        tm.assert_frame_equal(result, expected)

    def test_left_join_index_multi_match(self):
        left = DataFrame(
            [["c", 0], ["b", 1], ["a", 2], ["b", 3]],
            columns=["tag", "val"],
            index=[2, 0, 1, 3],
        )

        right = DataFrame(
            [
                ["a", "v"],
                ["c", "w"],
                ["c", "x"],
                ["d", "y"],
                ["a", "z"],
                ["c", "r"],
                ["e", "q"],
                ["c", "s"],
            ],
            columns=["tag", "char"],
        ).set_index("tag")

        result = left.join(right, on="tag", how="left")

        expected = DataFrame(
            [
                ["c", 0, "w"],
                ["c", 0, "x"],
                ["c", 0, "r"],
                ["c", 0, "s"],
                ["b", 1, nan],
                ["a", 2, "v"],
                ["a", 2, "z"],
                ["b", 3, nan],
            ],
            columns=["tag", "val", "char"],
            index=[2, 2, 2, 2, 0, 1, 1, 3],
        )

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on="tag", how="left", sort=True)
        expected2 = expected.sort_values("tag", kind="mergesort")

        tm.assert_frame_equal(result, expected2)

        # GH7331 - maintain left frame order in left merge
        result = merge(left, right.reset_index(), how="left", on="tag")
        expected.index = np.arange(len(expected))
        tm.assert_frame_equal(result, expected)

    def test_left_merge_na_buglet(self):
        left = DataFrame(
            {
                "id": list("abcde"),
                "v1": randn(5),
                "v2": randn(5),
                "dummy": list("abcde"),
                "v3": randn(5),
            },
            columns=["id", "v1", "v2", "dummy", "v3"],
        )
        right = DataFrame(
            {
                "id": ["a", "b", np.nan, np.nan, np.nan],
                "sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
            }
        )

        result = merge(left, right, on="id", how="left")

        rdf = right.drop(["id"], axis=1)
        expected = left.join(rdf)
        tm.assert_frame_equal(result, expected)

    def test_merge_na_keys(self):
        data = [
            [1950, "A", 1.5],
            [1950, "B", 1.5],
            [1955, "B", 1.5],
            [1960, "B", np.nan],
            [1970, "B", 4.0],
            [1950, "C", 4.0],
            [1960, "C", np.nan],
            [1965, "C", 3.0],
            [1970, "C", 4.0],
        ]

        frame = DataFrame(data, columns=["year", "panel", "data"])

        other_data = [
            [1960, "A", np.nan],
            [1970, "A", np.nan],
            [1955, "A", np.nan],
            [1965, "A", np.nan],
            [1965, "B", np.nan],
            [1955, "C", np.nan],
        ]
        other = DataFrame(other_data, columns=["year", "panel", "data"])

        result = frame.merge(other, how="outer")

        expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
        expected = expected.replace(-999, np.nan)

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
    def test_merge_datetime_index(self, klass):
        # see gh-19038
        df = DataFrame(
            [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
        )
        df.index = pd.to_datetime(df.index)
        on_vector = df.index.year

        if klass is not None:
            on_vector = klass(on_vector)

        expected = DataFrame(
            OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])])
        )

        result = df.merge(df, on=["a", on_vector], how="inner")
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            OrderedDict(
                [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])]
            )
        )

        result = df.merge(df, on=[df.index.year], how="inner")
        tm.assert_frame_equal(result, expected)
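
        # Passing an array-like (here df.index.year) inside `on` merges on
        # that vector directly; pandas materializes it in the result as a
        # positional "key_N" column, which is why the expected frames above
        # contain "key_1" and "key_0".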
|
||||
|
||||
    def test_join_multi_levels(self):

        # GH 3662
        # merge multi-levels
        household = DataFrame(
            dict(
                household_id=[1, 2, 3],
                male=[0, 1, 0],
                wealth=[196087.3, 316478.7, 294750],
            ),
            columns=["household_id", "male", "wealth"],
        ).set_index("household_id")
        portfolio = DataFrame(
            dict(
                household_id=[1, 2, 2, 3, 3, 3, 4],
                asset_id=[
                    "nl0000301109",
                    "nl0000289783",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                    np.nan,
                ],
                name=[
                    "ABN Amro",
                    "Robeco",
                    "Royal Dutch Shell",
                    "Royal Dutch Shell",
                    "AAB Eastern Europe Equity Fund",
                    "Postbank BioTech Fonds",
                    np.nan,
                ],
                share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            ),
            columns=["household_id", "asset_id", "name", "share"],
        ).set_index(["household_id", "asset_id"])
        result = household.join(portfolio, how="inner")
        expected = (
            DataFrame(
                dict(
                    male=[0, 1, 1, 0, 0, 0],
                    wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0],
                    name=[
                        "ABN Amro",
                        "Robeco",
                        "Royal Dutch Shell",
                        "Royal Dutch Shell",
                        "AAB Eastern Europe Equity Fund",
                        "Postbank BioTech Fonds",
                    ],
                    share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
                    household_id=[1, 2, 2, 3, 3, 3],
                    asset_id=[
                        "nl0000301109",
                        "nl0000289783",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "lu0197800237",
                        "nl0000289965",
                    ],
                )
            )
            .set_index(["household_id", "asset_id"])
            .reindex(columns=["male", "wealth", "name", "share"])
        )
        tm.assert_frame_equal(result, expected)

        # equivalency
        result = merge(
            household.reset_index(),
            portfolio.reset_index(),
            on=["household_id"],
            how="inner",
        ).set_index(["household_id", "asset_id"])
        tm.assert_frame_equal(result, expected)

        result = household.join(portfolio, how="outer")
        expected = concat(
            [
                expected,
                (
                    DataFrame(
                        dict(share=[1.00]),
                        index=MultiIndex.from_tuples(
                            [(4, np.nan)], names=["household_id", "asset_id"]
                        ),
                    )
                ),
            ],
            axis=0,
            sort=True,
        ).reindex(columns=expected.columns)
        tm.assert_frame_equal(result, expected)

        # invalid cases
        household.index.name = "foo"

        with pytest.raises(ValueError):
            household.join(portfolio, how="inner")

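        # NB: Index.set_names returns a new Index unless inplace=True, so the
        # call below does not actually rename portfolio2's levels in place.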
        portfolio2 = portfolio.copy()
        portfolio2.index.set_names(["household_id", "foo"])

        with pytest.raises(ValueError):
            portfolio2.join(portfolio, how="inner")

    def test_join_multi_levels2(self):

        # some more advanced merges
        # GH6360
        household = DataFrame(
            dict(
                household_id=[1, 2, 2, 3, 3, 3, 4],
                asset_id=[
                    "nl0000301109",
                    "nl0000301109",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                    np.nan,
                ],
                share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            ),
            columns=["household_id", "asset_id", "share"],
        ).set_index(["household_id", "asset_id"])

        log_return = DataFrame(
            dict(
                asset_id=[
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "lu0197800237",
                ],
                t=[233, 234, 235, 180, 181],
                log_return=[0.09604978, -0.06524096, 0.03532373, 0.03025441, 0.036997],
            )
        ).set_index(["asset_id", "t"])

        expected = (
            DataFrame(
                dict(
                    household_id=[2, 2, 2, 3, 3, 3, 3, 3],
                    asset_id=[
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "lu0197800237",
                        "lu0197800237",
                    ],
                    t=[233, 234, 235, 233, 234, 235, 180, 181],
                    share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
                    log_return=[
                        0.09604978,
                        -0.06524096,
                        0.03532373,
                        0.09604978,
                        -0.06524096,
                        0.03532373,
                        0.03025441,
                        0.036997,
                    ],
                )
            )
            .set_index(["household_id", "asset_id", "t"])
            .reindex(columns=["share", "log_return"])
        )

        # this is the equivalency
        result = merge(
            household.reset_index(),
            log_return.reset_index(),
            on=["asset_id"],
            how="inner",
        ).set_index(["household_id", "asset_id", "t"])
        tm.assert_frame_equal(result, expected)

        expected = (
            DataFrame(
                dict(
                    household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
                    asset_id=[
                        "nl0000301109",
                        "nl0000301109",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "gb00b03mlx29",
                        "lu0197800237",
                        "lu0197800237",
                        "nl0000289965",
                        None,
                    ],
                    t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None],
                    share=[
                        1.0,
                        0.4,
                        0.6,
                        0.6,
                        0.6,
                        0.15,
                        0.15,
                        0.15,
                        0.6,
                        0.6,
                        0.25,
                        1.0,
                    ],
                    log_return=[
                        None,
                        None,
                        0.09604978,
                        -0.06524096,
                        0.03532373,
                        0.09604978,
                        -0.06524096,
                        0.03532373,
                        0.03025441,
                        0.036997,
                        None,
                        None,
                    ],
                )
            )
            .set_index(["household_id", "asset_id", "t"])
            .reindex(columns=["share", "log_return"])
        )

        result = merge(
            household.reset_index(),
            log_return.reset_index(),
            on=["asset_id"],
            how="outer",
        ).set_index(["household_id", "asset_id", "t"])

        tm.assert_frame_equal(result, expected)


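# left_multi, right_multi, join_type, on_cols_multi and idx_cols_multi are
# pytest fixtures supplied elsewhere (presumably the suite's conftest); each
# test checks that a MultiIndex join agrees with the
# reset_index/merge/set_index round trip.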
class TestJoinMultiMulti:
    def test_join_multi_multi(
        self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
    ):
        # Multi-index join tests
        expected = (
            pd.merge(
                left_multi.reset_index(),
                right_multi.reset_index(),
                how=join_type,
                on=on_cols_multi,
            )
            .set_index(idx_cols_multi)
            .sort_index()
        )

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    def test_join_multi_empty_frames(
        self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
    ):

        left_multi = left_multi.drop(columns=left_multi.columns)
        right_multi = right_multi.drop(columns=right_multi.columns)

        expected = (
            pd.merge(
                left_multi.reset_index(),
                right_multi.reset_index(),
                how=join_type,
                on=on_cols_multi,
            )
            .set_index(idx_cols_multi)
            .sort_index()
        )

        result = left_multi.join(right_multi, how=join_type).sort_index()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
    def test_merge_datetime_index(self, box):
        # see gh-19038
        df = DataFrame(
            [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
        )
        df.index = pd.to_datetime(df.index)
        on_vector = df.index.year

        if box is not None:
            on_vector = box(on_vector)

        expected = DataFrame(
            OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])])
        )

        result = df.merge(df, on=["a", on_vector], how="inner")
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            OrderedDict(
                [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])]
            )
        )

        result = df.merge(df, on=[df.index.year], how="inner")
        tm.assert_frame_equal(result, expected)

    def test_single_common_level(self):
        index_left = pd.MultiIndex.from_tuples(
            [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
        )

        left = pd.DataFrame(
            {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left
        )

        index_right = pd.MultiIndex.from_tuples(
            [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
        )

        right = pd.DataFrame(
            {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
            index=index_right,
        )

        result = left.join(right)
        expected = pd.merge(
            left.reset_index(), right.reset_index(), on=["key"], how="inner"
        ).set_index(["key", "X", "Y"])

        tm.assert_frame_equal(result, expected)
2763 venv/lib/python3.6/site-packages/pandas/tests/reshape/test_concat.py (new file; diff suppressed because it is too large)
@@ -0,0 +1,587 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    DatetimeIndex,
    Index,
    Interval,
    IntervalIndex,
    Series,
    TimedeltaIndex,
    Timestamp,
    cut,
    date_range,
    isna,
    qcut,
    timedelta_range,
    to_datetime,
)
from pandas.api.types import CategoricalDtype as CDT
import pandas.core.reshape.tile as tmod
import pandas.util.testing as tm


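# cut() bins values into equal-width intervals; labels=False returns the
# integer bin codes rather than an interval-valued Categorical.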
def test_simple():
    data = np.ones(5, dtype="int64")
    result = cut(data, 4, labels=False)

    expected = np.array([1, 1, 1, 1, 1])
    tm.assert_numpy_array_equal(result, expected, check_dtype=False)


def test_bins():
    data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1])
    result, bins = cut(data, 3, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3))
    intervals = intervals.take([0, 0, 0, 1, 2, 0])
    expected = Categorical(intervals, ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))


def test_right():
    data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    result, bins = cut(data, 4, right=True, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3))
    expected = Categorical(intervals, ordered=True)
    expected = expected.take([0, 0, 0, 2, 3, 0, 0])

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))


def test_no_right():
    data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    result, bins = cut(data, 4, right=False, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
    intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
    expected = Categorical(intervals, ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))


def test_array_like():
    data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
    result, bins = cut(data, 3, retbins=True)

    intervals = IntervalIndex.from_breaks(bins.round(3))
    intervals = intervals.take([0, 0, 0, 1, 2, 0])
    expected = Categorical(intervals, ordered=True)

    tm.assert_categorical_equal(result, expected)
    tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))


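# An IntervalIndex (e.g. the categories of an earlier cut) can itself be
# passed as ``bins``; values that fall outside those intervals get code -1.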
def test_bins_from_interval_index():
    c = cut(range(5), 3)
    expected = c
    result = cut(range(5), bins=expected.categories)
    tm.assert_categorical_equal(result, expected)

    expected = Categorical.from_codes(
        np.append(c.codes, -1), categories=c.categories, ordered=True
    )
    result = cut(range(6), bins=expected.categories)
    tm.assert_categorical_equal(result, expected)


def test_bins_from_interval_index_doc_example():
    # Make sure we preserve the bins.
    ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
    c = cut(ages, bins=[0, 18, 35, 70])
    expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
    tm.assert_index_equal(c.categories, expected)

    result = cut([25, 20, 50], bins=c.categories)
    tm.assert_index_equal(result.categories, expected)
    tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8"))


def test_bins_not_overlapping_from_interval_index():
    # see gh-23980
    msg = "Overlapping IntervalIndex is not accepted"
    ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])

    with pytest.raises(ValueError, match=msg):
        cut([5, 6], bins=ii)


def test_bins_not_monotonic():
    msg = "bins must increase monotonically"
    data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]

    with pytest.raises(ValueError, match=msg):
        cut(data, [0.1, 1.5, 1, 10])


@pytest.mark.parametrize(
    "x, bins, expected",
    [
        (
            date_range("2017-12-31", periods=3),
            [Timestamp.min, Timestamp("2018-01-01"), Timestamp.max],
            IntervalIndex.from_tuples(
                [
                    (Timestamp.min, Timestamp("2018-01-01")),
                    (Timestamp("2018-01-01"), Timestamp.max),
                ]
            ),
        ),
        (
            [-1, 0, 1],
            np.array(
                [np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"
            ),
            IntervalIndex.from_tuples(
                [(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)]
            ),
        ),
        (
            [np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)],
            np.array(
                [
                    np.timedelta64(-np.iinfo(np.int64).max),
                    np.timedelta64(0),
                    np.timedelta64(np.iinfo(np.int64).max),
                ]
            ),
            IntervalIndex.from_tuples(
                [
                    (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)),
                    (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max)),
                ]
            ),
        ),
    ],
)
def test_bins_monotonic_not_overflowing(x, bins, expected):
    # GH 26045
    result = cut(x, bins)
    tm.assert_index_equal(result.categories, expected)


def test_wrong_num_labels():
    msg = "Bin labels must be one fewer than the number of bin edges"
    data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]

    with pytest.raises(ValueError, match=msg):
        cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])


@pytest.mark.parametrize(
    "x,bins,msg",
    [
        ([], 2, "Cannot cut empty array"),
        ([1, 2, 3], 0.5, "`bins` should be a positive integer"),
    ],
)
def test_cut_corner(x, bins, msg):
    with pytest.raises(ValueError, match=msg):
        cut(x, bins)


@pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize("cut_func", [cut, qcut])
def test_cut_not_1d_arg(arg, cut_func):
    msg = "Input array must be 1 dimensional"
    with pytest.raises(ValueError, match=msg):
        cut_func(arg, 2)


@pytest.mark.parametrize(
    "data",
    [
        [0, 1, 2, 3, 4, np.inf],
        [-np.inf, 0, 1, 2, 3, 4],
        [-np.inf, 0, 1, 2, 3, 4, np.inf],
    ],
)
def test_int_bins_with_inf(data):
    # GH 24314
    msg = "cannot specify integer `bins` when input data contains infinity"
    with pytest.raises(ValueError, match=msg):
        cut(data, bins=3)


def test_cut_out_of_range_more():
    # see gh-1511
    name = "x"

    ser = Series([0, -1, 0, 1, -3], name=name)
    ind = cut(ser, [0, 1], labels=False)

    exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
    tm.assert_series_equal(ind, exp)


@pytest.mark.parametrize(
    "right,breaks,closed",
    [
        (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
        (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"),
    ],
)
def test_labels(right, breaks, closed):
    arr = np.tile(np.arange(0, 1.01, 0.1), 4)

    result, bins = cut(arr, 4, retbins=True, right=right)
    ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
    tm.assert_index_equal(result.categories, ex_levels)


def test_cut_pass_series_name_to_factor():
    name = "foo"
    ser = Series(np.random.randn(100), name=name)

    factor = cut(ser, 4)
    assert factor.name == name


def test_label_precision():
    arr = np.arange(0, 0.73, 0.01)
    result = cut(arr, 4, precision=2)

    ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
    tm.assert_index_equal(result.categories, ex_levels)


@pytest.mark.parametrize("labels", [None, False])
def test_na_handling(labels):
    arr = np.arange(0, 0.75, 0.01)
    arr[::3] = np.nan

    result = cut(arr, 4, labels=labels)
    result = np.asarray(result)

    expected = np.where(isna(arr), np.nan, result)
    tm.assert_almost_equal(result, expected)


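# Infinite edges are fine when ``bins`` is an explicit list (contrast with
# integer ``bins`` and infinite data, rejected in test_int_bins_with_inf).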
def test_inf_handling():
    data = np.arange(6)
    data_ser = Series(data, dtype="int64")

    bins = [-np.inf, 2, 4, np.inf]
    result = cut(data, bins)
    result_ser = cut(data_ser, bins)

    ex_uniques = IntervalIndex.from_breaks(bins)
    tm.assert_index_equal(result.categories, ex_uniques)

    assert result[5] == Interval(4, np.inf)
    assert result[0] == Interval(-np.inf, 2)
    assert result_ser[5] == Interval(4, np.inf)
    assert result_ser[0] == Interval(-np.inf, 2)


def test_cut_out_of_bounds():
    arr = np.random.randn(100)
    result = cut(arr, [-1, 0, 1])

    mask = isna(result)
    ex_mask = (arr < -1) | (arr > 1)
    tm.assert_numpy_array_equal(mask, ex_mask)


@pytest.mark.parametrize(
    "get_labels,get_expected",
    [
        (
            lambda labels: labels,
            lambda labels: Categorical(
                ["Medium"] + 4 * ["Small"] + ["Medium", "Large"],
                categories=labels,
                ordered=True,
            ),
        ),
        (
            lambda labels: Categorical.from_codes([0, 1, 2], labels),
            lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels),
        ),
    ],
)
def test_cut_pass_labels(get_labels, get_expected):
    bins = [0, 25, 50, 100]
    arr = [50, 5, 10, 15, 20, 30, 70]
    labels = ["Small", "Medium", "Large"]

    result = cut(arr, bins, labels=get_labels(labels))
    tm.assert_categorical_equal(result, get_expected(labels))


def test_cut_pass_labels_compat():
    # see gh-16459
    arr = [50, 5, 10, 15, 20, 30, 70]
    labels = ["Good", "Medium", "Bad"]

    result = cut(arr, 3, labels=labels)
    exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True))
    tm.assert_categorical_equal(result, exp)


@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10])
def test_round_frac_just_works(x):
    # It works.
    cut(x, 2)


@pytest.mark.parametrize(
    "val,precision,expected",
    [
        (-117.9998, 3, -118),
        (117.9998, 3, 118),
        (117.9998, 2, 118),
        (0.000123456, 2, 0.00012),
    ],
)
def test_round_frac(val, precision, expected):
    # see gh-1979
    result = tmod._round_frac(val, precision=precision)
    assert result == expected


def test_cut_return_intervals():
    ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
    result = cut(ser, 3)

    exp_bins = np.linspace(0, 8, num=4).round(3)
    exp_bins[0] -= 0.008

    expected = Series(
        IntervalIndex.from_breaks(exp_bins, closed="right").take(
            [0, 0, 0, 1, 1, 1, 2, 2, 2]
        )
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)


def test_series_ret_bins():
    # see gh-8589
    ser = Series(np.arange(4))
    result, bins = cut(ser, 2, retbins=True)

    expected = Series(
        IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2)
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,msg",
    [
        (dict(duplicates="drop"), None),
        (dict(), "Bin edges must be unique"),
        (dict(duplicates="raise"), "Bin edges must be unique"),
        (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"),
    ],
)
def test_cut_duplicates_bin(kwargs, msg):
    # see gh-20947
    bins = [0, 2, 4, 6, 10, 10]
    values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            cut(values, bins, **kwargs)
    else:
        result = cut(values, bins, **kwargs)
        expected = cut(values, pd.unique(bins))
        tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
@pytest.mark.parametrize("length", [1, 2])
def test_single_bin(data, length):
    # see gh-14652, gh-15428
    ser = Series([data] * length)
    result = cut(ser, 1, labels=False)

    expected = Series([0] * length)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)]
)
def test_cut_read_only(array_1_writeable, array_2_writeable):
    # issue 18773
    array_1 = np.arange(0, 100, 10)
    array_1.flags.writeable = array_1_writeable

    array_2 = np.arange(0, 100, 10)
    array_2.flags.writeable = array_2_writeable

    hundred_elements = np.arange(100)
    tm.assert_categorical_equal(
        cut(hundred_elements, array_1), cut(hundred_elements, array_2)
    )


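# Datetime bin edges are accepted in several representations (Timestamp,
# to_datetime output, datetime64, pydatetime); all should bin identically.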
@pytest.mark.parametrize(
    "conv",
    [
        lambda v: Timestamp(v),
        lambda v: to_datetime(v),
        lambda v: np.datetime64(v),
        lambda v: Timestamp(v).to_pydatetime(),
    ],
)
def test_datetime_bin(conv):
    data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
    bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]

    expected = Series(
        IntervalIndex(
            [
                Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
                Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
            ]
        )
    ).astype(CDT(ordered=True))

    bins = [conv(v) for v in bin_data]
    result = Series(cut(data, bins=bins))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
        [
            np.datetime64("2013-01-01"),
            np.datetime64("2013-01-02"),
            np.datetime64("2013-01-03"),
        ],
        np.array(
            [
                np.datetime64("2013-01-01"),
                np.datetime64("2013-01-02"),
                np.datetime64("2013-01-03"),
            ]
        ),
        DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]),
    ],
)
def test_datetime_cut(data):
    # see gh-14714
    #
    # Testing time data when it comes in various collection types.
    result, _ = cut(data, 3, retbins=True)
    expected = Series(
        IntervalIndex(
            [
                Interval(
                    Timestamp("2012-12-31 23:57:07.200000"),
                    Timestamp("2013-01-01 16:00:00"),
                ),
                Interval(
                    Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00")
                ),
                Interval(
                    Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00")
                ),
            ]
        )
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(Series(result), expected)


@pytest.mark.parametrize(
    "bins",
    [
        3,
        [
            Timestamp("2013-01-01 04:57:07.200000"),
            Timestamp("2013-01-01 21:00:00"),
            Timestamp("2013-01-02 13:00:00"),
            Timestamp("2013-01-03 05:00:00"),
        ],
    ],
)
@pytest.mark.parametrize("box", [list, np.array, Index, Series])
def test_datetime_tz_cut(bins, box):
    # see gh-19872
    tz = "US/Eastern"
    s = Series(date_range("20130101", periods=3, tz=tz))

    if not isinstance(bins, int):
        bins = box(bins)

    result = cut(s, bins)
    expected = Series(
        IntervalIndex(
            [
                Interval(
                    Timestamp("2012-12-31 23:57:07.200000", tz=tz),
                    Timestamp("2013-01-01 16:00:00", tz=tz),
                ),
                Interval(
                    Timestamp("2013-01-01 16:00:00", tz=tz),
                    Timestamp("2013-01-02 08:00:00", tz=tz),
                ),
                Interval(
                    Timestamp("2013-01-02 08:00:00", tz=tz),
                    Timestamp("2013-01-03 00:00:00", tz=tz),
                ),
            ]
        )
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)


def test_datetime_nan_error():
    msg = "bins must be of datetime64 dtype"

    with pytest.raises(ValueError, match=msg):
        cut(date_range("20130101", periods=3), bins=[0, 2, 4])


def test_datetime_nan_mask():
    result = cut(
        date_range("20130102", periods=5), bins=date_range("20130101", periods=2)
    )

    mask = result.categories.isna()
    tm.assert_numpy_array_equal(mask, np.array([False]))

    mask = result.isna()
    tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True]))


@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_datetime_cut_roundtrip(tz):
    # see gh-19891
    ser = Series(date_range("20180101", periods=3, tz=tz))
    result, result_bins = cut(ser, 2, retbins=True)

    expected = cut(ser, result_bins)
    tm.assert_series_equal(result, expected)

    expected_bins = DatetimeIndex(
        ["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"]
    )
    expected_bins = expected_bins.tz_localize(tz)
    tm.assert_index_equal(result_bins, expected_bins)


def test_timedelta_cut_roundtrip():
    # see gh-19891
    ser = Series(timedelta_range("1day", periods=3))
    result, result_bins = cut(ser, 2, retbins=True)

    expected = cut(ser, result_bins)
    tm.assert_series_equal(result, expected)

    expected_bins = TimedeltaIndex(
        ["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"]
    )
    tm.assert_index_equal(result_bins, expected_bins)
@@ -0,0 +1,964 @@
import numpy as np
from numpy import nan
import pytest

import pandas as pd
from pandas import DataFrame, lreshape, melt, wide_to_long
import pandas.util.testing as tm


class TestMelt:
    def setup_method(self, method):
        self.df = tm.makeTimeDataFrame()[:10]
        self.df["id1"] = (self.df["A"] > 0).astype(np.int64)
        self.df["id2"] = (self.df["B"] > 0).astype(np.int64)

        self.var_name = "var"
        self.value_name = "val"

        self.df1 = pd.DataFrame(
            [
                [1.067683, -1.110463, 0.20867],
                [-1.321405, 0.368915, -1.055342],
                [-0.807333, 0.08298, -0.873361],
            ]
        )
        self.df1.columns = [list("ABC"), list("abc")]
        self.df1.columns.names = ["CAP", "low"]

    def test_top_level_method(self):
        result = melt(self.df)
        assert result.columns.tolist() == ["variable", "value"]

    def test_method_signatures(self):
        tm.assert_frame_equal(self.df.melt(), melt(self.df))

        tm.assert_frame_equal(
            self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]),
            melt(self.df, id_vars=["id1", "id2"], value_vars=["A", "B"]),
        )

        tm.assert_frame_equal(
            self.df.melt(var_name=self.var_name, value_name=self.value_name),
            melt(self.df, var_name=self.var_name, value_name=self.value_name),
        )

        tm.assert_frame_equal(self.df1.melt(col_level=0), melt(self.df1, col_level=0))

    def test_default_col_names(self):
        result = self.df.melt()
        assert result.columns.tolist() == ["variable", "value"]

        result1 = self.df.melt(id_vars=["id1"])
        assert result1.columns.tolist() == ["id1", "variable", "value"]

        result2 = self.df.melt(id_vars=["id1", "id2"])
        assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]

    def test_value_vars(self):
        result3 = self.df.melt(id_vars=["id1", "id2"], value_vars="A")
        assert len(result3) == 10

        result4 = self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"])
        expected4 = DataFrame(
            {
                "id1": self.df["id1"].tolist() * 2,
                "id2": self.df["id2"].tolist() * 2,
                "variable": ["A"] * 10 + ["B"] * 10,
                "value": (self.df["A"].tolist() + self.df["B"].tolist()),
            },
            columns=["id1", "id2", "variable", "value"],
        )
        tm.assert_frame_equal(result4, expected4)

    def test_value_vars_types(self):
        # GH 15348
        expected = DataFrame(
            {
                "id1": self.df["id1"].tolist() * 2,
                "id2": self.df["id2"].tolist() * 2,
                "variable": ["A"] * 10 + ["B"] * 10,
                "value": (self.df["A"].tolist() + self.df["B"].tolist()),
            },
            columns=["id1", "id2", "variable", "value"],
        )

        for type_ in (tuple, list, np.array):
            result = self.df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B")))
            tm.assert_frame_equal(result, expected)

    def test_vars_work_with_multiindex(self):
        expected = DataFrame(
            {
                ("A", "a"): self.df1[("A", "a")],
                "CAP": ["B"] * len(self.df1),
                "low": ["b"] * len(self.df1),
                "value": self.df1[("B", "b")],
            },
            columns=[("A", "a"), "CAP", "low", "value"],
        )

        result = self.df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")])
        tm.assert_frame_equal(result, expected)

    def test_single_vars_work_with_multiindex(self):
        expected = DataFrame(
            {
                "A": {0: 1.067683, 1: -1.321405, 2: -0.807333},
                "CAP": {0: "B", 1: "B", 2: "B"},
                "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
            }
        )
        result = self.df1.melt(["A"], ["B"], col_level=0)
        tm.assert_frame_equal(result, expected)

    def test_tuple_vars_fail_with_multiindex(self):
        # melt should fail with an informative error message if
        # the columns have a MultiIndex and a tuple is passed
        # for id_vars or value_vars.
        tuple_a = ("A", "a")
        list_a = [tuple_a]
        tuple_b = ("B", "b")
        list_b = [tuple_b]

        msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex"
        for id_vars, value_vars in (
            (tuple_a, list_b),
            (list_a, tuple_b),
            (tuple_a, tuple_b),
        ):
            with pytest.raises(ValueError, match=msg):
                self.df1.melt(id_vars=id_vars, value_vars=value_vars)

    def test_custom_var_name(self):
        result5 = self.df.melt(var_name=self.var_name)
        assert result5.columns.tolist() == ["var", "value"]

        result6 = self.df.melt(id_vars=["id1"], var_name=self.var_name)
        assert result6.columns.tolist() == ["id1", "var", "value"]

        result7 = self.df.melt(id_vars=["id1", "id2"], var_name=self.var_name)
        assert result7.columns.tolist() == ["id1", "id2", "var", "value"]

        result8 = self.df.melt(
            id_vars=["id1", "id2"], value_vars="A", var_name=self.var_name
        )
        assert result8.columns.tolist() == ["id1", "id2", "var", "value"]

        result9 = self.df.melt(
            id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=self.var_name
        )
        expected9 = DataFrame(
            {
                "id1": self.df["id1"].tolist() * 2,
                "id2": self.df["id2"].tolist() * 2,
                self.var_name: ["A"] * 10 + ["B"] * 10,
                "value": (self.df["A"].tolist() + self.df["B"].tolist()),
            },
            columns=["id1", "id2", self.var_name, "value"],
        )
        tm.assert_frame_equal(result9, expected9)

    def test_custom_value_name(self):
        result10 = self.df.melt(value_name=self.value_name)
        assert result10.columns.tolist() == ["variable", "val"]

        result11 = self.df.melt(id_vars=["id1"], value_name=self.value_name)
        assert result11.columns.tolist() == ["id1", "variable", "val"]

        result12 = self.df.melt(id_vars=["id1", "id2"], value_name=self.value_name)
        assert result12.columns.tolist() == ["id1", "id2", "variable", "val"]

        result13 = self.df.melt(
            id_vars=["id1", "id2"], value_vars="A", value_name=self.value_name
        )
        assert result13.columns.tolist() == ["id1", "id2", "variable", "val"]

        result14 = self.df.melt(
            id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=self.value_name
        )
        expected14 = DataFrame(
            {
                "id1": self.df["id1"].tolist() * 2,
                "id2": self.df["id2"].tolist() * 2,
                "variable": ["A"] * 10 + ["B"] * 10,
                self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()),
            },
            columns=["id1", "id2", "variable", self.value_name],
        )
        tm.assert_frame_equal(result14, expected14)

    def test_custom_var_and_value_name(self):

        result15 = self.df.melt(var_name=self.var_name, value_name=self.value_name)
        assert result15.columns.tolist() == ["var", "val"]

        result16 = self.df.melt(
            id_vars=["id1"], var_name=self.var_name, value_name=self.value_name
        )
        assert result16.columns.tolist() == ["id1", "var", "val"]

        result17 = self.df.melt(
            id_vars=["id1", "id2"], var_name=self.var_name, value_name=self.value_name
        )
        assert result17.columns.tolist() == ["id1", "id2", "var", "val"]

        result18 = self.df.melt(
            id_vars=["id1", "id2"],
            value_vars="A",
            var_name=self.var_name,
            value_name=self.value_name,
        )
        assert result18.columns.tolist() == ["id1", "id2", "var", "val"]

        result19 = self.df.melt(
            id_vars=["id1", "id2"],
            value_vars=["A", "B"],
            var_name=self.var_name,
            value_name=self.value_name,
        )
        expected19 = DataFrame(
            {
                "id1": self.df["id1"].tolist() * 2,
                "id2": self.df["id2"].tolist() * 2,
                self.var_name: ["A"] * 10 + ["B"] * 10,
                self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()),
            },
            columns=["id1", "id2", self.var_name, self.value_name],
        )
        tm.assert_frame_equal(result19, expected19)

        df20 = self.df.copy()
        df20.columns.name = "foo"
        result20 = df20.melt()
        assert result20.columns.tolist() == ["foo", "value"]

    def test_col_level(self):
        res1 = self.df1.melt(col_level=0)
        res2 = self.df1.melt(col_level="CAP")
        assert res1.columns.tolist() == ["CAP", "value"]
        assert res2.columns.tolist() == ["CAP", "value"]

    def test_multiindex(self):
        res = self.df1.melt()
        assert res.columns.tolist() == ["CAP", "low", "value"]

    @pytest.mark.parametrize(
        "col",
        [
            pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")),
            pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
            pd.Series([0, 1, 0, 0, 0]),
        ],
    )
    def test_pandas_dtypes(self, col):
        # GH 15785
        df = DataFrame(
            {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col}
        )
        expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True)
        result = melt(
            df, id_vars=["klass", "col"], var_name="attribute", value_name="value"
        )
        expected = DataFrame(
            {
                0: list(range(5)) * 2,
                1: pd.concat([col] * 2, ignore_index=True),
                2: ["attr1"] * 5 + ["attr2"] * 5,
                3: expected_value,
            }
        )
        expected.columns = ["klass", "col", "attribute", "value"]
        tm.assert_frame_equal(result, expected)

    def test_melt_missing_columns_raises(self):
        # GH-23575
        # This test is to ensure that pandas raises an error if melting is
        # attempted with column names absent from the dataframe

        # Generate data
        df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd"))

        # Try to melt with missing `value_vars` column name
        msg = "The following '{Var}' are not present in the DataFrame: {Col}"
        with pytest.raises(
            KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]")
        ):
            df.melt(["a", "b"], ["C", "d"])

        # Try to melt with missing `id_vars` column name
        with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")):
            df.melt(["A", "b"], ["c", "d"])

        # Multiple missing
        with pytest.raises(
            KeyError,
            match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"),
        ):
            df.melt(["a", "b", "not_here", "or_there"], ["c", "d"])

        # Multiindex melt fails if column is missing from multilevel melt
        multi = df.copy()
        multi.columns = [list("ABCD"), list("abcd")]
        with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")):
            multi.melt([("E", "a")], [("B", "b")])
        # Multiindex fails if column is missing from single level melt
        with pytest.raises(
            KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]")
        ):
            multi.melt(["A"], ["F"], col_level=0)


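# lreshape's ``spec`` dict maps each long-format column to the list of wide
# columns stacked into it, e.g. {"visitdt": ["visitdt1", "visitdt2", ...]};
# dropna controls whether rows with missing stacked values are dropped.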
class TestLreshape:
    def test_pairs(self):
        data = {
            "birthdt": [
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
            ],
            "birthwt": [1766, 3301, 1454, 3139, 4133],
            "id": [101, 102, 103, 104, 105],
            "sex": ["Male", "Female", "Female", "Female", "Female"],
            "visitdt1": [
                "11jan2009",
                "22dec2008",
                "04jan2009",
                "29dec2008",
                "20jan2009",
            ],
            "visitdt2": ["21jan2009", nan, "22jan2009", "31dec2008", "03feb2009"],
            "visitdt3": ["05feb2009", nan, nan, "02jan2009", "15feb2009"],
            "wt1": [1823, 3338, 1549, 3298, 4306],
            "wt2": [2011.0, nan, 1892.0, 3338.0, 4575.0],
            "wt3": [2293.0, nan, nan, 3377.0, 4805.0],
        }

        df = DataFrame(data)

        spec = {
            "visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 4)],
            "wt": ["wt{i:d}".format(i=i) for i in range(1, 4)],
        }
        result = lreshape(df, spec)

        exp_data = {
            "birthdt": [
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
                "08jan2009",
                "30dec2008",
                "21dec2008",
                "11jan2009",
                "08jan2009",
                "21dec2008",
                "11jan2009",
            ],
            "birthwt": [
                1766,
                3301,
                1454,
                3139,
                4133,
                1766,
                1454,
                3139,
                4133,
                1766,
                3139,
                4133,
            ],
            "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
            "sex": [
                "Male",
                "Female",
                "Female",
                "Female",
                "Female",
                "Male",
                "Female",
                "Female",
                "Female",
                "Male",
                "Female",
                "Female",
            ],
            "visitdt": [
                "11jan2009",
                "22dec2008",
                "04jan2009",
                "29dec2008",
                "20jan2009",
                "21jan2009",
                "22jan2009",
                "31dec2008",
                "03feb2009",
                "05feb2009",
                "02jan2009",
                "15feb2009",
            ],
            "wt": [
                1823.0,
                3338.0,
                1549.0,
                3298.0,
                4306.0,
                2011.0,
                1892.0,
                3338.0,
                4575.0,
                2293.0,
                3377.0,
                4805.0,
            ],
        }
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        result = lreshape(df, spec, dropna=False)
        exp_data = {
            "birthdt": [
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
            ],
            "birthwt": [
                1766,
                3301,
                1454,
                3139,
                4133,
                1766,
                3301,
                1454,
                3139,
                4133,
                1766,
                3301,
                1454,
                3139,
                4133,
            ],
            "id": [
                101,
                102,
                103,
                104,
                105,
                101,
                102,
                103,
                104,
                105,
                101,
                102,
                103,
                104,
                105,
            ],
            "sex": [
                "Male",
                "Female",
                "Female",
                "Female",
                "Female",
                "Male",
                "Female",
                "Female",
                "Female",
                "Female",
                "Male",
                "Female",
                "Female",
                "Female",
                "Female",
            ],
            "visitdt": [
                "11jan2009",
                "22dec2008",
                "04jan2009",
                "29dec2008",
                "20jan2009",
                "21jan2009",
                nan,
                "22jan2009",
                "31dec2008",
                "03feb2009",
                "05feb2009",
                nan,
                nan,
                "02jan2009",
                "15feb2009",
            ],
            "wt": [
                1823.0,
                3338.0,
                1549.0,
                3298.0,
                4306.0,
                2011.0,
                nan,
                1892.0,
                3338.0,
                4575.0,
                2293.0,
                nan,
                nan,
                3377.0,
                4805.0,
            ],
        }
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        spec = {
            "visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 3)],
            "wt": ["wt{i:d}".format(i=i) for i in range(1, 4)],
        }
        msg = "All column lists must be same length"
        with pytest.raises(ValueError, match=msg):
            lreshape(df, spec)


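# wide_to_long(df, stubnames, i, j) stacks every column that starts with a
# stub (e.g. "A1970"/"A1980" for stub "A"); the stripped suffix becomes the
# new ``j`` index level, and ``i`` must uniquely identify the input rows.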
class TestWideToLong:
    def test_simple(self):
        np.random.seed(123)
        x = np.random.randn(3)
        df = pd.DataFrame(
            {
                "A1970": {0: "a", 1: "b", 2: "c"},
                "A1980": {0: "d", 1: "e", 2: "f"},
                "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
                "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
                "X": dict(zip(range(3), x)),
            }
        )
        df["id"] = df.index
        exp_data = {
            "X": x.tolist() + x.tolist(),
            "A": ["a", "b", "c", "d", "e", "f"],
            "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
            "year": [1970, 1970, 1970, 1980, 1980, 1980],
            "id": [0, 1, 2, 0, 1, 2],
        }
        expected = DataFrame(exp_data)
        expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
        result = wide_to_long(df, ["A", "B"], i="id", j="year")
        tm.assert_frame_equal(result, expected)

    def test_stubs(self):
        # GH9204
        df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
        df.columns = ["id", "inc1", "inc2", "edu1", "edu2"]
        stubs = ["inc", "edu"]

        # TODO: unused?
        df_long = pd.wide_to_long(df, stubs, i="id", j="age")  # noqa

        assert stubs == ["inc", "edu"]

    def test_separating_character(self):
        # GH14779
        np.random.seed(123)
        x = np.random.randn(3)
        df = pd.DataFrame(
            {
                "A.1970": {0: "a", 1: "b", 2: "c"},
                "A.1980": {0: "d", 1: "e", 2: "f"},
                "B.1970": {0: 2.5, 1: 1.2, 2: 0.7},
                "B.1980": {0: 3.2, 1: 1.3, 2: 0.1},
                "X": dict(zip(range(3), x)),
            }
        )
        df["id"] = df.index
        exp_data = {
            "X": x.tolist() + x.tolist(),
            "A": ["a", "b", "c", "d", "e", "f"],
            "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
            "year": [1970, 1970, 1970, 1980, 1980, 1980],
            "id": [0, 1, 2, 0, 1, 2],
        }
        expected = DataFrame(exp_data)
        expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
        result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
        tm.assert_frame_equal(result, expected)

    def test_escapable_characters(self):
        np.random.seed(123)
        x = np.random.randn(3)
        df = pd.DataFrame(
            {
                "A(quarterly)1970": {0: "a", 1: "b", 2: "c"},
                "A(quarterly)1980": {0: "d", 1: "e", 2: "f"},
                "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7},
                "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1},
                "X": dict(zip(range(3), x)),
            }
        )
        df["id"] = df.index
        exp_data = {
            "X": x.tolist() + x.tolist(),
            "A(quarterly)": ["a", "b", "c", "d", "e", "f"],
            "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
            "year": [1970, 1970, 1970, 1980, 1980, 1980],
            "id": [0, 1, 2, 0, 1, 2],
        }
        expected = DataFrame(exp_data)
        expected = expected.set_index(["id", "year"])[
            ["X", "A(quarterly)", "B(quarterly)"]
        ]
        result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year")
        tm.assert_frame_equal(result, expected)

    def test_unbalanced(self):
        # test that we can have a varying number of time variables
        df = pd.DataFrame(
            {
                "A2010": [1.0, 2.0],
                "A2011": [3.0, 4.0],
                "B2010": [5.0, 6.0],
                "X": ["X1", "X2"],
            }
        )
        df["id"] = df.index
        exp_data = {
            "X": ["X1", "X1", "X2", "X2"],
            "A": [1.0, 3.0, 2.0, 4.0],
            "B": [5.0, np.nan, 6.0, np.nan],
            "id": [0, 0, 1, 1],
            "year": [2010, 2011, 2010, 2011],
        }
        expected = pd.DataFrame(exp_data)
        expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
        result = wide_to_long(df, ["A", "B"], i="id", j="year")
        tm.assert_frame_equal(result, expected)

    def test_character_overlap(self):
        # Test we handle overlapping characters in both id_vars and value_vars
        df = pd.DataFrame(
            {
                "A11": ["a11", "a22", "a33"],
                "A12": ["a21", "a22", "a23"],
                "B11": ["b11", "b12", "b13"],
                "B12": ["b21", "b22", "b23"],
                "BB11": [1, 2, 3],
                "BB12": [4, 5, 6],
                "BBBX": [91, 92, 93],
                "BBBZ": [91, 92, 93],
            }
        )
        df["id"] = df.index
        expected = pd.DataFrame(
            {
                "BBBX": [91, 92, 93, 91, 92, 93],
                "BBBZ": [91, 92, 93, 91, 92, 93],
                "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
                "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
                "BB": [1, 2, 3, 4, 5, 6],
                "id": [0, 1, 2, 0, 1, 2],
                "year": [11, 11, 11, 12, 12, 12],
            }
        )
        expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]]
        result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
        tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))

    def test_invalid_separator(self):
        # if an invalid separator is supplied, an empty data frame is returned
        sep = "nope!"
        df = pd.DataFrame(
            {
                "A2010": [1.0, 2.0],
                "A2011": [3.0, 4.0],
                "B2010": [5.0, 6.0],
                "X": ["X1", "X2"],
            }
        )
        df["id"] = df.index
        exp_data = {
            "X": "",
            "A2010": [],
            "A2011": [],
            "B2010": [],
            "id": [],
            "year": [],
            "A": [],
            "B": [],
        }
        expected = pd.DataFrame(exp_data).astype({"year": "int"})
        expected = expected.set_index(["id", "year"])[
            ["X", "A2010", "A2011", "B2010", "A", "B"]
        ]
        expected.index.set_levels([0, 1], level=0, inplace=True)
        result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep)
        tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))

    def test_num_string_disambiguation(self):
        # Test that we can disambiguate number value_vars from
        # string value_vars
        df = pd.DataFrame(
            {
                "A11": ["a11", "a22", "a33"],
                "A12": ["a21", "a22", "a23"],
                "B11": ["b11", "b12", "b13"],
                "B12": ["b21", "b22", "b23"],
                "BB11": [1, 2, 3],
                "BB12": [4, 5, 6],
                "Arating": [91, 92, 93],
                "Arating_old": [91, 92, 93],
            }
        )
        df["id"] = df.index
        expected = pd.DataFrame(
            {
                "Arating": [91, 92, 93, 91, 92, 93],
                "Arating_old": [91, 92, 93, 91, 92, 93],
                "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
                "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
                "BB": [1, 2, 3, 4, 5, 6],
                "id": [0, 1, 2, 0, 1, 2],
                "year": [11, 11, 11, 12, 12, 12],
            }
        )
        expected = expected.set_index(["id", "year"])[
            ["Arating", "Arating_old", "A", "B", "BB"]
        ]
        result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
        tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))

    def test_invalid_suffixtype(self):
        # If all stub names end with a string, but a numeric suffix is
        # assumed, an empty data frame is returned
        df = pd.DataFrame(
            {
                "Aone": [1.0, 2.0],
                "Atwo": [3.0, 4.0],
                "Bone": [5.0, 6.0],
                "X": ["X1", "X2"],
            }
        )
        df["id"] = df.index
        exp_data = {
            "X": "",
            "Aone": [],
            "Atwo": [],
            "Bone": [],
            "id": [],
            "year": [],
            "A": [],
            "B": [],
        }
        expected = pd.DataFrame(exp_data).astype({"year": "int"})

        expected = expected.set_index(["id", "year"])
        expected.index.set_levels([0, 1], level=0, inplace=True)
        result = wide_to_long(df, ["A", "B"], i="id", j="year")
        tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))

    def test_multiple_id_columns(self):
        # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
        df = pd.DataFrame(
            {
                "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3],
                "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3],
                "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
                "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9],
            }
        )
        expected = pd.DataFrame(
            {
                "ht": [
                    2.8,
                    3.4,
                    2.9,
                    3.8,
                    2.2,
                    2.9,
                    2.0,
                    3.2,
                    1.8,
                    2.8,
                    1.9,
                    2.4,
                    2.2,
                    3.3,
                    2.3,
                    3.4,
                    2.1,
                    2.9,
                ],
                "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
                "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
                "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
            }
        )
        expected = expected.set_index(["famid", "birth", "age"])[["ht"]]
        result = wide_to_long(df, "ht", i=["famid", "birth"], j="age")
        tm.assert_frame_equal(result, expected)

    def test_non_unique_idvars(self):
        # GH16382
        # Raise an error message if non-unique id vars (i) are passed
        df = pd.DataFrame(
            {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]}
        )
        msg = "the id variables need to uniquely identify each row"
        with pytest.raises(ValueError, match=msg):
            wide_to_long(df, ["A_A", "B_B"], i="x", j="colname")

    def test_cast_j_int(self):
        df = pd.DataFrame(
            {
                "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"],
                "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"],
                "actor_fb_likes_1": [1000.0, 40000.0, 11000.0],
                "actor_fb_likes_2": [936.0, 5000.0, 393.0],
                "title": ["Avatar", "Pirates of the Caribbean", "Spectre"],
            }
        )

        expected = pd.DataFrame(
            {
                "actor": [
                    "CCH Pounder",
                    "Johnny Depp",
                    "Christoph Waltz",
                    "Joel David Moore",
                    "Orlando Bloom",
                    "Rory Kinnear",
                ],
                "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
                "num": [1, 1, 1, 2, 2, 2],
                "title": [
                    "Avatar",
                    "Pirates of the Caribbean",
                    "Spectre",
                    "Avatar",
                    "Pirates of the Caribbean",
                    "Spectre",
                ],
            }
        ).set_index(["title", "num"])
        result = wide_to_long(
            df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_"
        )

        tm.assert_frame_equal(result, expected)

    def test_identical_stubnames(self):
        df = pd.DataFrame(
            {
                "A2010": [1.0, 2.0],
                "A2011": [3.0, 4.0],
                "B2010": [5.0, 6.0],
                "A": ["X1", "X2"],
            }
        )
        msg = "stubname can't be identical to a column name"
        with pytest.raises(ValueError, match=msg):
            wide_to_long(df, ["A", "B"], i="A", j="colname")

    def test_nonnumeric_suffix(self):
        df = pd.DataFrame(
            {
                "treatment_placebo": [1.0, 2.0],
                "treatment_test": [3.0, 4.0],
                "result_placebo": [5.0, 6.0],
                "A": ["X1", "X2"],
            }
        )
        expected = pd.DataFrame(
            {
                "A": ["X1", "X1", "X2", "X2"],
                "colname": ["placebo", "test", "placebo", "test"],
                "result": [5.0, np.nan, 6.0, np.nan],
                "treatment": [1.0, 3.0, 2.0, 4.0],
            }
        )
        expected = expected.set_index(["A", "colname"])
        result = wide_to_long(
            df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_"
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_suffix(self):
        df = pd.DataFrame(
            {
                "A": ["X1", "X2"],
                "result_1": [0, 9],
                "result_foo": [5.0, 6.0],
                "treatment_1": [1.0, 2.0],
                "treatment_foo": [3.0, 4.0],
            }
        )
        expected = pd.DataFrame(
            {
                "A": ["X1", "X2", "X1", "X2"],
                "colname": ["1", "1", "foo", "foo"],
                "result": [0.0, 9.0, 5.0, 6.0],
                "treatment": [1.0, 2.0, 3.0, 4.0],
            }
        ).set_index(["A", "colname"])
        result = wide_to_long(
            df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_"
        )
        tm.assert_frame_equal(result, expected)

    def test_float_suffix(self):
        df = pd.DataFrame(
            {
                "treatment_1.1": [1.0, 2.0],
                "treatment_2.1": [3.0, 4.0],
                "result_1.2": [5.0, 6.0],
                "result_1": [0, 9],
                "A": ["X1", "X2"],
            }
        )
        expected = pd.DataFrame(
            {
                "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"],
                "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
                "result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
                "treatment": [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0],
            }
        )
        expected = expected.set_index(["A", "colname"])
        result = wide_to_long(
            df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_"
        )
        tm.assert_frame_equal(result, expected)

    def test_col_substring_of_stubname(self):
        # GH22468
        # Don't raise ValueError when a column name is a substring
        # of a stubname that's been passed as a string
        wide_data = {
            "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
            "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
            "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
            "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
            "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67},
        }
        wide_df = pd.DataFrame.from_dict(wide_data)
        expected = pd.wide_to_long(
            wide_df, stubnames=["PA"], i=["node_id", "A"], j="time"
        )
        result = pd.wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time")
        tm.assert_frame_equal(result, expected)
2449 venv/lib/python3.6/site-packages/pandas/tests/reshape/test_pivot.py (new file; diff suppressed because it is too large)
@@ -0,0 +1,238 @@
import os

import numpy as np
import pytest

from pandas import (
    Categorical,
    DatetimeIndex,
    Interval,
    IntervalIndex,
    NaT,
    Series,
    TimedeltaIndex,
    Timestamp,
    cut,
    date_range,
    isna,
    qcut,
    timedelta_range,
)
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.algorithms import quantile
import pandas.util.testing as tm

from pandas.tseries.offsets import Day, Nano


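# qcut() bins by sample quantiles rather than equal-width edges, so each of
# the four bins requested below should hold roughly a quarter of the data.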
def test_qcut():
    arr = np.random.randn(1000)

    # We store the bins as an Index that has been rounded,
    # so comparisons are a bit tricky.
    labels, bins = qcut(arr, 4, retbins=True)
    ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])

    result = labels.categories.left.values
    assert np.allclose(result, ex_bins[:-1], atol=1e-2)

    result = labels.categories.right.values
    assert np.allclose(result, ex_bins[1:], atol=1e-2)

    ex_levels = cut(arr, ex_bins, include_lowest=True)
    tm.assert_categorical_equal(labels, ex_levels)


def test_qcut_bounds():
    arr = np.random.randn(1000)

    factor = qcut(arr, 10, labels=False)
    assert len(np.unique(factor)) == 10


def test_qcut_specify_quantiles():
    arr = np.random.randn(100)
    factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0])

    expected = qcut(arr, 4)
    tm.assert_categorical_equal(factor, expected)


def test_qcut_all_bins_same():
    with pytest.raises(ValueError, match="edges.*unique"):
        qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)


def test_qcut_include_lowest():
    values = np.arange(10)
    ii = qcut(values, 4)

    ex_levels = IntervalIndex(
        [
            Interval(-0.001, 2.25),
            Interval(2.25, 4.5),
            Interval(4.5, 6.75),
            Interval(6.75, 9),
        ]
    )
    tm.assert_index_equal(ii.categories, ex_levels)


def test_qcut_nas():
    arr = np.random.randn(100)
    arr[:20] = np.nan

    result = qcut(arr, 4)
    assert isna(result[:20]).all()


def test_qcut_index():
    result = qcut([0, 2], 2)
    intervals = [Interval(-0.001, 1), Interval(1, 2)]

    expected = Categorical(intervals, ordered=True)
    tm.assert_categorical_equal(result, expected)


def test_qcut_binning_issues(datapath):
    # see gh-1978, gh-1979
    cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
    arr = np.loadtxt(cut_file)
    result = qcut(arr, 20)

    starts = []
    ends = []

    for lev in np.unique(result):
        s = lev.left
        e = lev.right
        assert s != e

        starts.append(float(s))
        ends.append(float(e))

    for (sp, sn), (ep, en) in zip(
        zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:])
    ):
        assert sp < sn
        assert ep < en
        assert ep <= sn


def test_qcut_return_intervals():
    ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
    res = qcut(ser, [0, 0.333, 0.666, 1])

    exp_levels = np.array(
        [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
    )
    exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
    tm.assert_series_equal(res, exp)


@pytest.mark.parametrize(
|
||||
"kwargs,msg",
|
||||
[
|
||||
(dict(duplicates="drop"), None),
|
||||
(dict(), "Bin edges must be unique"),
|
||||
(dict(duplicates="raise"), "Bin edges must be unique"),
|
||||
(dict(duplicates="foo"), "invalid value for 'duplicates' parameter"),
|
||||
],
|
||||
)
|
||||
def test_qcut_duplicates_bin(kwargs, msg):
|
||||
# see gh-7751
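    # The 0% and 33% quantile edges both fall on 0, so with
    # duplicates="drop" the duplicate edge collapses, leaving two bins.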
    values = [0, 0, 0, 0, 1, 2, 3]

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            qcut(values, 3, **kwargs)
    else:
        result = qcut(values, 3, **kwargs)
        expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
        tm.assert_index_equal(result.categories, expected)


@pytest.mark.parametrize(
    "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
)
@pytest.mark.parametrize("length", [1, 2])
@pytest.mark.parametrize("labels", [None, False])
def test_single_quantile(data, start, end, length, labels):
    # see gh-15431
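    # With a single quantile, every value lands in one bin whose left
    # edge is nudged just below the data value so the value is included.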
    ser = Series([data] * length)
    result = qcut(ser, 1, labels=labels)

    if labels is None:
        intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
        expected = Series(intervals).astype(CDT(ordered=True))
    else:
        expected = Series([0] * length)

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "ser",
    [
        Series(DatetimeIndex(["20180101", NaT, "20180103"])),
        Series(TimedeltaIndex(["0 days", NaT, "2 days"])),
    ],
    ids=lambda x: str(x.dtype),
)
def test_qcut_nat(ser):
    # see gh-19768
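    # For datetime-like data the left edge of the first bin is nudged
    # back by one nanosecond so the minimum is included; NaT becomes a
    # missing interval.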
    intervals = IntervalIndex.from_tuples(
        [(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])]
    )
    expected = Series(Categorical(intervals, ordered=True))

    result = qcut(ser, 2)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
def test_datetime_tz_qcut(bins):
    # see gh-19872
    tz = "US/Eastern"
    ser = Series(date_range("20130101", periods=3, tz=tz))

    result = qcut(ser, bins)
    expected = Series(
        IntervalIndex(
            [
                Interval(
                    Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
                    Timestamp("2013-01-01 16:00:00", tz=tz),
                ),
                Interval(
                    Timestamp("2013-01-01 16:00:00", tz=tz),
                    Timestamp("2013-01-02 08:00:00", tz=tz),
                ),
                Interval(
                    Timestamp("2013-01-02 08:00:00", tz=tz),
                    Timestamp("2013-01-03 00:00:00", tz=tz),
                ),
            ]
        )
    ).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "arg,expected_bins",
    [
        [
            timedelta_range("1day", periods=3),
            TimedeltaIndex(["1 days", "2 days", "3 days"]),
        ],
        [
            date_range("20180101", periods=3),
            DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
        ],
    ],
)
def test_date_like_qcut_bins(arg, expected_bins):
    # see gh-19891
    ser = Series(arg)
    result, result_bins = qcut(ser, 2, retbins=True)
    tm.assert_index_equal(result_bins, expected_bins)
@@ -0,0 +1,653 @@
from collections import OrderedDict

import numpy as np
from numpy import nan
import pytest

from pandas.core.dtypes.common import is_integer_dtype

import pandas as pd
from pandas import Categorical, DataFrame, Index, Series, get_dummies
from pandas.core.sparse.api import SparseArray, SparseDtype
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
class TestGetDummies:
    @pytest.fixture
    def df(self):
        return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]})

    @pytest.fixture(params=["uint8", "i8", np.float64, bool, None])
    def dtype(self, request):
        return np.dtype(request.param)

    @pytest.fixture(params=["dense", "sparse"])
    def sparse(self, request):
        # params are strings to simplify reading test results,
        # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
        return request.param == "sparse"

    def effective_dtype(self, dtype):
        if dtype is None:
            return np.uint8
        return dtype

    def test_raises_on_dtype_object(self, df):
        with pytest.raises(ValueError):
            get_dummies(df, dtype="object")

    def test_basic(self, sparse, dtype):
        s_list = list("abc")
        s_series = Series(s_list)
        s_series_index = Series(s_list, list("ABC"))

        expected = DataFrame(
            {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
            dtype=self.effective_dtype(dtype),
        )
        if sparse:
            expected = expected.apply(pd.SparseArray, fill_value=0.0)
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)

        expected.index = list("ABC")
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)

    def test_basic_types(self, sparse, dtype):
        # GH 10531
        s_list = list("abc")
        s_series = Series(s_list)
        s_df = DataFrame(
            {"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]}
        )

        expected = DataFrame(
            {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
            dtype=self.effective_dtype(dtype),
            columns=list("abc"),
        )
        if sparse:
            if is_integer_dtype(dtype):
                fill_value = 0
            elif dtype == bool:
                fill_value = False
            else:
                fill_value = 0.0

            expected = expected.apply(SparseArray, fill_value=fill_value)
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype)
        if sparse:
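            # str(SparseDtype) includes the fill value,
            # e.g. "Sparse[uint8, 0]"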
            dtype_name = "Sparse[{}, {}]".format(
                self.effective_dtype(dtype).name, fill_value
            )
        else:
            dtype_name = self.effective_dtype(dtype).name

        expected = Series({dtype_name: 8})
        result = result.dtypes.value_counts()
        result.index = [str(i) for i in result.index]
        tm.assert_series_equal(result, expected)

        result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype)

        expected_counts = {"int64": 1, "object": 1}
        expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)

        expected = Series(expected_counts).sort_index()
        result = result.dtypes.value_counts()
        result.index = [str(i) for i in result.index]
        result = result.sort_index()
        tm.assert_series_equal(result, expected)

    def test_just_na(self, sparse):
        just_na_list = [np.nan]
        just_na_series = Series(just_na_list)
        just_na_series_index = Series(just_na_list, index=["A"])

        res_list = get_dummies(just_na_list, sparse=sparse)
        res_series = get_dummies(just_na_series, sparse=sparse)
        res_series_index = get_dummies(just_na_series_index, sparse=sparse)

        assert res_list.empty
        assert res_series.empty
        assert res_series_index.empty

        assert res_list.index.tolist() == [0]
        assert res_series.index.tolist() == [0]
        assert res_series_index.index.tolist() == ["A"]

    def test_include_na(self, sparse, dtype):
        s = ["a", "b", np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame(
            {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
        )
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame(
            {nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]},
            dtype=self.effective_dtype(dtype),
        )
        exp_na = exp_na.reindex(["a", "b", nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        if sparse:
            exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(
            Series(1, index=[0]), columns=[nan], dtype=self.effective_dtype(dtype)
        )
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)

    def test_unicode(self, sparse):
        # See GH 6885 - get_dummies chokes on unicode values
        import unicodedata

        e = "e"
        eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
        s = [e, eacute, eacute]
        res = get_dummies(s, prefix="letter", sparse=sparse)
        exp = DataFrame(
            {"letter_e": [1, 0, 0], "letter_{eacute}".format(eacute=eacute): [0, 1, 1]},
            dtype=np.uint8,
        )
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(res, exp)

    def test_dataframe_dummies_all_obj(self, df, sparse):
        df = df[["A", "B"]]
        result = get_dummies(df, sparse=sparse)
        expected = DataFrame(
            {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
            dtype=np.uint8,
        )
        if sparse:
            expected = pd.DataFrame(
                {
                    "A_a": pd.SparseArray([1, 0, 1], dtype="uint8"),
                    "A_b": pd.SparseArray([0, 1, 0], dtype="uint8"),
                    "B_b": pd.SparseArray([1, 1, 0], dtype="uint8"),
                    "B_c": pd.SparseArray([0, 0, 1], dtype="uint8"),
                }
            )

        assert_frame_equal(result, expected)

    def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
        result = get_dummies(df, sparse=sparse, dtype=dtype)
        if sparse:
            arr = SparseArray
            typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype
        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "A_a": arr([1, 0, 1], dtype=typ),
                "A_b": arr([0, 1, 0], dtype=typ),
                "B_b": arr([1, 1, 0], dtype=typ),
                "B_c": arr([0, 0, 1], dtype=typ),
            }
        )
        expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_list(self, df, sparse):
        prefixes = ["from_A", "from_B"]
        result = get_dummies(df, prefix=prefixes, sparse=sparse)
        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "from_A_a": [1, 0, 1],
                "from_A_b": [0, 1, 0],
                "from_B_b": [1, 1, 0],
                "from_B_c": [0, 0, 1],
            },
            dtype=np.uint8,
        )
        expected[["C"]] = df[["C"]]
        cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
        expected = expected[["C"] + cols]

        typ = pd.SparseArray if sparse else pd.Series
        expected[cols] = expected[cols].apply(lambda x: typ(x))
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_str(self, df, sparse):
        # not that you should do this...
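        # a single string prefix is applied to every encoded column,
        # which can yield duplicate labels such as "bad_b" below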
        result = get_dummies(df, prefix="bad", sparse=sparse)
        bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
        expected = DataFrame(
            [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
            columns=["C"] + bad_columns,
            dtype=np.uint8,
        )
        expected = expected.astype({"C": np.int64})
        if sparse:
            # work around astyping & assigning with duplicate columns
            # https://github.com/pandas-dev/pandas/issues/14427
            expected = pd.concat(
                [
                    pd.Series([1, 2, 3], name="C"),
                    pd.Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"),
                    pd.Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
                    pd.Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
                    pd.Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"),
                ],
                axis=1,
            )

        assert_frame_equal(result, expected)

    def test_dataframe_dummies_subset(self, df, sparse):
        result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
        expected = DataFrame(
            {
                "B": ["b", "b", "c"],
                "C": [1, 2, 3],
                "from_A_a": [1, 0, 1],
                "from_A_b": [0, 1, 0],
            },
            dtype=np.uint8,
        )
        expected[["C"]] = df[["C"]]
        if sparse:
            cols = ["from_A_a", "from_A_b"]
            expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_sep(self, df, sparse):
        result = get_dummies(df, prefix_sep="..", sparse=sparse)
        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "A..a": [1, 0, 1],
                "A..b": [0, 1, 0],
                "B..b": [1, 1, 0],
                "B..c": [0, 0, 1],
            },
            dtype=np.uint8,
        )
        expected[["C"]] = df[["C"]]
        expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
        if sparse:
            cols = ["A..a", "A..b", "B..b", "B..c"]
            expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))

        assert_frame_equal(result, expected)

        result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
        expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
        assert_frame_equal(result, expected)

        result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
        with pytest.raises(ValueError):
            get_dummies(df, prefix=["too few"], sparse=sparse)

    def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
        with pytest.raises(ValueError):
            get_dummies(df, prefix_sep=["bad"], sparse=sparse)

    def test_dataframe_dummies_prefix_dict(self, sparse):
        prefixes = {"A": "from_A", "B": "from_B"}
        df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
        result = get_dummies(df, prefix=prefixes, sparse=sparse)

        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "from_A_a": [1, 0, 1],
                "from_A_b": [0, 1, 0],
                "from_B_b": [1, 1, 0],
                "from_B_c": [0, 0, 1],
            }
        )

        columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
        expected[columns] = expected[columns].astype(np.uint8)
        if sparse:
            expected[columns] = expected[columns].apply(lambda x: pd.SparseSeries(x))

        assert_frame_equal(result, expected)

    def test_dataframe_dummies_with_na(self, df, sparse, dtype):
        df.loc[3, :] = [np.nan, np.nan, np.nan]
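        # dummy_na=True adds an indicator column ("A_nan", "B_nan")
        # for the all-NaN row appended above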
        result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(
            axis=1
        )

        if sparse:
            arr = SparseArray
            typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype

        expected = DataFrame(
            {
                "C": [1, 2, 3, np.nan],
                "A_a": arr([1, 0, 1, 0], dtype=typ),
                "A_b": arr([0, 1, 0, 0], dtype=typ),
                "A_nan": arr([0, 0, 0, 1], dtype=typ),
                "B_b": arr([1, 1, 0, 0], dtype=typ),
                "B_c": arr([0, 0, 1, 0], dtype=typ),
                "B_nan": arr([0, 0, 0, 1], dtype=typ),
            }
        ).sort_index(axis=1)

        assert_frame_equal(result, expected)

        result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
        expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
        df["cat"] = pd.Categorical(["x", "y", "y"])
        result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
        if sparse:
            arr = SparseArray
            typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype

        expected = DataFrame(
            {
                "C": [1, 2, 3],
                "A_a": arr([1, 0, 1], dtype=typ),
                "A_b": arr([0, 1, 0], dtype=typ),
                "B_b": arr([1, 1, 0], dtype=typ),
                "B_c": arr([0, 0, 1], dtype=typ),
                "cat_x": arr([1, 0, 0], dtype=typ),
                "cat_y": arr([0, 1, 1], dtype=typ),
            }
        ).sort_index(axis=1)

        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "get_dummies_kwargs,expected",
        [
            (
                {"data": pd.DataFrame({"ä": ["a"]})},
                pd.DataFrame({"ä_a": [1]}, dtype=np.uint8),
            ),
            (
                {"data": pd.DataFrame({"x": ["ä"]})},
                pd.DataFrame({"x_ä": [1]}, dtype=np.uint8),
            ),
            (
                {"data": pd.DataFrame({"x": ["a"]}), "prefix": "ä"},
                pd.DataFrame({"ä_a": [1]}, dtype=np.uint8),
            ),
            (
                {"data": pd.DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
                pd.DataFrame({"xäa": [1]}, dtype=np.uint8),
            ),
        ],
    )
    def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
        # GH22084 pd.get_dummies incorrectly encodes unicode characters
        # in dataframe column names
        result = get_dummies(**get_dummies_kwargs)
        assert_frame_equal(result, expected)

    def test_basic_drop_first(self, sparse):
        # GH12402 Add a new parameter `drop_first` to avoid collinearity
        # Basic case
        s_list = list("abc")
        s_series = Series(s_list)
        s_series_index = Series(s_list, list("ABC"))

        expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)

        result = get_dummies(s_list, drop_first=True, sparse=sparse)
        if sparse:
            expected = expected.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(result, expected)

        result = get_dummies(s_series, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        expected.index = list("ABC")
        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

    def test_basic_drop_first_one_level(self, sparse):
        # Test the case that categorical variable only has one level.
        s_list = list("aaa")
        s_series = Series(s_list)
        s_series_index = Series(s_list, list("ABC"))

        expected = DataFrame(index=np.arange(3))

        result = get_dummies(s_list, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        result = get_dummies(s_series, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        expected = DataFrame(index=list("ABC"))
        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

    def test_basic_drop_first_NA(self, sparse):
        # Test NA handling together with drop_first
        s_NA = ["a", "b", np.nan]
        res = get_dummies(s_NA, drop_first=True, sparse=sparse)
        exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0)

        assert_frame_equal(res, exp)

        res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
        exp_na = DataFrame({"b": [0, 1, 0], nan: [0, 0, 1]}, dtype=np.uint8).reindex(
            ["b", nan], axis=1
        )
        if sparse:
            exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, sparse=sparse)
        exp_just_na = DataFrame(index=np.arange(1))
        assert_frame_equal(res_just_na, exp_just_na)

    def test_dataframe_dummies_drop_first(self, df, sparse):
        df = df[["A", "B"]]
        result = get_dummies(df, drop_first=True, sparse=sparse)
        expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
        if sparse:
            expected = expected.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
        df["cat"] = pd.Categorical(["x", "y", "y"])
        result = get_dummies(df, drop_first=True, sparse=sparse)
        expected = DataFrame(
            {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
        )
        cols = ["A_b", "B_c", "cat_y"]
        expected[cols] = expected[cols].astype(np.uint8)
        expected = expected[["C", "A_b", "B_c", "cat_y"]]
        if sparse:
            for col in cols:
                expected[col] = pd.SparseSeries(expected[col])
        assert_frame_equal(result, expected)

    def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
        df.loc[3, :] = [np.nan, np.nan, np.nan]
        result = get_dummies(
            df, dummy_na=True, drop_first=True, sparse=sparse
        ).sort_index(axis=1)
        expected = DataFrame(
            {
                "C": [1, 2, 3, np.nan],
                "A_b": [0, 1, 0, 0],
                "A_nan": [0, 0, 0, 1],
                "B_c": [0, 0, 1, 0],
                "B_nan": [0, 0, 0, 1],
            }
        )
        cols = ["A_b", "A_nan", "B_c", "B_nan"]
        expected[cols] = expected[cols].astype(np.uint8)
        expected = expected.sort_index(axis=1)
        if sparse:
            for col in cols:
                expected[col] = pd.SparseSeries(expected[col])

        assert_frame_equal(result, expected)

        result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
        expected = expected[["C", "A_b", "B_c"]]
        assert_frame_equal(result, expected)

    def test_int_int(self):
        data = Series([1, 2, 1])
        result = pd.get_dummies(data)
        expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
        tm.assert_frame_equal(result, expected)

        data = Series(pd.Categorical(["a", "b", "a"]))
        result = pd.get_dummies(data)
        expected = DataFrame(
            [[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(["a", "b"]), dtype=np.uint8
        )
        tm.assert_frame_equal(result, expected)

    def test_int_df(self, dtype):
        data = DataFrame(
            {
                "A": [1, 2, 1],
                "B": pd.Categorical(["a", "b", "a"]),
                "C": [1, 2, 1],
                "D": [1.0, 2.0, 1.0],
            }
        )
        columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"]
        expected = DataFrame(
            [[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]],
            columns=columns,
        )
        expected[columns[2:]] = expected[columns[2:]].astype(dtype)
        result = pd.get_dummies(data, columns=["A", "B"], dtype=dtype)
        tm.assert_frame_equal(result, expected)

    def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
        # GH13854
        for ordered in [False, True]:
            cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered)
            result = get_dummies(cat, dtype=dtype)

            data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype))
            cols = pd.CategoricalIndex(
                cat.categories, categories=cat.categories, ordered=ordered
            )
            expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype))

            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("sparse", [True, False])
    def test_get_dummies_dont_sparsify_all_columns(self, sparse):
        # GH18914
        df = DataFrame.from_dict(
            OrderedDict([("GDP", [1, 2]), ("Nation", ["AB", "CD"])])
        )
        df = get_dummies(df, columns=["Nation"], sparse=sparse)
        df2 = df.reindex(columns=["GDP"])

        tm.assert_frame_equal(df[["GDP"]], df2)

    def test_get_dummies_duplicate_columns(self, df):
        # GH20839
        df.columns = ["A", "A", "A"]
        result = get_dummies(df).sort_index(axis=1)

        expected = DataFrame(
            [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
            columns=["A", "A_a", "A_b", "A_b", "A_c"],
            dtype=np.uint8,
        ).sort_index(axis=1)

        expected = expected.astype({"A": np.int64})

        tm.assert_frame_equal(result, expected)

    def test_get_dummies_all_sparse(self):
        df = pd.DataFrame({"A": [1, 2]})
        result = pd.get_dummies(df, columns=["A"], sparse=True)
        dtype = SparseDtype("uint8", 0)
        expected = pd.DataFrame(
            {
                "A_1": SparseArray([1, 0], dtype=dtype),
                "A_2": SparseArray([0, 1], dtype=dtype),
            }
        )
        tm.assert_frame_equal(result, expected)


class TestCategoricalReshape:
    def test_reshaping_multi_index_categorical(self):

        cols = ["ItemA", "ItemB", "ItemC"]
        data = {c: tm.makeTimeDataFrame() for c in cols}
        df = pd.concat({c: data[c].stack() for c in data}, axis="columns")
        df.index.names = ["major", "minor"]
        df["str"] = "foo"

        dti = df.index.levels[0]

        df["category"] = df["str"].astype("category")
        result = df["category"].unstack()

        c = Categorical(["foo"] * len(dti))
        expected = DataFrame(
            {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()},
            columns=Index(list("ABCD"), name="minor"),
            index=dti,
        )
        tm.assert_frame_equal(result, expected)


class TestMakeAxisDummies:
    def test_preserve_categorical_dtype(self):
        # GH13854
        for ordered in [False, True]:
            cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered)
            midx = pd.MultiIndex(levels=[["a"], cidx], codes=[[0, 0], [0, 1]])
            df = DataFrame([[10, 11]], index=midx)

            expected = DataFrame(
                [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], index=midx, columns=cidx
            )

            from pandas.core.reshape.reshape import make_axis_dummies

            result = make_axis_dummies(df)
            tm.assert_frame_equal(result, expected)

            result = make_axis_dummies(df, transform=lambda x: x)
            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,348 @@
import numpy as np
import pytest

from pandas.core.dtypes.concat import union_categoricals

import pandas as pd
from pandas import Categorical, CategoricalIndex, Series
from pandas.util import testing as tm


class TestUnionCategoricals:
    def test_union_categorical(self):
        # GH 13361
        data = [
            (list("abc"), list("abd"), list("abcabd")),
            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
            (
                ["b", "b", np.nan, "a"],
                ["a", np.nan, "c"],
                ["b", "b", np.nan, "a", "a", np.nan, "c"],
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05"),
                pd.date_range("2014-01-06", "2014-01-07"),
                pd.date_range("2014-01-01", "2014-01-07"),
            ),
            (
                pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
                pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
                pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
            ),
            (
                pd.period_range("2014-01-01", "2014-01-05"),
                pd.period_range("2014-01-06", "2014-01-07"),
                pd.period_range("2014-01-01", "2014-01-07"),
            ),
        ]

        for a, b, combined in data:
            for box in [Categorical, CategoricalIndex, Series]:
                result = union_categoricals([box(Categorical(a)), box(Categorical(b))])
                expected = Categorical(combined)
                tm.assert_categorical_equal(result, expected, check_category_order=True)

        # new categories ordered by appearance
        s = Categorical(["x", "y", "z"])
        s2 = Categorical(["a", "b", "c"])
        result = union_categoricals([s, s2])
        expected = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
        )
        tm.assert_categorical_equal(result, expected)

        s = Categorical([0, 1.2, 2], ordered=True)
        s2 = Categorical([0, 1.2, 2], ordered=True)
        result = union_categoricals([s, s2])
        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
        tm.assert_categorical_equal(result, expected)

        # must exactly match types
        s = Categorical([0, 1.2, 2])
        s2 = Categorical([2, 3, 4])
        msg = "dtype of categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([s, s2])

        msg = "No Categoricals to union"
        with pytest.raises(ValueError, match=msg):
            union_categoricals([])

    def test_union_categoricals_nan(self):
        # GH 13759
        res = union_categoricals(
            [pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])]
        )
        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals(
            [pd.Categorical(["A", "B"]), pd.Categorical(["B", "B", np.nan])]
        )
        exp = Categorical(["A", "B", "B", "B", np.nan])
        tm.assert_categorical_equal(res, exp)

        val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT]
        val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")]

        res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)])
        exp = Categorical(
            val1 + val2,
            categories=[
                pd.Timestamp("2011-01-01"),
                pd.Timestamp("2011-03-01"),
                pd.Timestamp("2011-02-01"),
            ],
        )
        tm.assert_categorical_equal(res, exp)

        # all NaN
        res = union_categoricals(
            [
                pd.Categorical(np.array([np.nan, np.nan], dtype=object)),
                pd.Categorical(["X"]),
            ]
        )
        exp = Categorical([np.nan, np.nan, "X"])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals(
            [pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])]
        )
        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
        tm.assert_categorical_equal(res, exp)

    def test_union_categoricals_empty(self):
        # GH 13759
        res = union_categoricals([pd.Categorical([]), pd.Categorical([])])
        exp = Categorical([])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([Categorical([]), Categorical(["1"])])
        exp = Categorical(["1"])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_category(self):
        # check fastpath
        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
        c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"])
        res = union_categoricals([c1, c2])
        exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"])
        tm.assert_categorical_equal(res, exp)

    def test_union_categorical_same_categories_different_order(self):
        # https://github.com/pandas-dev/pandas/issues/19096
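        # the category sets match but their order differs; the union keeps
        # categories in order of first appearance, i.e. c1's order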
        c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"])
        c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"])
        result = union_categoricals([c1, c2])
        expected = Categorical(
            ["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]
        )
        tm.assert_categorical_equal(result, expected)

    def test_union_categoricals_ordered(self):
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        msg = "Categorical.ordered must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

        res = union_categoricals([c1, c1])
        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2])
        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

    def test_union_categoricals_ignore_order(self):
        # GH 15219
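        # ignore_order=True drops orderedness, so ordered and unordered
        # inputs (or ordered inputs with different categories) union cleanly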
        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], ordered=False)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        msg = "Categorical.ordered must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], ignore_order=False)

        res = union_categoricals([c1, c1], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c1, c1], ignore_order=False)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, np.nan, 3, 2])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

        res = union_categoricals([c1, c2], ignore_order=True)
        exp = Categorical([1, 2, 3, 1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True)
        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(res, exp)

        c1 = Categorical([1, 2, 3], ordered=True)
        c2 = Categorical([4, 5, 6], ordered=True)
        result = union_categoricals([c1, c2], ignore_order=True)
        expected = Categorical([1, 2, 3, 4, 5, 6])
        tm.assert_categorical_equal(result, expected)

        msg = "to union ordered Categoricals, all categories must be the same"
        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2], ignore_order=False)

        with pytest.raises(TypeError, match=msg):
            union_categoricals([c1, c2])

    def test_union_categoricals_sort(self):
        # GH 13846
        c1 = Categorical(["x", "y", "z"])
        c2 = Categorical(["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]
        )
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
        c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["a", "b"], categories=["c", "a", "b"])
        c2 = Categorical(["b", "c"], categories=["c", "a", "b"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
        c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["x", np.nan])
        c2 = Categorical([np.nan, "b"])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=True)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        with pytest.raises(TypeError):
            union_categoricals([c1, c2], sort_categories=True)

    def test_union_categoricals_sort_false(self):
        # GH 13846
        c1 = Categorical(["x", "y", "z"])
        c2 = Categorical(["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(
            ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
        )
        tm.assert_categorical_equal(result, expected)

        # fastpath
        c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
        c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"])
        tm.assert_categorical_equal(result, expected)

        # fastpath - skip resort
        c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
        c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["x", np.nan])
        c2 = Categorical([np.nan, "b"])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([np.nan])
        c2 = Categorical([np.nan])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([np.nan, np.nan])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical([])
        c2 = Categorical([])
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical([])
        tm.assert_categorical_equal(result, expected)

        c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
        c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
        result = union_categoricals([c1, c2], sort_categories=False)
        expected = Categorical(
            ["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True
        )
        tm.assert_categorical_equal(result, expected)

    def test_union_categorical_unwrap(self):
        # GH 14173
        c1 = Categorical(["a", "b"])
        c2 = pd.Series(["b", "c"], dtype="category")
        result = union_categoricals([c1, c2])
        expected = Categorical(["a", "b", "b", "c"])
        tm.assert_categorical_equal(result, expected)

        c2 = CategoricalIndex(c2)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        c1 = Series(c1)
        result = union_categoricals([c1, c2])
        tm.assert_categorical_equal(result, expected)

        with pytest.raises(TypeError):
            union_categoricals([c1, ["a", "b", "c"]])
@@ -0,0 +1,51 @@
import numpy as np
import pytest

from pandas import Index, date_range
from pandas.core.reshape.util import cartesian_product
import pandas.util.testing as tm


class TestCartesianProduct:
    def test_simple(self):
        x, y = list("ABC"), [1, 22]
        result1, result2 = cartesian_product([x, y])
        expected1 = np.array(["A", "A", "B", "B", "C", "C"])
        expected2 = np.array([1, 22, 1, 22, 1, 22])
        tm.assert_numpy_array_equal(result1, expected1)
        tm.assert_numpy_array_equal(result2, expected2)

    def test_datetimeindex(self):
        # regression test for GitHub issue #6439
        # make sure that the ordering on datetimeindex is consistent
        x = date_range("2000-01-01", periods=2)
        result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
        expected1 = Index([1, 1, 2, 2])
        expected2 = Index([1, 2, 1, 2])
        tm.assert_index_equal(result1, expected1)
        tm.assert_index_equal(result2, expected2)

    def test_empty(self):
        # product of empty factors
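        # any empty factor makes the whole product empty; each output
        # keeps the dtype of its corresponding input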
        X = [[], [0, 1], []]
        Y = [[], [], ["a", "b", "c"]]
        for x, y in zip(X, Y):
            expected1 = np.array([], dtype=np.asarray(x).dtype)
            expected2 = np.array([], dtype=np.asarray(y).dtype)
            result1, result2 = cartesian_product([x, y])
            tm.assert_numpy_array_equal(result1, expected1)
            tm.assert_numpy_array_equal(result2, expected2)

        # empty product (empty input):
        result = cartesian_product([])
        expected = []
        assert result == expected

    @pytest.mark.parametrize(
        "X", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]]
    )
    def test_invalid_input(self, X):
        msg = "Input must be a list-like of list-likes"

        with pytest.raises(TypeError, match=msg):
            cartesian_product(X=X)