8th day of python challenges 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions

View File

@@ -0,0 +1,862 @@
import numpy as np
from numpy.random import randn
import pytest
from pandas._libs import join as libjoin
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, concat, merge
from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal
a_ = np.array
class TestJoin:
def setup_method(self, method):
# aggregate multiple columns
self.df = DataFrame(
{
"key1": get_test_data(),
"key2": get_test_data(),
"data1": np.random.randn(N),
"data2": np.random.randn(N),
}
)
# exclude a couple keys for fun
self.df = self.df[self.df["key2"] > 1]
self.df2 = DataFrame(
{
"key1": get_test_data(n=N // 5),
"key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5),
"value": np.random.randn(N // 5),
}
)
index, data = tm.getMixedTypeDict()
self.target = DataFrame(data, index=index)
# Join on string value
self.source = DataFrame(
{"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"]
)
def test_cython_left_outer_join(self):
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
max_group = 5
ls, rs = libjoin.left_outer_join(left, right, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])
exp_ls = exp_ls.take(exp_li)
exp_ls[exp_li == -1] = -1
exp_rs = exp_rs.take(exp_ri)
exp_rs[exp_ri == -1] = -1
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_cython_right_outer_join(self):
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
max_group = 5
rs, ls = libjoin.left_outer_join(right, left, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
# 0 1 1 1
exp_li = a_(
[
0,
1,
2,
3,
4,
5,
3,
4,
5,
3,
4,
5,
# 2 2 4
6,
7,
8,
6,
7,
8,
-1,
]
)
exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])
exp_ls = exp_ls.take(exp_li)
exp_ls[exp_li == -1] = -1
exp_rs = exp_rs.take(exp_ri)
exp_rs[exp_ri == -1] = -1
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_cython_inner_join(self):
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
max_group = 5
ls, rs = libjoin.inner_join(left, right, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])
exp_ls = exp_ls.take(exp_li)
exp_ls[exp_li == -1] = -1
exp_rs = exp_rs.take(exp_ri)
exp_rs[exp_ri == -1] = -1
tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
def test_left_outer_join(self):
joined_key2 = merge(self.df, self.df2, on="key2")
_check_join(self.df, self.df2, joined_key2, ["key2"], how="left")
joined_both = merge(self.df, self.df2)
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left")
def test_right_outer_join(self):
joined_key2 = merge(self.df, self.df2, on="key2", how="right")
_check_join(self.df, self.df2, joined_key2, ["key2"], how="right")
joined_both = merge(self.df, self.df2, how="right")
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right")
def test_full_outer_join(self):
joined_key2 = merge(self.df, self.df2, on="key2", how="outer")
_check_join(self.df, self.df2, joined_key2, ["key2"], how="outer")
joined_both = merge(self.df, self.df2, how="outer")
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer")
def test_inner_join(self):
joined_key2 = merge(self.df, self.df2, on="key2", how="inner")
_check_join(self.df, self.df2, joined_key2, ["key2"], how="inner")
joined_both = merge(self.df, self.df2, how="inner")
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner")
def test_handle_overlap(self):
joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"])
assert "key1.foo" in joined
assert "key1.bar" in joined
def test_handle_overlap_arbitrary_key(self):
joined = merge(
self.df,
self.df2,
left_on="key2",
right_on="key1",
suffixes=[".foo", ".bar"],
)
assert "key1.foo" in joined
assert "key2.bar" in joined
def test_join_on(self):
target = self.target
source = self.source
merged = target.join(source, on="C")
tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False)
# join with duplicates (fix regression from DataFrame/Matrix merge)
df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
joined = df.join(df2, on="key")
expected = DataFrame(
{"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]}
)
assert_frame_equal(joined, expected)
# Test when some are missing
df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
joined = df_a.join(df_b, on="one")
joined = joined.join(df_c, on="one")
assert np.isnan(joined["two"]["c"])
assert np.isnan(joined["three"]["c"])
# merge column not p resent
with pytest.raises(KeyError, match="^'E'$"):
target.join(source, on="E")
# overlap
source_copy = source.copy()
source_copy["A"] = 0
msg = (
"You are trying to merge on float64 and object columns. If"
" you wish to proceed you should use pd.concat"
)
with pytest.raises(ValueError, match=msg):
target.join(source_copy, on="A")
def test_join_on_fails_with_different_right_index(self):
df = DataFrame(
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
)
df2 = DataFrame(
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
index=tm.makeCustomIndex(10, 2),
)
msg = (
r"len\(left_on\) must equal the number of levels in the index" ' of "right"'
)
with pytest.raises(ValueError, match=msg):
merge(df, df2, left_on="a", right_index=True)
def test_join_on_fails_with_different_left_index(self):
df = DataFrame(
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)},
index=tm.makeCustomIndex(3, 2),
)
df2 = DataFrame(
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}
)
msg = (
r"len\(right_on\) must equal the number of levels in the index" ' of "left"'
)
with pytest.raises(ValueError, match=msg):
merge(df, df2, right_on="b", left_index=True)
def test_join_on_fails_with_different_column_counts(self):
df = DataFrame(
{"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
)
df2 = DataFrame(
{"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
index=tm.makeCustomIndex(10, 2),
)
msg = r"len\(right_on\) must equal len\(left_on\)"
with pytest.raises(ValueError, match=msg):
merge(df, df2, right_on="a", left_on=["a", "b"])
@pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
def test_join_on_fails_with_wrong_object_type(self, wrong_type):
# GH12081 - original issue
# GH21220 - merging of Series and DataFrame is now allowed
# Edited test to remove the Series object from test parameters
df = DataFrame({"a": [1, 1]})
msg = "Can only merge Series or DataFrame objects, a {} was passed".format(
str(type(wrong_type))
)
with pytest.raises(TypeError, match=msg):
merge(wrong_type, df, left_on="a", right_on="a")
with pytest.raises(TypeError, match=msg):
merge(df, wrong_type, left_on="a", right_on="a")
def test_join_on_pass_vector(self):
expected = self.target.join(self.source, on="C")
del expected["C"]
join_col = self.target.pop("C")
result = self.target.join(self.source, on=join_col)
assert_frame_equal(result, expected)
def test_join_with_len0(self):
# nothing to merge
merged = self.target.join(self.source.reindex([]), on="C")
for col in self.source:
assert col in merged
assert merged[col].isna().all()
merged2 = self.target.join(self.source.reindex([]), on="C", how="inner")
tm.assert_index_equal(merged2.columns, merged.columns)
assert len(merged2) == 0
def test_join_on_inner(self):
df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])
joined = df.join(df2, on="key", how="inner")
expected = df.join(df2, on="key")
expected = expected[expected["value"].notna()]
tm.assert_series_equal(joined["key"], expected["key"], check_dtype=False)
tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
tm.assert_index_equal(joined.index, expected.index)
def test_join_on_singlekey_list(self):
df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
# corner cases
joined = df.join(df2, on=["key"])
expected = df.join(df2, on="key")
assert_frame_equal(joined, expected)
def test_join_on_series(self):
result = self.target.join(self.source["MergedA"], on="C")
expected = self.target.join(self.source[["MergedA"]], on="C")
assert_frame_equal(result, expected)
def test_join_on_series_buglet(self):
# GH #638
df = DataFrame({"a": [1, 1]})
ds = Series([2], index=[1], name="b")
result = df.join(ds, on="a")
expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
tm.assert_frame_equal(result, expected)
def test_join_index_mixed(self, join_type):
# no overlapping blocks
df1 = DataFrame(index=np.arange(10))
df1["bool"] = True
df1["string"] = "foo"
df2 = DataFrame(index=np.arange(5, 15))
df2["int"] = 1
df2["float"] = 1.0
joined = df1.join(df2, how=join_type)
expected = _join_by_hand(df1, df2, how=join_type)
assert_frame_equal(joined, expected)
joined = df2.join(df1, how=join_type)
expected = _join_by_hand(df2, df1, how=join_type)
assert_frame_equal(joined, expected)
def test_join_index_mixed_overlap(self):
df1 = DataFrame(
{"A": 1.0, "B": 2, "C": "foo", "D": True},
index=np.arange(10),
columns=["A", "B", "C", "D"],
)
assert df1["B"].dtype == np.int64
assert df1["D"].dtype == np.bool_
df2 = DataFrame(
{"A": 1.0, "B": 2, "C": "foo", "D": True},
index=np.arange(0, 10, 2),
columns=["A", "B", "C", "D"],
)
# overlap
joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
expected_columns = [
"A_one",
"B_one",
"C_one",
"D_one",
"A_two",
"B_two",
"C_two",
"D_two",
]
df1.columns = expected_columns[:4]
df2.columns = expected_columns[4:]
expected = _join_by_hand(df1, df2)
assert_frame_equal(joined, expected)
def test_join_empty_bug(self):
# generated an exception in 0.4.3
x = DataFrame()
x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")
def test_join_unconsolidated(self):
# GH #331
a = DataFrame(randn(30, 2), columns=["a", "b"])
c = Series(randn(30))
a["c"] = c
d = DataFrame(randn(30, 1), columns=["q"])
# it works!
a.join(d)
d.join(a)
def test_join_multiindex(self):
index1 = MultiIndex.from_arrays(
[["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
names=["first", "second"],
)
index2 = MultiIndex.from_arrays(
[["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
names=["first", "second"],
)
df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])
df1 = df1.sort_index(level=0)
df2 = df2.sort_index(level=0)
joined = df1.join(df2, how="outer")
ex_index = Index(index1.values).union(Index(index2.values))
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
expected.index.names = index1.names
assert_frame_equal(joined, expected)
assert joined.index.names == index1.names
df1 = df1.sort_index(level=1)
df2 = df2.sort_index(level=1)
joined = df1.join(df2, how="outer").sort_index(level=0)
ex_index = Index(index1.values).union(Index(index2.values))
expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
expected.index.names = index1.names
assert_frame_equal(joined, expected)
assert joined.index.names == index1.names
def test_join_inner_multiindex(self):
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
key2 = [
"two",
"one",
"three",
"one",
"two",
"one",
"two",
"two",
"three",
"one",
]
data = np.random.randn(len(key1))
data = DataFrame({"key1": key1, "key2": key2, "data": data})
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["first", "second"],
)
to_join = DataFrame(
np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
)
joined = data.join(to_join, on=["key1", "key2"], how="inner")
expected = merge(
data,
to_join.reset_index(),
left_on=["key1", "key2"],
right_on=["first", "second"],
how="inner",
sort=False,
)
expected2 = merge(
to_join,
data,
right_on=["key1", "key2"],
left_index=True,
how="inner",
sort=False,
)
assert_frame_equal(joined, expected2.reindex_like(joined))
expected2 = merge(
to_join,
data,
right_on=["key1", "key2"],
left_index=True,
how="inner",
sort=False,
)
expected = expected.drop(["first", "second"], axis=1)
expected.index = joined.index
assert joined.index.is_monotonic
assert_frame_equal(joined, expected)
# _assert_same_contents(expected, expected2.loc[:, expected.columns])
def test_join_hierarchical_mixed(self):
# GH 2024
df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
other_df.set_index("a", inplace=True)
# GH 9455, 12219
with tm.assert_produces_warning(UserWarning):
result = merge(new_df, other_df, left_index=True, right_index=True)
assert ("b", "mean") in result
assert "b" in result
def test_join_float64_float32(self):
a = DataFrame(randn(10, 2), columns=["a", "b"], dtype=np.float64)
b = DataFrame(randn(10, 1), columns=["c"], dtype=np.float32)
joined = a.join(b)
assert joined.dtypes["a"] == "float64"
assert joined.dtypes["b"] == "float64"
assert joined.dtypes["c"] == "float32"
a = np.random.randint(0, 5, 100).astype("int64")
b = np.random.random(100).astype("float64")
c = np.random.random(100).astype("float32")
df = DataFrame({"a": a, "b": b, "c": c})
xpdf = DataFrame({"a": a, "b": b, "c": c})
s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
rs = df.merge(s, left_on="a", right_index=True)
assert rs.dtypes["a"] == "int64"
assert rs.dtypes["b"] == "float64"
assert rs.dtypes["c"] == "float32"
assert rs.dtypes["md"] == "float32"
xp = xpdf.merge(s, left_on="a", right_index=True)
assert_frame_equal(rs, xp)
def test_join_many_non_unique_index(self):
df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
idf1 = df1.set_index(["a", "b"])
idf2 = df2.set_index(["a", "b"])
idf3 = df3.set_index(["a", "b"])
result = idf1.join([idf2, idf3], how="outer")
df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")
result = result.reset_index()
expected = expected[result.columns]
expected["a"] = expected.a.astype("int64")
expected["b"] = expected.b.astype("int64")
assert_frame_equal(result, expected)
df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
idf1 = df1.set_index(["a", "b"])
idf2 = df2.set_index(["a", "b"])
idf3 = df3.set_index(["a", "b"])
result = idf1.join([idf2, idf3], how="inner")
df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")
result = result.reset_index()
assert_frame_equal(result, expected.loc[:, result.columns])
# GH 11519
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.randn(8),
"D": np.random.randn(8),
}
)
s = Series(
np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST"
)
inner = df.join(s, how="inner")
outer = df.join(s, how="outer")
left = df.join(s, how="left")
right = df.join(s, how="right")
assert_frame_equal(inner, outer)
assert_frame_equal(inner, left)
assert_frame_equal(inner, right)
def test_join_sort(self):
left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
joined = left.join(right, on="key", sort=True)
expected = DataFrame(
{
"key": ["bar", "baz", "foo", "foo"],
"value": [2, 3, 1, 4],
"value2": ["a", "b", "c", "c"],
},
index=[1, 2, 0, 3],
)
assert_frame_equal(joined, expected)
# smoke test
joined = left.join(right, on="key", sort=False)
tm.assert_index_equal(joined.index, pd.Index(list(range(4))))
def test_join_mixed_non_unique_index(self):
# GH 12814, unorderable types in py3 with a non-unique index
df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
result = df1.join(df2)
expected = DataFrame(
{"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
index=[1, 2, 3, 3, "a"],
)
tm.assert_frame_equal(result, expected)
df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
result = df3.join(df4)
expected = DataFrame(
{"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"]
)
tm.assert_frame_equal(result, expected)
def test_join_non_unique_period_index(self):
# GH #16871
index = pd.period_range("2016-01-01", periods=16, freq="M")
df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"])
df2 = concat([df, df])
result = df.join(df2, how="inner", rsuffix="_df2")
expected = DataFrame(
np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
columns=["pnum", "pnum_df2"],
index=df2.sort_index().index,
)
tm.assert_frame_equal(result, expected)
def test_mixed_type_join_with_suffix(self):
# GH #916
df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
df.insert(0, "id", 0)
df.insert(5, "dt", "foo")
grouped = df.groupby("id")
mn = grouped.mean()
cn = grouped.count()
# it works!
mn.join(cn, rsuffix="_right")
def test_join_many(self):
df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]
joined = df_list[0].join(df_list[1:])
tm.assert_frame_equal(joined, df)
df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]]
def _check_diff_index(df_list, result, exp_index):
reindexed = [x.reindex(exp_index) for x in df_list]
expected = reindexed[0].join(reindexed[1:])
tm.assert_frame_equal(result, expected)
# different join types
joined = df_list[0].join(df_list[1:], how="outer")
_check_diff_index(df_list, joined, df.index)
joined = df_list[0].join(df_list[1:])
_check_diff_index(df_list, joined, df_list[0].index)
joined = df_list[0].join(df_list[1:], how="inner")
_check_diff_index(df_list, joined, df.index[2:8])
msg = "Joining multiple DataFrames only supported for joining on index"
with pytest.raises(ValueError, match=msg):
df_list[0].join(df_list[1:], on="a")
def test_join_many_mixed(self):
df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
df["key"] = ["foo", "bar"] * 4
df1 = df.loc[:, ["A", "B"]]
df2 = df.loc[:, ["C", "D"]]
df3 = df.loc[:, ["key"]]
result = df1.join([df2, df3])
assert_frame_equal(result, df)
def test_join_dups(self):
# joining dups
df = concat(
[
DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
DataFrame(
np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
),
],
axis=1,
)
expected = concat([df, df], axis=1)
result = df.join(df, rsuffix="_2")
result.columns = expected.columns
assert_frame_equal(result, expected)
# GH 4975, invalid join on dups
w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
dta = x.merge(y, left_index=True, right_index=True).merge(
z, left_index=True, right_index=True, how="outer"
)
dta = dta.merge(w, left_index=True, right_index=True)
expected = concat([x, y, z, w], axis=1)
expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
assert_frame_equal(dta, expected)
def test_join_multi_to_multi(self, join_type):
# GH 20475
leftindex = MultiIndex.from_product(
[list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]
)
left = DataFrame({"v1": range(12)}, index=leftindex)
rightindex = MultiIndex.from_product(
[list("abc"), list("xy")], names=["abc", "xy"]
)
right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)
result = left.join(right, on=["abc", "xy"], how=join_type)
expected = (
left.reset_index()
.merge(right.reset_index(), on=["abc", "xy"], how=join_type)
.set_index(["abc", "xy", "num"])
)
assert_frame_equal(expected, result)
msg = (
r"len\(left_on\) must equal the number of levels in the index" ' of "right"'
)
with pytest.raises(ValueError, match=msg):
left.join(right, on="xy", how=join_type)
with pytest.raises(ValueError, match=msg):
right.join(left, on=["abc", "xy"], how=join_type)
def test_join_on_tz_aware_datetimeindex(self):
# GH 23931, 26335
df1 = pd.DataFrame(
{
"date": pd.date_range(
start="2018-01-01", periods=5, tz="America/Chicago"
),
"vals": list("abcde"),
}
)
df2 = pd.DataFrame(
{
"date": pd.date_range(
start="2018-01-03", periods=5, tz="America/Chicago"
),
"vals_2": list("tuvwx"),
}
)
result = df1.join(df2.set_index("date"), on="date")
expected = df1.copy()
expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object)
assert_frame_equal(result, expected)
def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"):
# some smoke tests
for c in join_col:
assert result[c].notna().all()
left_grouped = left.groupby(join_col)
right_grouped = right.groupby(join_col)
for group_key, group in result.groupby(join_col):
l_joined = _restrict_to_columns(group, left.columns, lsuffix)
r_joined = _restrict_to_columns(group, right.columns, rsuffix)
try:
lgroup = left_grouped.get_group(group_key)
except KeyError:
if how in ("left", "inner"):
raise AssertionError(
"key {group_key!s} should not have been in the join".format(
group_key=group_key
)
)
_assert_all_na(l_joined, left.columns, join_col)
else:
_assert_same_contents(l_joined, lgroup)
try:
rgroup = right_grouped.get_group(group_key)
except KeyError:
if how in ("right", "inner"):
raise AssertionError(
"key {group_key!s} should not have been in the join".format(
group_key=group_key
)
)
_assert_all_na(r_joined, right.columns, join_col)
else:
_assert_same_contents(r_joined, rgroup)
def _restrict_to_columns(group, columns, suffix):
found = [
c for c in group.columns if c in columns or c.replace(suffix, "") in columns
]
# filter
group = group.loc[:, found]
# get rid of suffixes, if any
group = group.rename(columns=lambda x: x.replace(suffix, ""))
# put in the right order...
group = group.loc[:, columns]
return group
def _assert_same_contents(join_chunk, source):
NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
rows = {tuple(row) for row in jvalues}
assert len(rows) == len(source)
assert all(tuple(row) in rows for row in svalues)
def _assert_all_na(join_chunk, source_columns, join_col):
for c in source_columns:
if c in join_col:
continue
assert join_chunk[c].isna().all()
def _join_by_hand(a, b, how="left"):
join_index = a.index.join(b.index, how=how)
a_re = a.reindex(join_index)
b_re = b.reindex(join_index)
result_columns = a.columns.append(b.columns)
for col, s in b_re.items():
a_re[col] = s
return a_re.reindex(columns=result_columns)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,188 @@
import numpy as np
import pytest
from pandas import DataFrame
from pandas.util.testing import assert_frame_equal
@pytest.fixture
def df1():
return DataFrame(
dict(
outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4],
inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2],
v1=np.linspace(0, 1, 11),
)
)
@pytest.fixture
def df2():
return DataFrame(
dict(
outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3],
inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3],
v2=np.linspace(10, 11, 12),
)
)
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def left_df(request, df1):
""" Construct left test DataFrame with specified levels
(any of 'outer', 'inner', and 'v1')"""
levels = request.param
if levels:
df1 = df1.set_index(levels)
return df1
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def right_df(request, df2):
""" Construct right test DataFrame with specified levels
(any of 'outer', 'inner', and 'v2')"""
levels = request.param
if levels:
df2 = df2.set_index(levels)
return df2
def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None):
"""
Compute the expected merge result for the test case.
This method computes the expected result of merging two DataFrames on
a combination of their columns and index levels. It does so by
explicitly dropping/resetting their named index levels, performing a
merge on their columns, and then finally restoring the appropriate
index in the result.
Parameters
----------
df_left : DataFrame
The left DataFrame (may have zero or more named index levels)
df_right : DataFrame
The right DataFrame (may have zero or more named index levels)
on : list of str
The on parameter to the merge operation
left_on : list of str
The left_on parameter to the merge operation
right_on : list of str
The right_on parameter to the merge operation
how : str
The how parameter to the merge operation
Returns
-------
DataFrame
The expected merge result
"""
# Handle on param if specified
if on is not None:
left_on, right_on = on, on
# Compute input named index levels
left_levels = [n for n in df_left.index.names if n is not None]
right_levels = [n for n in df_right.index.names if n is not None]
# Compute output named index levels
output_levels = [i for i in left_on if i in right_levels and i in left_levels]
# Drop index levels that aren't involved in the merge
drop_left = [n for n in left_levels if n not in left_on]
if drop_left:
df_left = df_left.reset_index(drop_left, drop=True)
drop_right = [n for n in right_levels if n not in right_on]
if drop_right:
df_right = df_right.reset_index(drop_right, drop=True)
# Convert remaining index levels to columns
reset_left = [n for n in left_levels if n in left_on]
if reset_left:
df_left = df_left.reset_index(level=reset_left)
reset_right = [n for n in right_levels if n in right_on]
if reset_right:
df_right = df_right.reset_index(level=reset_right)
# Perform merge
expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how)
# Restore index levels
if output_levels:
expected = expected.set_index(output_levels)
return expected
@pytest.mark.parametrize(
"on,how",
[
(["outer"], "inner"),
(["inner"], "left"),
(["outer", "inner"], "right"),
(["inner", "outer"], "outer"),
],
)
def test_merge_indexes_and_columns_on(left_df, right_df, on, how):
# Construct expected result
expected = compute_expected(left_df, right_df, on=on, how=how)
# Perform merge
result = left_df.merge(right_df, on=on, how=how)
assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize(
"left_on,right_on,how",
[
(["outer"], ["outer"], "inner"),
(["inner"], ["inner"], "right"),
(["outer", "inner"], ["outer", "inner"], "left"),
(["inner", "outer"], ["inner", "outer"], "outer"),
],
)
def test_merge_indexes_and_columns_lefton_righton(
left_df, right_df, left_on, right_on, how
):
# Construct expected result
expected = compute_expected(
left_df, right_df, left_on=left_on, right_on=right_on, how=how
)
# Perform merge
result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how)
assert_frame_equal(result, expected, check_like=True)
@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]])
def test_join_indexes_and_columns_on(df1, df2, left_index, join_type):
# Construct left_df
left_df = df1.set_index(left_index)
# Construct right_df
right_df = df2.set_index(["outer", "inner"])
# Result
expected = (
left_df.reset_index()
.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)
.set_index(left_index)
)
# Perform join
result = left_df.join(
right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y"
)
assert_frame_equal(result, expected, check_like=True)

View File

@@ -0,0 +1,117 @@
from numpy import nan
import pytest
import pandas as pd
from pandas import DataFrame, merge_ordered
from pandas.util.testing import assert_frame_equal
class TestMergeOrdered:
def setup_method(self, method):
self.left = DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]})
self.right = DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]})
def test_basic(self):
result = merge_ordered(self.left, self.right, on="key")
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"],
"lvalue": [1, nan, 2, nan, 3, nan],
"rvalue": [nan, 1, 2, 3, nan, 4],
}
)
assert_frame_equal(result, expected)
def test_ffill(self):
result = merge_ordered(self.left, self.right, on="key", fill_method="ffill")
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"],
"lvalue": [1.0, 1, 2, 2, 3, 3.0],
"rvalue": [nan, 1, 2, 3, 3, 4],
}
)
assert_frame_equal(result, expected)
def test_multigroup(self):
left = pd.concat([self.left, self.left], ignore_index=True)
left["group"] = ["a"] * 3 + ["b"] * 3
result = merge_ordered(
left, self.right, on="key", left_by="group", fill_method="ffill"
)
expected = DataFrame(
{
"key": ["a", "b", "c", "d", "e", "f"] * 2,
"lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2,
"rvalue": [nan, 1, 2, 3, 3, 4] * 2,
}
)
expected["group"] = ["a"] * 6 + ["b"] * 6
assert_frame_equal(result, expected.loc[:, result.columns])
result2 = merge_ordered(
self.right, left, on="key", right_by="group", fill_method="ffill"
)
assert_frame_equal(result, result2.loc[:, result.columns])
result = merge_ordered(left, self.right, on="key", left_by="group")
assert result["group"].notna().all()
def test_merge_type(self):
class NotADataFrame(DataFrame):
@property
def _constructor(self):
return NotADataFrame
nad = NotADataFrame(self.left)
result = nad.merge(self.right, on="key")
assert isinstance(result, NotADataFrame)
def test_empty_sequence_concat(self):
# GH 9157
empty_pat = "[Nn]o objects"
none_pat = "objects.*None"
test_cases = [
((), empty_pat),
([], empty_pat),
({}, empty_pat),
([None], none_pat),
([None, None], none_pat),
]
for df_seq, pattern in test_cases:
with pytest.raises(ValueError, match=pattern):
pd.concat(df_seq)
pd.concat([pd.DataFrame()])
pd.concat([None, pd.DataFrame()])
pd.concat([pd.DataFrame(), None])
def test_doc_example(self):
left = DataFrame(
{
"group": list("aaabbb"),
"key": ["a", "c", "e", "a", "c", "e"],
"lvalue": [1, 2, 3] * 2,
}
)
right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
result = merge_ordered(left, right, fill_method="ffill", left_by="group")
expected = DataFrame(
{
"group": list("aaaaabbbbb"),
"key": ["a", "b", "c", "d", "e"] * 2,
"lvalue": [1, 1, 2, 2, 3] * 2,
"rvalue": [nan, 1, 2, 3, 3] * 2,
}
)
assert_frame_equal(result, expected)

View File

@@ -0,0 +1,810 @@
from collections import OrderedDict
import numpy as np
from numpy import nan
from numpy.random import randn
import pytest
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge
import pandas.util.testing as tm
@pytest.fixture
def left():
"""left dataframe (not multi-indexed) for multi-index join tests"""
# a little relevant example with NAs
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]
data = np.random.randn(len(key1))
return DataFrame({"key1": key1, "key2": key2, "data": data})
@pytest.fixture
def right():
"""right dataframe (multi-indexed) for multi-index join tests"""
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["key1", "key2"],
)
return DataFrame(
np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
)
@pytest.fixture
def left_multi():
return DataFrame(
dict(
Origin=["A", "A", "B", "B", "C"],
Destination=["A", "B", "A", "C", "A"],
Period=["AM", "AM", "IP", "AM", "OP"],
TripPurp=["hbw", "nhb", "hbo", "nhb", "hbw"],
Trips=[1987, 3647, 2470, 4296, 4444],
),
columns=["Origin", "Destination", "Period", "TripPurp", "Trips"],
).set_index(["Origin", "Destination", "Period", "TripPurp"])
@pytest.fixture
def right_multi():
return DataFrame(
dict(
Origin=["A", "A", "B", "B", "C", "C", "E"],
Destination=["A", "B", "A", "B", "A", "B", "F"],
Period=["AM", "AM", "IP", "AM", "OP", "IP", "AM"],
LinkType=["a", "b", "c", "b", "a", "b", "a"],
Distance=[100, 80, 90, 80, 75, 35, 55],
),
columns=["Origin", "Destination", "Period", "LinkType", "Distance"],
).set_index(["Origin", "Destination", "Period", "LinkType"])
@pytest.fixture
def on_cols_multi():
return ["Origin", "Destination", "Period"]
@pytest.fixture
def idx_cols_multi():
return ["Origin", "Destination", "Period", "TripPurp", "LinkType"]
class TestMergeMulti:
def setup_method(self):
self.index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["first", "second"],
)
self.to_join = DataFrame(
np.random.randn(10, 3),
index=self.index,
columns=["j_one", "j_two", "j_three"],
)
# a little relevant example with NAs
key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
key2 = [
"two",
"one",
"three",
"one",
"two",
"one",
"two",
"two",
"three",
"one",
]
data = np.random.randn(len(key1))
self.data = DataFrame({"key1": key1, "key2": key2, "data": data})
def test_merge_on_multikey(self, left, right, join_type):
on_cols = ["key1", "key2"]
result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True)
expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type)
tm.assert_frame_equal(result, expected)
result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index(
drop=True
)
expected = pd.merge(
left, right.reset_index(), on=on_cols, how=join_type, sort=True
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("sort", [False, True])
def test_left_join_multi_index(self, left, right, sort):
icols = ["1st", "2nd", "3rd"]
def bind_cols(df):
iord = lambda a: 0 if a != a else ord(a)
f = lambda ts: ts.map(iord) - ord("a")
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4
def run_asserts(left, right, sort):
res = left.join(right, on=icols, how="left", sort=sort)
assert len(left) < len(res) + 1
assert not res["4th"].isna().any()
assert not res["5th"].isna().any()
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
result = bind_cols(res.iloc[:, :-2])
tm.assert_series_equal(res["4th"], result, check_names=False)
assert result.name is None
if sort:
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
res.index = np.arange(len(res))
tm.assert_frame_equal(out, res)
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"])
left.insert(1, "2nd", np.random.randint(0, 1000, len(left)))
i = np.random.permutation(len(left))
right = left.iloc[i].copy()
left["4th"] = bind_cols(left)
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
# inject some nulls
left.loc[1::23, "1st"] = np.nan
left.loc[2::37, "2nd"] = np.nan
left.loc[3::43, "3rd"] = np.nan
left["4th"] = bind_cols(left)
i = np.random.permutation(len(left))
right = left.iloc[i, :-1]
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
@pytest.mark.parametrize("sort", [False, True])
def test_merge_right_vs_left(self, left, right, sort):
# compare left vs right merge with multikey
on_cols = ["key1", "key2"]
merged_left_right = left.merge(
right, left_on=on_cols, right_index=True, how="left", sort=sort
)
merge_right_left = right.merge(
left, right_on=on_cols, left_index=True, how="right", sort=sort
)
# Reorder columns
merge_right_left = merge_right_left[merged_left_right.columns]
tm.assert_frame_equal(merged_left_right, merge_right_left)
def test_compress_group_combinations(self):
# ~ 40000000 possible unique groups
key1 = tm.rands_array(10, 10000)
key1 = np.tile(key1, 2)
key2 = key1[::-1]
df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)})
df2 = DataFrame(
{"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)}
)
# just to hit the label compression code path
merge(df, df2, how="outer")
def test_left_join_index_preserve_order(self):
on_cols = ["k1", "k2"]
left = DataFrame(
{
"k1": [0, 1, 2] * 8,
"k2": ["foo", "bar"] * 12,
"v": np.array(np.arange(24), dtype=np.int64),
}
)
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
right = DataFrame({"v2": [5, 7]}, index=index)
result = left.join(right, on=on_cols)
expected = left.copy()
expected["v2"] = np.nan
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
tm.assert_frame_equal(result, expected)
result.sort_values(on_cols, kind="mergesort", inplace=True)
expected = left.join(right, on=on_cols, sort=True)
tm.assert_frame_equal(result, expected)
# test join with multi dtypes blocks
left = DataFrame(
{
"k1": [0, 1, 2] * 8,
"k2": ["foo", "bar"] * 12,
"k3": np.array([0, 1, 2] * 8, dtype=np.float32),
"v": np.array(np.arange(24), dtype=np.int32),
}
)
index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
right = DataFrame({"v2": [5, 7]}, index=index)
result = left.join(right, on=on_cols)
expected = left.copy()
expected["v2"] = np.nan
expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
tm.assert_frame_equal(result, expected)
result = result.sort_values(on_cols, kind="mergesort")
expected = left.join(right, on=on_cols, sort=True)
tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match_multiindex(self):
left = DataFrame(
[
["X", "Y", "C", "a"],
["W", "Y", "C", "e"],
["V", "Q", "A", "h"],
["V", "R", "D", "i"],
["X", "Y", "D", "b"],
["X", "Y", "A", "c"],
["W", "Q", "B", "f"],
["W", "R", "C", "g"],
["V", "Y", "C", "j"],
["X", "Y", "B", "d"],
],
columns=["cola", "colb", "colc", "tag"],
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
)
right = DataFrame(
[
["W", "R", "C", 0],
["W", "Q", "B", 3],
["W", "Q", "B", 8],
["X", "Y", "A", 1],
["X", "Y", "A", 4],
["X", "Y", "B", 5],
["X", "Y", "C", 6],
["X", "Y", "C", 9],
["X", "Q", "C", -6],
["X", "R", "C", -9],
["V", "Y", "C", 7],
["V", "R", "D", 2],
["V", "R", "D", -1],
["V", "Q", "A", -3],
],
columns=["col1", "col2", "col3", "val"],
).set_index(["col1", "col2", "col3"])
result = left.join(right, on=["cola", "colb", "colc"], how="left")
expected = DataFrame(
[
["X", "Y", "C", "a", 6],
["X", "Y", "C", "a", 9],
["W", "Y", "C", "e", nan],
["V", "Q", "A", "h", -3],
["V", "R", "D", "i", 2],
["V", "R", "D", "i", -1],
["X", "Y", "D", "b", nan],
["X", "Y", "A", "c", 1],
["X", "Y", "A", "c", 4],
["W", "Q", "B", "f", 3],
["W", "Q", "B", "f", 8],
["W", "R", "C", "g", 0],
["V", "Y", "C", "j", 7],
["X", "Y", "B", "d", 5],
],
columns=["cola", "colb", "colc", "tag", "val"],
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
)
tm.assert_frame_equal(result, expected)
result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True)
expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort")
tm.assert_frame_equal(result, expected)
def test_left_join_index_multi_match(self):
left = DataFrame(
[["c", 0], ["b", 1], ["a", 2], ["b", 3]],
columns=["tag", "val"],
index=[2, 0, 1, 3],
)
right = DataFrame(
[
["a", "v"],
["c", "w"],
["c", "x"],
["d", "y"],
["a", "z"],
["c", "r"],
["e", "q"],
["c", "s"],
],
columns=["tag", "char"],
).set_index("tag")
result = left.join(right, on="tag", how="left")
expected = DataFrame(
[
["c", 0, "w"],
["c", 0, "x"],
["c", 0, "r"],
["c", 0, "s"],
["b", 1, nan],
["a", 2, "v"],
["a", 2, "z"],
["b", 3, nan],
],
columns=["tag", "val", "char"],
index=[2, 2, 2, 2, 0, 1, 1, 3],
)
tm.assert_frame_equal(result, expected)
result = left.join(right, on="tag", how="left", sort=True)
expected2 = expected.sort_values("tag", kind="mergesort")
tm.assert_frame_equal(result, expected2)
# GH7331 - maintain left frame order in left merge
result = merge(left, right.reset_index(), how="left", on="tag")
expected.index = np.arange(len(expected))
tm.assert_frame_equal(result, expected)
def test_left_merge_na_buglet(self):
left = DataFrame(
{
"id": list("abcde"),
"v1": randn(5),
"v2": randn(5),
"dummy": list("abcde"),
"v3": randn(5),
},
columns=["id", "v1", "v2", "dummy", "v3"],
)
right = DataFrame(
{
"id": ["a", "b", np.nan, np.nan, np.nan],
"sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
}
)
result = merge(left, right, on="id", how="left")
rdf = right.drop(["id"], axis=1)
expected = left.join(rdf)
tm.assert_frame_equal(result, expected)
def test_merge_na_keys(self):
data = [
[1950, "A", 1.5],
[1950, "B", 1.5],
[1955, "B", 1.5],
[1960, "B", np.nan],
[1970, "B", 4.0],
[1950, "C", 4.0],
[1960, "C", np.nan],
[1965, "C", 3.0],
[1970, "C", 4.0],
]
frame = DataFrame(data, columns=["year", "panel", "data"])
other_data = [
[1960, "A", np.nan],
[1970, "A", np.nan],
[1955, "A", np.nan],
[1965, "A", np.nan],
[1965, "B", np.nan],
[1955, "C", np.nan],
]
other = DataFrame(other_data, columns=["year", "panel", "data"])
result = frame.merge(other, how="outer")
expected = frame.fillna(-999).merge(other.fillna(-999), how="outer")
expected = expected.replace(-999, np.nan)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, klass):
# see gh-19038
df = DataFrame(
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
)
df.index = pd.to_datetime(df.index)
on_vector = df.index.year
if klass is not None:
on_vector = klass(on_vector)
expected = DataFrame(
OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])])
)
result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)
expected = DataFrame(
OrderedDict(
[("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])]
)
)
result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)
def test_join_multi_levels(self):
# GH 3662
# merge multi-levels
household = DataFrame(
dict(
household_id=[1, 2, 3],
male=[0, 1, 0],
wealth=[196087.3, 316478.7, 294750],
),
columns=["household_id", "male", "wealth"],
).set_index("household_id")
portfolio = DataFrame(
dict(
household_id=[1, 2, 2, 3, 3, 3, 4],
asset_id=[
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
np.nan,
],
name=[
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
"Royal Dutch Shell",
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
np.nan,
],
share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
),
columns=["household_id", "asset_id", "name", "share"],
).set_index(["household_id", "asset_id"])
result = household.join(portfolio, how="inner")
expected = (
DataFrame(
dict(
male=[0, 1, 1, 0, 0, 0],
wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0],
name=[
"ABN Amro",
"Robeco",
"Royal Dutch Shell",
"Royal Dutch Shell",
"AAB Eastern Europe Equity Fund",
"Postbank BioTech Fonds",
],
share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
household_id=[1, 2, 2, 3, 3, 3],
asset_id=[
"nl0000301109",
"nl0000289783",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
],
)
)
.set_index(["household_id", "asset_id"])
.reindex(columns=["male", "wealth", "name", "share"])
)
tm.assert_frame_equal(result, expected)
# equivalency
result = merge(
household.reset_index(),
portfolio.reset_index(),
on=["household_id"],
how="inner",
).set_index(["household_id", "asset_id"])
tm.assert_frame_equal(result, expected)
result = household.join(portfolio, how="outer")
expected = concat(
[
expected,
(
DataFrame(
dict(share=[1.00]),
index=MultiIndex.from_tuples(
[(4, np.nan)], names=["household_id", "asset_id"]
),
)
),
],
axis=0,
sort=True,
).reindex(columns=expected.columns)
tm.assert_frame_equal(result, expected)
# invalid cases
household.index.name = "foo"
with pytest.raises(ValueError):
household.join(portfolio, how="inner")
portfolio2 = portfolio.copy()
portfolio2.index.set_names(["household_id", "foo"])
with pytest.raises(ValueError):
portfolio2.join(portfolio, how="inner")
def test_join_multi_levels2(self):
# some more advanced merges
# GH6360
household = DataFrame(
dict(
household_id=[1, 2, 2, 3, 3, 3, 4],
asset_id=[
"nl0000301109",
"nl0000301109",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"nl0000289965",
np.nan,
],
share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
),
columns=["household_id", "asset_id", "share"],
).set_index(["household_id", "asset_id"])
log_return = DataFrame(
dict(
asset_id=[
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
t=[233, 234, 235, 180, 181],
log_return=[0.09604978, -0.06524096, 0.03532373, 0.03025441, 0.036997],
)
).set_index(["asset_id", "t"])
expected = (
DataFrame(
dict(
household_id=[2, 2, 2, 3, 3, 3, 3, 3],
asset_id=[
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
],
t=[233, 234, 235, 233, 234, 235, 180, 181],
share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
log_return=[
0.09604978,
-0.06524096,
0.03532373,
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
],
)
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
)
# this is the equivalency
result = merge(
household.reset_index(),
log_return.reset_index(),
on=["asset_id"],
how="inner",
).set_index(["household_id", "asset_id", "t"])
tm.assert_frame_equal(result, expected)
expected = (
DataFrame(
dict(
household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
asset_id=[
"nl0000301109",
"nl0000301109",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"gb00b03mlx29",
"lu0197800237",
"lu0197800237",
"nl0000289965",
None,
],
t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None],
share=[
1.0,
0.4,
0.6,
0.6,
0.6,
0.15,
0.15,
0.15,
0.6,
0.6,
0.25,
1.0,
],
log_return=[
None,
None,
0.09604978,
-0.06524096,
0.03532373,
0.09604978,
-0.06524096,
0.03532373,
0.03025441,
0.036997,
None,
None,
],
)
)
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=["share", "log_return"])
)
result = merge(
household.reset_index(),
log_return.reset_index(),
on=["asset_id"],
how="outer",
).set_index(["household_id", "asset_id", "t"])
tm.assert_frame_equal(result, expected)
class TestJoinMultiMulti:
def test_join_multi_multi(
self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
):
# Multi-index join tests
expected = (
pd.merge(
left_multi.reset_index(),
right_multi.reset_index(),
how=join_type,
on=on_cols_multi,
)
.set_index(idx_cols_multi)
.sort_index()
)
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)
def test_join_multi_empty_frames(
self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
):
left_multi = left_multi.drop(columns=left_multi.columns)
right_multi = right_multi.drop(columns=right_multi.columns)
expected = (
pd.merge(
left_multi.reset_index(),
right_multi.reset_index(),
how=join_type,
on=on_cols_multi,
)
.set_index(idx_cols_multi)
.sort_index()
)
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, box):
# see gh-19038
df = DataFrame(
[1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"]
)
df.index = pd.to_datetime(df.index)
on_vector = df.index.year
if box is not None:
on_vector = box(on_vector)
expected = DataFrame(
OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])])
)
result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)
expected = DataFrame(
OrderedDict(
[("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])]
)
)
result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)
def test_single_common_level(self):
index_left = pd.MultiIndex.from_tuples(
[("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"]
)
left = pd.DataFrame(
{"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left
)
index_right = pd.MultiIndex.from_tuples(
[("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"]
)
right = pd.DataFrame(
{"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]},
index=index_right,
)
result = left.join(right)
expected = pd.merge(
left.reset_index(), right.reset_index(), on=["key"], how="inner"
).set_index(["key", "X", "Y"])
tm.assert_frame_equal(result, expected)