8th day of python challenges 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions

View File

@@ -0,0 +1,155 @@
import numpy as np
from pandas.util._decorators import cache_readonly
import pandas as pd
import pandas.util.testing as tm
_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()
_frame = pd.DataFrame(_seriesd)
_frame2 = pd.DataFrame(_seriesd, columns=["D", "C", "B", "A"])
_intframe = pd.DataFrame({k: v.astype(int) for k, v in _seriesd.items()})
_tsframe = pd.DataFrame(_tsd)
_mixed_frame = _frame.copy()
_mixed_frame["foo"] = "bar"
class TestData:
@cache_readonly
def frame(self):
return _frame.copy()
@cache_readonly
def frame2(self):
return _frame2.copy()
@cache_readonly
def intframe(self):
# force these all to int64 to avoid platform testing issues
return pd.DataFrame({c: s for c, s in _intframe.items()}, dtype=np.int64)
@cache_readonly
def tsframe(self):
return _tsframe.copy()
@cache_readonly
def mixed_frame(self):
return _mixed_frame.copy()
@cache_readonly
def mixed_float(self):
return pd.DataFrame(
{
"A": _frame["A"].copy().astype("float32"),
"B": _frame["B"].copy().astype("float32"),
"C": _frame["C"].copy().astype("float16"),
"D": _frame["D"].copy().astype("float64"),
}
)
@cache_readonly
def mixed_float2(self):
return pd.DataFrame(
{
"A": _frame2["A"].copy().astype("float32"),
"B": _frame2["B"].copy().astype("float32"),
"C": _frame2["C"].copy().astype("float16"),
"D": _frame2["D"].copy().astype("float64"),
}
)
@cache_readonly
def mixed_int(self):
return pd.DataFrame(
{
"A": _intframe["A"].copy().astype("int32"),
"B": np.ones(len(_intframe["B"]), dtype="uint64"),
"C": _intframe["C"].copy().astype("uint8"),
"D": _intframe["D"].copy().astype("int64"),
}
)
@cache_readonly
def all_mixed(self):
return pd.DataFrame(
{
"a": 1.0,
"b": 2,
"c": "foo",
"float32": np.array([1.0] * 10, dtype="float32"),
"int32": np.array([1] * 10, dtype="int32"),
},
index=np.arange(10),
)
@cache_readonly
def tzframe(self):
result = pd.DataFrame(
{
"A": pd.date_range("20130101", periods=3),
"B": pd.date_range("20130101", periods=3, tz="US/Eastern"),
"C": pd.date_range("20130101", periods=3, tz="CET"),
}
)
result.iloc[1, 1] = pd.NaT
result.iloc[1, 2] = pd.NaT
return result
@cache_readonly
def empty(self):
return pd.DataFrame()
@cache_readonly
def ts1(self):
return tm.makeTimeSeries(nper=30)
@cache_readonly
def ts2(self):
return tm.makeTimeSeries(nper=30)[5:]
@cache_readonly
def simple(self):
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
return pd.DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])
# self.ts3 = tm.makeTimeSeries()[-5:]
# self.ts4 = tm.makeTimeSeries()[1:-1]
def _check_mixed_float(df, dtype=None):
# float16 are most likely to be upcasted to float32
dtypes = dict(A="float32", B="float32", C="float16", D="float64")
if isinstance(dtype, str):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get("A"):
assert df.dtypes["A"] == dtypes["A"]
if dtypes.get("B"):
assert df.dtypes["B"] == dtypes["B"]
if dtypes.get("C"):
assert df.dtypes["C"] == dtypes["C"]
if dtypes.get("D"):
assert df.dtypes["D"] == dtypes["D"]
def _check_mixed_int(df, dtype=None):
dtypes = dict(A="int32", B="uint64", C="uint8", D="int64")
if isinstance(dtype, str):
dtypes = {k: dtype for k, v in dtypes.items()}
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get("A"):
assert df.dtypes["A"] == dtypes["A"]
if dtypes.get("B"):
assert df.dtypes["B"] == dtypes["B"]
if dtypes.get("C"):
assert df.dtypes["C"] == dtypes["C"]
if dtypes.get("D"):
assert df.dtypes["D"] == dtypes["D"]

View File

@@ -0,0 +1,330 @@
import numpy as np
import pytest
from pandas import DataFrame, NaT, date_range
import pandas.util.testing as tm
@pytest.fixture
def float_frame_with_na():
"""
Fixture for DataFrame of floats with index of unique strings
Columns are ['A', 'B', 'C', 'D']; some entries are missing
A B C D
ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997
DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872
neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522
0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018
3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826
soujjZ0A08 NaN NaN NaN NaN
7W6NLGsjB9 NaN NaN NaN NaN
... ... ... ... ...
uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590
n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717
ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189
uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503
3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947
2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083
sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517
[30 rows x 4 columns]
"""
df = DataFrame(tm.getSeriesData())
# set some NAs
df.loc[5:10] = np.nan
df.loc[15:20, -2:] = np.nan
return df
@pytest.fixture
def bool_frame_with_na():
"""
Fixture for DataFrame of booleans with index of unique strings
Columns are ['A', 'B', 'C', 'D']; some entries are missing
A B C D
zBZxY2IDGd False False False False
IhBWBMWllt False True True True
ctjdvZSR6R True False True True
AVTujptmxb False True False True
G9lrImrSWq False False False True
sFFwdIUfz2 NaN NaN NaN NaN
s15ptEJnRb NaN NaN NaN NaN
... ... ... ... ...
UW41KkDyZ4 True True False False
l9l6XkOdqV True False False False
X2MeZfzDYA False True False False
xWkIKU7vfX False True False True
QOhL6VmpGU False False False True
22PwkRJdat False True False False
kfboQ3VeIK True False True False
[30 rows x 4 columns]
"""
df = DataFrame(tm.getSeriesData()) > 0
df = df.astype(object)
# set some NAs
df.loc[5:10] = np.nan
df.loc[15:20, -2:] = np.nan
return df
@pytest.fixture
def int_frame():
"""
Fixture for DataFrame of ints with index of unique strings
Columns are ['A', 'B', 'C', 'D']
A B C D
vpBeWjM651 1 0 1 0
5JyxmrP1En -1 0 0 0
qEDaoD49U2 -1 1 0 0
m66TkTfsFe 0 0 0 0
EHPaNzEUFm -1 0 -1 0
fpRJCevQhi 2 0 0 0
OlQvnmfi3Q 0 0 -2 0
... .. .. .. ..
uB1FPlz4uP 0 0 0 1
EcSe6yNzCU 0 0 -1 0
L50VudaiI8 -1 1 -2 0
y3bpw4nwIp 0 -1 0 0
H0RdLLwrCT 1 1 0 0
rY82K0vMwm 0 0 0 0
1OPIUjnkjk 2 0 0 0
[30 rows x 4 columns]
"""
df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()})
# force these all to int64 to avoid platform testing issues
return DataFrame({c: s for c, s in df.items()}, dtype=np.int64)
@pytest.fixture
def datetime_frame():
"""
Fixture for DataFrame of floats with DatetimeIndex
Columns are ['A', 'B', 'C', 'D']
A B C D
2000-01-03 -1.122153 0.468535 0.122226 1.693711
2000-01-04 0.189378 0.486100 0.007864 -1.216052
2000-01-05 0.041401 -0.835752 -0.035279 -0.414357
2000-01-06 0.430050 0.894352 0.090719 0.036939
2000-01-07 -0.620982 -0.668211 -0.706153 1.466335
2000-01-10 -0.752633 0.328434 -0.815325 0.699674
2000-01-11 -2.236969 0.615737 -0.829076 -1.196106
... ... ... ... ...
2000-02-03 1.642618 -0.579288 0.046005 1.385249
2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351
2000-02-07 -2.656149 -0.601387 1.410148 0.444150
2000-02-08 -1.201881 -1.289040 0.772992 -1.445300
2000-02-09 1.377373 0.398619 1.008453 -0.928207
2000-02-10 0.473194 -0.636677 0.984058 0.511519
2000-02-11 -0.965556 0.408313 -1.312844 -0.381948
[30 rows x 4 columns]
"""
return DataFrame(tm.getTimeSeriesData())
@pytest.fixture
def float_string_frame():
"""
Fixture for DataFrame of floats and strings with index of unique strings
Columns are ['A', 'B', 'C', 'D', 'foo'].
A B C D foo
w3orJvq07g -1.594062 -1.084273 -1.252457 0.356460 bar
PeukuVdmz2 0.109855 -0.955086 -0.809485 0.409747 bar
ahp2KvwiM8 -1.533729 -0.142519 -0.154666 1.302623 bar
3WSJ7BUCGd 2.484964 0.213829 0.034778 -2.327831 bar
khdAmufk0U -0.193480 -0.743518 -0.077987 0.153646 bar
LE2DZiFlrE -0.193566 -1.343194 -0.107321 0.959978 bar
HJXSJhVn7b 0.142590 1.257603 -0.659409 -0.223844 bar
... ... ... ... ... ...
9a1Vypttgw -1.316394 1.601354 0.173596 1.213196 bar
h5d1gVFbEy 0.609475 1.106738 -0.155271 0.294630 bar
mK9LsTQG92 1.303613 0.857040 -1.019153 0.369468 bar
oOLksd9gKH 0.558219 -0.134491 -0.289869 -0.951033 bar
9jgoOjKyHg 0.058270 -0.496110 -0.413212 -0.852659 bar
jZLDHclHAO 0.096298 1.267510 0.549206 -0.005235 bar
lR0nxDp1C2 -2.119350 -0.794384 0.544118 0.145849 bar
[30 rows x 5 columns]
"""
df = DataFrame(tm.getSeriesData())
df["foo"] = "bar"
return df
@pytest.fixture
def mixed_float_frame():
"""
Fixture for DataFrame of different float types with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
A B C D
GI7bbDaEZe -0.237908 -0.246225 -0.468506 0.752993
KGp9mFepzA -1.140809 -0.644046 -1.225586 0.801588
VeVYLAb1l2 -1.154013 -1.677615 0.690430 -0.003731
kmPME4WKhO 0.979578 0.998274 -0.776367 0.897607
CPyopdXTiz 0.048119 -0.257174 0.836426 0.111266
0kJZQndAj0 0.274357 -0.281135 -0.344238 0.834541
tqdwQsaHG8 -0.979716 -0.519897 0.582031 0.144710
... ... ... ... ...
7FhZTWILQj -2.906357 1.261039 -0.780273 -0.537237
4pUDPM4eGq -2.042512 -0.464382 -0.382080 1.132612
B8dUgUzwTi -1.506637 -0.364435 1.087891 0.297653
hErlVYjVv9 1.477453 -0.495515 -0.713867 1.438427
1BKN3o7YLs 0.127535 -0.349812 -0.881836 0.489827
9S4Ekn7zga 1.445518 -2.095149 0.031982 0.373204
xN1dNn6OV6 1.425017 -0.983995 -0.363281 -0.224502
[30 rows x 4 columns]
"""
df = DataFrame(tm.getSeriesData())
df.A = df.A.astype("float32")
df.B = df.B.astype("float32")
df.C = df.C.astype("float16")
df.D = df.D.astype("float64")
return df
@pytest.fixture
def mixed_int_frame():
"""
Fixture for DataFrame of different int types with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
A B C D
mUrCZ67juP 0 1 2 2
rw99ACYaKS 0 1 0 0
7QsEcpaaVU 0 1 1 1
xkrimI2pcE 0 1 0 0
dz01SuzoS8 0 1 255 255
ccQkqOHX75 -1 1 0 0
DN0iXaoDLd 0 1 0 0
... .. .. ... ...
Dfb141wAaQ 1 1 254 254
IPD8eQOVu5 0 1 0 0
CcaKulsCmv 0 1 0 0
rIBa8gu7E5 0 1 0 0
RP6peZmh5o 0 1 1 1
NMb9pipQWQ 0 1 0 0
PqgbJEzjib 0 1 3 3
[30 rows x 4 columns]
"""
df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()})
df.A = df.A.astype("int32")
df.B = np.ones(len(df.B), dtype="uint64")
df.C = df.C.astype("uint8")
df.D = df.C.astype("int64")
return df
@pytest.fixture
def mixed_type_frame():
"""
Fixture for DataFrame of float/int/string columns with RangeIndex
Columns are ['a', 'b', 'c', 'float32', 'int32'].
"""
return DataFrame(
{
"a": 1.0,
"b": 2,
"c": "foo",
"float32": np.array([1.0] * 10, dtype="float32"),
"int32": np.array([1] * 10, dtype="int32"),
},
index=np.arange(10),
)
@pytest.fixture
def timezone_frame():
"""
Fixture for DataFrame of date_range Series with different time zones
Columns are ['A', 'B', 'C']; some entries are missing
A B C
0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00
1 2013-01-02 NaT NaT
2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00
"""
df = DataFrame(
{
"A": date_range("20130101", periods=3),
"B": date_range("20130101", periods=3, tz="US/Eastern"),
"C": date_range("20130101", periods=3, tz="CET"),
}
)
df.iloc[1, 1] = NaT
df.iloc[1, 2] = NaT
return df
@pytest.fixture
def uint64_frame():
"""
Fixture for DataFrame with uint64 values
Columns are ['A', 'B']
"""
return DataFrame(
{"A": np.arange(3), "B": [2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10]}, dtype=np.uint64
)
@pytest.fixture
def simple_frame():
"""
Fixture for simple 3x3 DataFrame
Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].
one two three
a 1.0 2.0 3.0
b 4.0 5.0 6.0
c 7.0 8.0 9.0
"""
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])
@pytest.fixture
def frame_of_index_cols():
"""
Fixture for DataFrame of columns that can be used for indexing
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
A B C D E (tuple, as, label)
0 foo one a 0.608477 -0.012500 -1.664297
1 foo two b -0.633460 0.249614 -0.364411
2 foo three c 0.615256 2.154968 -0.834666
3 bar one d 0.234246 1.085675 0.718445
4 bar two e 0.533841 -0.005702 -3.533912
"""
df = DataFrame(
{
"A": ["foo", "foo", "foo", "bar", "bar"],
"B": ["one", "two", "three", "one", "two"],
"C": ["a", "b", "c", "d", "e"],
"D": np.random.randn(5),
"E": np.random.randn(5),
("tuple", "as", "label"): np.random.randn(5),
}
)
return df

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,589 @@
from copy import deepcopy
import datetime
import pydoc
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
SparseDataFrame,
SparseDtype,
compat,
date_range,
timedelta_range,
)
import pandas.util.testing as tm
from pandas.util.testing import (
assert_almost_equal,
assert_frame_equal,
assert_series_equal,
)
class SharedWithSparse:
"""
A collection of tests DataFrame and SparseDataFrame can share.
In generic tests on this class, use ``self._assert_frame_equal()`` and
``self._assert_series_equal()`` which are implemented in sub-classes
and dispatch correctly.
"""
def _assert_frame_equal(self, left, right):
"""Dispatch to frame class dependent assertion"""
raise NotImplementedError
def _assert_series_equal(self, left, right):
"""Dispatch to series class dependent assertion"""
raise NotImplementedError
def test_copy_index_name_checking(self, float_frame):
# don't want to be able to modify the index stored elsewhere after
# making a copy
for attr in ("index", "columns"):
ind = getattr(float_frame, attr)
ind.name = None
cp = float_frame.copy()
getattr(cp, attr).name = "foo"
assert getattr(float_frame, attr).name is None
def test_getitem_pop_assign_name(self, float_frame):
s = float_frame["A"]
assert s.name == "A"
s = float_frame.pop("A")
assert s.name == "A"
s = float_frame.loc[:, "B"]
assert s.name == "B"
s2 = s.loc[:]
assert s2.name == "B"
def test_get_value(self, float_frame):
for idx in float_frame.index:
for col in float_frame.columns:
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = float_frame.get_value(idx, col)
expected = float_frame[col][idx]
tm.assert_almost_equal(result, expected)
def test_add_prefix_suffix(self, float_frame):
with_prefix = float_frame.add_prefix("foo#")
expected = pd.Index(["foo#{c}".format(c=c) for c in float_frame.columns])
tm.assert_index_equal(with_prefix.columns, expected)
with_suffix = float_frame.add_suffix("#foo")
expected = pd.Index(["{c}#foo".format(c=c) for c in float_frame.columns])
tm.assert_index_equal(with_suffix.columns, expected)
with_pct_prefix = float_frame.add_prefix("%")
expected = pd.Index(["%{c}".format(c=c) for c in float_frame.columns])
tm.assert_index_equal(with_pct_prefix.columns, expected)
with_pct_suffix = float_frame.add_suffix("%")
expected = pd.Index(["{c}%".format(c=c) for c in float_frame.columns])
tm.assert_index_equal(with_pct_suffix.columns, expected)
def test_get_axis(self, float_frame):
f = float_frame
assert f._get_axis_number(0) == 0
assert f._get_axis_number(1) == 1
assert f._get_axis_number("index") == 0
assert f._get_axis_number("rows") == 0
assert f._get_axis_number("columns") == 1
assert f._get_axis_name(0) == "index"
assert f._get_axis_name(1) == "columns"
assert f._get_axis_name("index") == "index"
assert f._get_axis_name("rows") == "index"
assert f._get_axis_name("columns") == "columns"
assert f._get_axis(0) is f.index
assert f._get_axis(1) is f.columns
with pytest.raises(ValueError, match="No axis named"):
f._get_axis_number(2)
with pytest.raises(ValueError, match="No axis.*foo"):
f._get_axis_name("foo")
with pytest.raises(ValueError, match="No axis.*None"):
f._get_axis_name(None)
with pytest.raises(ValueError, match="No axis named"):
f._get_axis_number(None)
def test_keys(self, float_frame):
getkeys = float_frame.keys
assert getkeys() is float_frame.columns
def test_column_contains_raises(self, float_frame):
with pytest.raises(TypeError, match="unhashable type: 'Index'"):
float_frame.columns in float_frame
def test_tab_completion(self):
# DataFrame whose columns are identifiers shall have them in __dir__.
df = pd.DataFrame([list("abcd"), list("efgh")], columns=list("ABCD"))
for key in list("ABCD"):
assert key in dir(df)
assert isinstance(df.__getitem__("A"), pd.Series)
# DataFrame whose first-level columns are identifiers shall have
# them in __dir__.
df = pd.DataFrame(
[list("abcd"), list("efgh")],
columns=pd.MultiIndex.from_tuples(list(zip("ABCD", "EFGH"))),
)
for key in list("ABCD"):
assert key in dir(df)
for key in list("EFGH"):
assert key not in dir(df)
assert isinstance(df.__getitem__("A"), pd.DataFrame)
def test_not_hashable(self):
empty_frame = DataFrame()
df = self.klass([1])
msg = "'(Sparse)?DataFrame' objects are mutable, thus they cannot be hashed"
with pytest.raises(TypeError, match=msg):
hash(df)
with pytest.raises(TypeError, match=msg):
hash(empty_frame)
def test_new_empty_index(self):
df1 = self.klass(np.random.randn(0, 3))
df2 = self.klass(np.random.randn(0, 3))
df1.index.name = "foo"
assert df2.index.name is None
def test_array_interface(self, float_frame):
with np.errstate(all="ignore"):
result = np.sqrt(float_frame)
assert isinstance(result, type(float_frame))
assert result.index is float_frame.index
assert result.columns is float_frame.columns
self._assert_frame_equal(result, float_frame.apply(np.sqrt))
def test_get_agg_axis(self, float_frame):
cols = float_frame._get_agg_axis(0)
assert cols is float_frame.columns
idx = float_frame._get_agg_axis(1)
assert idx is float_frame.index
msg = r"Axis must be 0 or 1 \(got 2\)"
with pytest.raises(ValueError, match=msg):
float_frame._get_agg_axis(2)
def test_nonzero(self, float_frame, float_string_frame):
empty_frame = DataFrame()
assert empty_frame.empty
assert not float_frame.empty
assert not float_string_frame.empty
# corner case
df = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}, index=np.arange(3))
del df["A"]
assert not df.empty
def test_iteritems(self):
df = self.klass([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
for k, v in df.items():
assert isinstance(v, self.klass._constructor_sliced)
def test_items(self):
# GH 17213, GH 13918
cols = ["a", "b", "c"]
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
for c, (k, v) in zip(cols, df.items()):
assert c == k
assert isinstance(v, Series)
assert (df[k] == v).all()
def test_iter(self, float_frame):
assert tm.equalContents(list(float_frame), float_frame.columns)
def test_iterrows(self, float_frame, float_string_frame):
for k, v in float_frame.iterrows():
exp = float_frame.loc[k]
self._assert_series_equal(v, exp)
for k, v in float_string_frame.iterrows():
exp = float_string_frame.loc[k]
self._assert_series_equal(v, exp)
def test_iterrows_iso8601(self):
# GH 19671
if self.klass == SparseDataFrame:
pytest.xfail(reason="SparseBlock datetime type not implemented.")
s = self.klass(
{
"non_iso8601": ["M1701", "M1802", "M1903", "M2004"],
"iso8601": date_range("2000-01-01", periods=4, freq="M"),
}
)
for k, v in s.iterrows():
exp = s.loc[k]
self._assert_series_equal(v, exp)
def test_iterrows_corner(self):
# gh-12222
df = DataFrame(
{
"a": [datetime.datetime(2015, 1, 1)],
"b": [None],
"c": [None],
"d": [""],
"e": [[]],
"f": [set()],
"g": [{}],
}
)
expected = Series(
[datetime.datetime(2015, 1, 1), None, None, "", [], set(), {}],
index=list("abcdefg"),
name=0,
dtype="object",
)
_, result = next(df.iterrows())
tm.assert_series_equal(result, expected)
def test_itertuples(self, float_frame):
for i, tup in enumerate(float_frame.itertuples()):
s = self.klass._constructor_sliced(tup[1:])
s.name = tup[0]
expected = float_frame.iloc[i, :].reset_index(drop=True)
self._assert_series_equal(s, expected)
df = self.klass(
{"floats": np.random.randn(5), "ints": range(5)}, columns=["floats", "ints"]
)
for tup in df.itertuples(index=False):
assert isinstance(tup[1], int)
df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
dfaa = df[["a", "a"]]
assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]
# repr with int on 32-bit/windows
if not (compat.is_platform_windows() or compat.is_platform_32bit()):
assert (
repr(list(df.itertuples(name=None)))
== "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]"
)
tup = next(df.itertuples(name="TestName"))
assert tup._fields == ("Index", "a", "b")
assert (tup.Index, tup.a, tup.b) == tup
assert type(tup).__name__ == "TestName"
df.columns = ["def", "return"]
tup2 = next(df.itertuples(name="TestName"))
assert tup2 == (0, 1, 4)
assert tup2._fields == ("Index", "_1", "_2")
df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})
# will raise SyntaxError if trying to create namedtuple
tup3 = next(df3.itertuples())
assert not hasattr(tup3, "_fields")
assert isinstance(tup3, tuple)
def test_sequence_like_with_categorical(self):
# GH 7839
# make sure can iterate
df = DataFrame(
{"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
)
df["grade"] = Categorical(df["raw_grade"])
# basic sequencing testing
result = list(df.grade.values)
expected = np.array(df.grade.values).tolist()
tm.assert_almost_equal(result, expected)
# iteration
for t in df.itertuples(index=False):
str(t)
for row, s in df.iterrows():
str(s)
for c, col in df.items():
str(s)
def test_len(self, float_frame):
assert len(float_frame) == len(float_frame.index)
def test_values(self, float_frame, float_string_frame):
frame = float_frame
arr = frame.values
frame_cols = frame.columns
for i, row in enumerate(arr):
for j, value in enumerate(row):
col = frame_cols[j]
if np.isnan(value):
assert np.isnan(frame[col][i])
else:
assert value == frame[col][i]
# mixed type
arr = float_string_frame[["foo", "A"]].values
assert arr[0, 0] == "bar"
df = self.klass({"complex": [1j, 2j, 3j], "real": [1, 2, 3]})
arr = df.values
assert arr[0, 0] == 1j
# single block corner case
arr = float_frame[["A", "B"]].values
expected = float_frame.reindex(columns=["A", "B"]).values
assert_almost_equal(arr, expected)
def test_to_numpy(self):
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
expected = np.array([[1, 3], [2, 4.5]])
result = df.to_numpy()
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_dtype(self):
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
expected = np.array([[1, 3], [2, 4]], dtype="int64")
result = df.to_numpy(dtype="int64")
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_copy(self):
arr = np.random.randn(4, 3)
df = pd.DataFrame(arr)
assert df.values.base is arr
assert df.to_numpy(copy=False).base is arr
assert df.to_numpy(copy=True).base is None
def test_transpose(self, float_frame):
frame = float_frame
dft = frame.T
for idx, series in dft.items():
for col, value in series.items():
if np.isnan(value):
assert np.isnan(frame[col][idx])
else:
assert value == frame[col][idx]
# mixed type
index, data = tm.getMixedTypeDict()
mixed = self.klass(data, index=index)
mixed_T = mixed.T
for col, s in mixed_T.items():
assert s.dtype == np.object_
def test_swapaxes(self):
df = self.klass(np.random.randn(10, 5))
self._assert_frame_equal(df.T, df.swapaxes(0, 1))
self._assert_frame_equal(df.T, df.swapaxes(1, 0))
self._assert_frame_equal(df, df.swapaxes(0, 0))
msg = (
"No axis named 2 for object type"
r" <class 'pandas.core(.sparse)?.frame.(Sparse)?DataFrame'>"
)
with pytest.raises(ValueError, match=msg):
df.swapaxes(2, 5)
def test_axis_aliases(self, float_frame):
f = float_frame
# reg name
expected = f.sum(axis=0)
result = f.sum(axis="index")
assert_series_equal(result, expected)
expected = f.sum(axis=1)
result = f.sum(axis="columns")
assert_series_equal(result, expected)
def test_class_axis(self):
# GH 18147
# no exception and no empty docstring
assert pydoc.getdoc(DataFrame.index)
assert pydoc.getdoc(DataFrame.columns)
def test_more_values(self, float_string_frame):
values = float_string_frame.values
assert values.shape[1] == len(float_string_frame.columns)
def test_repr_with_mi_nat(self, float_string_frame):
df = self.klass(
{"X": [1, 2]}, index=[[pd.NaT, pd.Timestamp("20130101")], ["a", "b"]]
)
result = repr(df)
expected = " X\nNaT a 1\n2013-01-01 b 2"
assert result == expected
def test_items_names(self, float_string_frame):
for k, v in float_string_frame.items():
assert v.name == k
def test_series_put_names(self, float_string_frame):
series = float_string_frame._series
for k, v in series.items():
assert v.name == k
def test_empty_nonzero(self):
df = self.klass([1, 2, 3])
assert not df.empty
df = self.klass(index=[1], columns=[1])
assert not df.empty
df = self.klass(index=["a", "b"], columns=["c", "d"]).dropna()
assert df.empty
assert df.T.empty
empty_frames = [
self.klass(),
self.klass(index=[1]),
self.klass(columns=[1]),
self.klass({1: []}),
]
for df in empty_frames:
assert df.empty
assert df.T.empty
def test_with_datetimelikes(self):
df = self.klass(
{
"A": date_range("20130101", periods=10),
"B": timedelta_range("1 day", periods=10),
}
)
t = df.T
result = t.dtypes.value_counts()
if self.klass is DataFrame:
expected = Series({np.dtype("object"): 10})
else:
expected = Series({SparseDtype(dtype=object): 10})
tm.assert_series_equal(result, expected)
class TestDataFrameMisc(SharedWithSparse):
klass = DataFrame
# SharedWithSparse tests use generic, klass-agnostic assertion
_assert_frame_equal = staticmethod(assert_frame_equal)
_assert_series_equal = staticmethod(assert_series_equal)
def test_values(self, float_frame):
float_frame.values[:, 0] = 5.0
assert (float_frame.values[:, 0] == 5).all()
def test_as_matrix_deprecated(self, float_frame):
# GH 18458
with tm.assert_produces_warning(FutureWarning):
cols = float_frame.columns.tolist()
result = float_frame.as_matrix(columns=cols)
expected = float_frame.values
tm.assert_numpy_array_equal(result, expected)
def test_deepcopy(self, float_frame):
cp = deepcopy(float_frame)
series = cp["A"]
series[:] = 10
for idx, value in series.items():
assert float_frame["A"][idx] != value
def test_transpose_get_view(self, float_frame):
dft = float_frame.T
dft.values[:, 5:10] = 5
assert (float_frame.values[5:10] == 5).all()
def test_inplace_return_self(self):
# GH 1893
data = DataFrame(
{"a": ["foo", "bar", "baz", "qux"], "b": [0, 0, 1, 1], "c": [1, 2, 3, 4]}
)
def _check_f(base, f):
result = f(base)
assert result is None
# -----DataFrame-----
# set_index
f = lambda x: x.set_index("a", inplace=True)
_check_f(data.copy(), f)
# reset_index
f = lambda x: x.reset_index(inplace=True)
_check_f(data.set_index("a"), f)
# drop_duplicates
f = lambda x: x.drop_duplicates(inplace=True)
_check_f(data.copy(), f)
# sort
f = lambda x: x.sort_values("b", inplace=True)
_check_f(data.copy(), f)
# sort_index
f = lambda x: x.sort_index(inplace=True)
_check_f(data.copy(), f)
# fillna
f = lambda x: x.fillna(0, inplace=True)
_check_f(data.copy(), f)
# replace
f = lambda x: x.replace(1, 0, inplace=True)
_check_f(data.copy(), f)
# rename
f = lambda x: x.rename({1: "foo"}, inplace=True)
_check_f(data.copy(), f)
# -----Series-----
d = data.copy()["c"]
# reset_index
f = lambda x: x.reset_index(inplace=True, drop=True)
_check_f(data.set_index("a")["c"], f)
# fillna
f = lambda x: x.fillna(0, inplace=True)
_check_f(d.copy(), f)
# replace
f = lambda x: x.replace(1, 0, inplace=True)
_check_f(d.copy(), f)
# rename
f = lambda x: x.rename({1: "foo"}, inplace=True)
_check_f(d.copy(), f)
def test_tab_complete_warning(self, ip):
# GH 16409
pytest.importorskip("IPython", minversion="6.0.0")
from IPython.core.completer import provisionalcompleter
code = "import pandas as pd; df = pd.DataFrame()"
ip.run_code(code)
with tm.assert_produces_warning(None):
with provisionalcompleter("ignore"):
list(ip.Completer.completions("df.", 1))
def test_get_values_deprecated(self):
df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
with tm.assert_produces_warning(FutureWarning):
res = df.get_values()
tm.assert_numpy_array_equal(res, df.values)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,644 @@
from collections import deque
from datetime import datetime
import operator
import numpy as np
import pytest
import pandas as pd
from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int
import pandas.util.testing as tm
# -------------------------------------------------------------------
# Comparisons
class TestFrameComparisons:
# Specifically _not_ flex-comparisons
def test_comparison_invalid(self):
def check(df, df2):
for (x, y) in [(df, df2), (df2, df)]:
# we expect the result to match Series comparisons for
# == and !=, inequalities should raise
result = x == y
expected = pd.DataFrame(
{col: x[col] == y[col] for col in x.columns},
index=x.index,
columns=x.columns,
)
tm.assert_frame_equal(result, expected)
result = x != y
expected = pd.DataFrame(
{col: x[col] != y[col] for col in x.columns},
index=x.index,
columns=x.columns,
)
tm.assert_frame_equal(result, expected)
with pytest.raises(TypeError):
x >= y
with pytest.raises(TypeError):
x > y
with pytest.raises(TypeError):
x < y
with pytest.raises(TypeError):
x <= y
# GH4968
# invalid date/int comparisons
df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=["a"])
df["dates"] = pd.date_range("20010101", periods=len(df))
df2 = df.copy()
df2["dates"] = df["a"]
check(df, df2)
df = pd.DataFrame(np.random.randint(10, size=(10, 2)), columns=["a", "b"])
df2 = pd.DataFrame(
{
"a": pd.date_range("20010101", periods=len(df)),
"b": pd.date_range("20100101", periods=len(df)),
}
)
check(df, df2)
def test_timestamp_compare(self):
# make sure we can compare Timestamps on the right AND left hand side
# GH#4982
df = pd.DataFrame(
{
"dates1": pd.date_range("20010101", periods=10),
"dates2": pd.date_range("20010102", periods=10),
"intcol": np.random.randint(1000000000, size=10),
"floatcol": np.random.randn(10),
"stringcol": list(tm.rands(10)),
}
)
df.loc[np.random.rand(len(df)) > 0.5, "dates2"] = pd.NaT
ops = {"gt": "lt", "lt": "gt", "ge": "le", "le": "ge", "eq": "eq", "ne": "ne"}
for left, right in ops.items():
left_f = getattr(operator, left)
right_f = getattr(operator, right)
# no nats
if left in ["eq", "ne"]:
expected = left_f(df, pd.Timestamp("20010109"))
result = right_f(pd.Timestamp("20010109"), df)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(TypeError):
left_f(df, pd.Timestamp("20010109"))
with pytest.raises(TypeError):
right_f(pd.Timestamp("20010109"), df)
# nats
expected = left_f(df, pd.Timestamp("nat"))
result = right_f(pd.Timestamp("nat"), df)
tm.assert_frame_equal(result, expected)
def test_mixed_comparison(self):
# GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
# not raise TypeError
# (this appears to be fixed before GH#22163, not sure when)
df = pd.DataFrame([["1989-08-01", 1], ["1989-08-01", 2]])
other = pd.DataFrame([["a", "b"], ["c", "d"]])
result = df == other
assert not result.any().any()
result = df != other
assert result.all().all()
def test_df_boolean_comparison_error(self):
# GH#4576, GH#22880
# comparing DataFrame against list/tuple with len(obj) matching
# len(df.columns) is supported as of GH#22800
df = pd.DataFrame(np.arange(6).reshape((3, 2)))
expected = pd.DataFrame([[False, False], [True, False], [False, False]])
result = df == (2, 2)
tm.assert_frame_equal(result, expected)
result = df == [2, 2]
tm.assert_frame_equal(result, expected)
def test_df_float_none_comparison(self):
df = pd.DataFrame(
np.random.randn(8, 3), index=range(8), columns=["A", "B", "C"]
)
result = df.__eq__(None)
assert not result.any().any()
def test_df_string_comparison(self):
df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
mask_a = df.a > 1
tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])
mask_b = df.b == "foo"
tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])
class TestFrameFlexComparisons:
# TODO: test_bool_flex_frame needs a better name
def test_bool_flex_frame(self):
data = np.random.randn(5, 3)
other_data = np.random.randn(5, 3)
df = pd.DataFrame(data)
other = pd.DataFrame(other_data)
ndim_5 = np.ones(df.shape + (1, 3))
# Unaligned
def _check_unaligned_frame(meth, op, df, other):
part_o = other.loc[3:, 1:].copy()
rs = meth(part_o)
xp = op(df, part_o.reindex(index=df.index, columns=df.columns))
tm.assert_frame_equal(rs, xp)
# DataFrame
assert df.eq(df).values.all()
assert not df.ne(df).values.any()
for op in ["eq", "ne", "gt", "lt", "ge", "le"]:
f = getattr(df, op)
o = getattr(operator, op)
# No NAs
tm.assert_frame_equal(f(other), o(df, other))
_check_unaligned_frame(f, o, df, other)
# ndarray
tm.assert_frame_equal(f(other.values), o(df, other.values))
# scalar
tm.assert_frame_equal(f(0), o(df, 0))
# NAs
msg = "Unable to coerce to Series/DataFrame"
tm.assert_frame_equal(f(np.nan), o(df, np.nan))
with pytest.raises(ValueError, match=msg):
f(ndim_5)
# Series
def _test_seq(df, idx_ser, col_ser):
idx_eq = df.eq(idx_ser, axis=0)
col_eq = df.eq(col_ser)
idx_ne = df.ne(idx_ser, axis=0)
col_ne = df.ne(col_ser)
tm.assert_frame_equal(col_eq, df == pd.Series(col_ser))
tm.assert_frame_equal(col_eq, -col_ne)
tm.assert_frame_equal(idx_eq, -idx_ne)
tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T)
tm.assert_frame_equal(col_eq, df.eq(list(col_ser)))
tm.assert_frame_equal(idx_eq, df.eq(pd.Series(idx_ser), axis=0))
tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0))
idx_gt = df.gt(idx_ser, axis=0)
col_gt = df.gt(col_ser)
idx_le = df.le(idx_ser, axis=0)
col_le = df.le(col_ser)
tm.assert_frame_equal(col_gt, df > pd.Series(col_ser))
tm.assert_frame_equal(col_gt, -col_le)
tm.assert_frame_equal(idx_gt, -idx_le)
tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T)
idx_ge = df.ge(idx_ser, axis=0)
col_ge = df.ge(col_ser)
idx_lt = df.lt(idx_ser, axis=0)
col_lt = df.lt(col_ser)
tm.assert_frame_equal(col_ge, df >= pd.Series(col_ser))
tm.assert_frame_equal(col_ge, -col_lt)
tm.assert_frame_equal(idx_ge, -idx_lt)
tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T)
idx_ser = pd.Series(np.random.randn(5))
col_ser = pd.Series(np.random.randn(3))
_test_seq(df, idx_ser, col_ser)
# list/tuple
_test_seq(df, idx_ser.values, col_ser.values)
# NA
df.loc[0, 0] = np.nan
rs = df.eq(df)
assert not rs.loc[0, 0]
rs = df.ne(df)
assert rs.loc[0, 0]
rs = df.gt(df)
assert not rs.loc[0, 0]
rs = df.lt(df)
assert not rs.loc[0, 0]
rs = df.ge(df)
assert not rs.loc[0, 0]
rs = df.le(df)
assert not rs.loc[0, 0]
# complex
arr = np.array([np.nan, 1, 6, np.nan])
arr2 = np.array([2j, np.nan, 7, None])
df = pd.DataFrame({"a": arr})
df2 = pd.DataFrame({"a": arr2})
rs = df.gt(df2)
assert not rs.values.any()
rs = df.ne(df2)
assert rs.values.all()
arr3 = np.array([2j, np.nan, None])
df3 = pd.DataFrame({"a": arr3})
rs = df3.gt(2j)
assert not rs.values.any()
# corner, dtype=object
df1 = pd.DataFrame({"col": ["foo", np.nan, "bar"]})
df2 = pd.DataFrame({"col": ["foo", datetime.now(), "bar"]})
result = df1.ne(df2)
exp = pd.DataFrame({"col": [False, True, False]})
tm.assert_frame_equal(result, exp)
def test_flex_comparison_nat(self):
# GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT,
# and _definitely_ not be NaN
df = pd.DataFrame([pd.NaT])
result = df == pd.NaT
# result.iloc[0, 0] is a np.bool_ object
assert result.iloc[0, 0].item() is False
result = df.eq(pd.NaT)
assert result.iloc[0, 0].item() is False
result = df != pd.NaT
assert result.iloc[0, 0].item() is True
result = df.ne(pd.NaT)
assert result.iloc[0, 0].item() is True
@pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
def test_df_flex_cmp_constant_return_types(self, opname):
# GH 15077, non-empty DataFrame
df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
const = 2
result = getattr(df, opname)(const).dtypes.value_counts()
tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)]))
@pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
def test_df_flex_cmp_constant_return_types_empty(self, opname):
# GH 15077 empty DataFrame
df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
const = 2
empty = df.iloc[:0]
result = getattr(empty, opname)(const).dtypes.value_counts()
tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)]))
# -------------------------------------------------------------------
# Arithmetic
class TestFrameFlexArithmetic:
def test_df_add_td64_columnwise(self):
# GH 22534 Check that column-wise addition broadcasts correctly
dti = pd.date_range("2016-01-01", periods=10)
tdi = pd.timedelta_range("1", periods=10)
tser = pd.Series(tdi)
df = pd.DataFrame({0: dti, 1: tdi})
result = df.add(tser, axis=0)
expected = pd.DataFrame({0: dti + tdi, 1: tdi + tdi})
tm.assert_frame_equal(result, expected)
def test_df_add_flex_filled_mixed_dtypes(self):
# GH 19611
dti = pd.date_range("2016-01-01", periods=3)
ser = pd.Series(["1 Day", "NaT", "2 Days"], dtype="timedelta64[ns]")
df = pd.DataFrame({"A": dti, "B": ser})
other = pd.DataFrame({"A": ser, "B": ser})
fill = pd.Timedelta(days=1).to_timedelta64()
result = df.add(other, fill_value=fill)
expected = pd.DataFrame(
{
"A": pd.Series(
["2016-01-02", "2016-01-03", "2016-01-05"], dtype="datetime64[ns]"
),
"B": ser * 2,
}
)
tm.assert_frame_equal(result, expected)
def test_arith_flex_frame(
self, all_arithmetic_operators, float_frame, mixed_float_frame
):
# one instance of parametrized fixture
op = all_arithmetic_operators
def f(x, y):
# r-versions not in operator-stdlib; get op without "r" and invert
if op.startswith("__r"):
return getattr(operator, op.replace("__r", "__"))(y, x)
return getattr(operator, op)(x, y)
result = getattr(float_frame, op)(2 * float_frame)
expected = f(float_frame, 2 * float_frame)
tm.assert_frame_equal(result, expected)
# vs mix float
result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
expected = f(mixed_float_frame, 2 * mixed_float_frame)
tm.assert_frame_equal(result, expected)
_check_mixed_float(result, dtype=dict(C=None))
@pytest.mark.parametrize("op", ["__add__", "__sub__", "__mul__"])
def test_arith_flex_frame_mixed(
self, op, int_frame, mixed_int_frame, mixed_float_frame
):
f = getattr(operator, op)
# vs mix int
result = getattr(mixed_int_frame, op)(2 + mixed_int_frame)
expected = f(mixed_int_frame, 2 + mixed_int_frame)
# no overflow in the uint
dtype = None
if op in ["__sub__"]:
dtype = dict(B="uint64", C=None)
elif op in ["__add__", "__mul__"]:
dtype = dict(C=None)
tm.assert_frame_equal(result, expected)
_check_mixed_int(result, dtype=dtype)
# vs mix float
result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
expected = f(mixed_float_frame, 2 * mixed_float_frame)
tm.assert_frame_equal(result, expected)
_check_mixed_float(result, dtype=dict(C=None))
# vs plain int
result = getattr(int_frame, op)(2 * int_frame)
expected = f(int_frame, 2 * int_frame)
tm.assert_frame_equal(result, expected)
def test_arith_flex_frame_raise(self, all_arithmetic_operators, float_frame):
# one instance of parametrized fixture
op = all_arithmetic_operators
# Check that arrays with dim >= 3 raise
for dim in range(3, 6):
arr = np.ones((1,) * dim)
msg = "Unable to coerce to Series/DataFrame"
with pytest.raises(ValueError, match=msg):
getattr(float_frame, op)(arr)
def test_arith_flex_frame_corner(self, float_frame):
const_add = float_frame.add(1)
tm.assert_frame_equal(const_add, float_frame + 1)
# corner cases
result = float_frame.add(float_frame[:0])
tm.assert_frame_equal(result, float_frame * np.nan)
result = float_frame[:0].add(float_frame)
tm.assert_frame_equal(result, float_frame * np.nan)
with pytest.raises(NotImplementedError, match="fill_value"):
float_frame.add(float_frame.iloc[0], fill_value=3)
with pytest.raises(NotImplementedError, match="fill_value"):
float_frame.add(float_frame.iloc[0], axis="index", fill_value=3)
def test_arith_flex_series(self, simple_frame):
df = simple_frame
row = df.xs("a")
col = df["two"]
# after arithmetic refactor, add truediv here
ops = ["add", "sub", "mul", "mod"]
for op in ops:
f = getattr(df, op)
op = getattr(operator, op)
tm.assert_frame_equal(f(row), op(df, row))
tm.assert_frame_equal(f(col, axis=0), op(df.T, col).T)
# special case for some reason
tm.assert_frame_equal(df.add(row, axis=None), df + row)
# cases which will be refactored after big arithmetic refactor
tm.assert_frame_equal(df.div(row), df / row)
tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T)
# broadcasting issue in GH 7325
df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="int64")
expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
result = df.div(df[0], axis="index")
tm.assert_frame_equal(result, expected)
df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="float64")
expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
result = df.div(df[0], axis="index")
tm.assert_frame_equal(result, expected)
def test_arith_flex_zero_len_raises(self):
# GH 19522 passing fill_value to frame flex arith methods should
# raise even in the zero-length special cases
ser_len0 = pd.Series([])
df_len0 = pd.DataFrame(columns=["A", "B"])
df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
with pytest.raises(NotImplementedError, match="fill_value"):
df.add(ser_len0, fill_value="E")
with pytest.raises(NotImplementedError, match="fill_value"):
df_len0.sub(df["A"], axis=None, fill_value=3)
class TestFrameArithmetic:
def test_df_add_2d_array_rowlike_broadcasts(self):
# GH#23000
arr = np.arange(6).reshape(3, 2)
df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
rowlike = arr[[1], :] # shape --> (1, ncols)
assert rowlike.shape == (1, df.shape[1])
expected = pd.DataFrame(
[[2, 4], [4, 6], [6, 8]],
columns=df.columns,
index=df.index,
# specify dtype explicitly to avoid failing
# on 32bit builds
dtype=arr.dtype,
)
result = df + rowlike
tm.assert_frame_equal(result, expected)
result = rowlike + df
tm.assert_frame_equal(result, expected)
def test_df_add_2d_array_collike_broadcasts(self):
# GH#23000
arr = np.arange(6).reshape(3, 2)
df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
collike = arr[:, [1]] # shape --> (nrows, 1)
assert collike.shape == (df.shape[0], 1)
expected = pd.DataFrame(
[[1, 2], [5, 6], [9, 10]],
columns=df.columns,
index=df.index,
# specify dtype explicitly to avoid failing
# on 32bit builds
dtype=arr.dtype,
)
result = df + collike
tm.assert_frame_equal(result, expected)
result = collike + df
tm.assert_frame_equal(result, expected)
def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators):
# GH#23000
opname = all_arithmetic_operators
arr = np.arange(6).reshape(3, 2)
df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
rowlike = arr[[1], :] # shape --> (1, ncols)
assert rowlike.shape == (1, df.shape[1])
exvals = [
getattr(df.loc["A"], opname)(rowlike.squeeze()),
getattr(df.loc["B"], opname)(rowlike.squeeze()),
getattr(df.loc["C"], opname)(rowlike.squeeze()),
]
expected = pd.DataFrame(exvals, columns=df.columns, index=df.index)
if opname in ["__rmod__", "__rfloordiv__"]:
# exvals will have dtypes [f8, i8, i8] so expected will be
# all-f8, but the DataFrame operation will return mixed dtypes
# use exvals[-1].dtype instead of "i8" for compat with 32-bit
# systems/pythons
expected[False] = expected[False].astype(exvals[-1].dtype)
result = getattr(df, opname)(rowlike)
tm.assert_frame_equal(result, expected)
def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators):
# GH#23000
opname = all_arithmetic_operators
arr = np.arange(6).reshape(3, 2)
df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
collike = arr[:, [1]] # shape --> (nrows, 1)
assert collike.shape == (df.shape[0], 1)
exvals = {
True: getattr(df[True], opname)(collike.squeeze()),
False: getattr(df[False], opname)(collike.squeeze()),
}
dtype = None
if opname in ["__rmod__", "__rfloordiv__"]:
# Series ops may return mixed int/float dtypes in cases where
# DataFrame op will return all-float. So we upcast `expected`
dtype = np.common_type(*[x.values for x in exvals.values()])
expected = pd.DataFrame(exvals, columns=df.columns, index=df.index, dtype=dtype)
result = getattr(df, opname)(collike)
tm.assert_frame_equal(result, expected)
def test_df_bool_mul_int(self):
# GH 22047, GH 22163 multiplication by 1 should result in int dtype,
# not object dtype
df = pd.DataFrame([[False, True], [False, False]])
result = df * 1
# On appveyor this comes back as np.int32 instead of np.int64,
# so we check dtype.kind instead of just dtype
kinds = result.dtypes.apply(lambda x: x.kind)
assert (kinds == "i").all()
result = 1 * df
kinds = result.dtypes.apply(lambda x: x.kind)
assert (kinds == "i").all()
def test_arith_mixed(self):
left = pd.DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]})
result = left + left
expected = pd.DataFrame({"A": ["aa", "bb", "cc"], "B": [2, 4, 6]})
tm.assert_frame_equal(result, expected)
def test_arith_getitem_commute(self):
df = pd.DataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]})
def _test_op(df, op):
result = op(df, 1)
if not df.columns.is_unique:
raise ValueError("Only unique columns supported by this test")
for col in result.columns:
tm.assert_series_equal(result[col], op(df[col], 1))
_test_op(df, operator.add)
_test_op(df, operator.sub)
_test_op(df, operator.mul)
_test_op(df, operator.truediv)
_test_op(df, operator.floordiv)
_test_op(df, operator.pow)
_test_op(df, lambda x, y: y + x)
_test_op(df, lambda x, y: y - x)
_test_op(df, lambda x, y: y * x)
_test_op(df, lambda x, y: y / x)
_test_op(df, lambda x, y: y ** x)
_test_op(df, lambda x, y: x + y)
_test_op(df, lambda x, y: x - y)
_test_op(df, lambda x, y: x * y)
_test_op(df, lambda x, y: x / y)
_test_op(df, lambda x, y: x ** y)
@pytest.mark.parametrize(
"values", [[1, 2], (1, 2), np.array([1, 2]), range(1, 3), deque([1, 2])]
)
def test_arith_alignment_non_pandas_object(self, values):
# GH#17901
df = pd.DataFrame({"A": [1, 1], "B": [1, 1]})
expected = pd.DataFrame({"A": [2, 2], "B": [3, 3]})
result = df + values
tm.assert_frame_equal(result, expected)
def test_arith_non_pandas_object(self):
df = pd.DataFrame(
np.arange(1, 10, dtype="f8").reshape(3, 3),
columns=["one", "two", "three"],
index=["a", "b", "c"],
)
val1 = df.xs("a").values
added = pd.DataFrame(df.values + val1, index=df.index, columns=df.columns)
tm.assert_frame_equal(df + val1, added)
added = pd.DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns)
tm.assert_frame_equal(df.add(val1, axis=0), added)
val2 = list(df["two"])
added = pd.DataFrame(df.values + val2, index=df.index, columns=df.columns)
tm.assert_frame_equal(df + val2, added)
added = pd.DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns)
tm.assert_frame_equal(df.add(val2, axis="index"), added)
val3 = np.random.rand(*df.shape)
added = pd.DataFrame(df.values + val3, index=df.index, columns=df.columns)
tm.assert_frame_equal(df.add(val3), added)

View File

@@ -0,0 +1,134 @@
import numpy as np
import pytest
from pandas import DataFrame, Series, Timestamp, date_range, to_datetime
import pandas.util.testing as tm
@pytest.fixture
def date_range_frame():
"""
Fixture for DataFrame of ints with date_range index
Columns are ['A', 'B'].
"""
N = 50
rng = date_range("1/1/1990", periods=N, freq="53s")
return DataFrame({"A": np.arange(N), "B": np.arange(N)}, index=rng)
class TestFrameAsof:
def test_basic(self, date_range_frame):
df = date_range_frame
N = 50
df.loc[15:30, "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
result = df.asof(dates)
assert result.notna().all(1).all()
lb = df.index[14]
ub = df.index[30]
dates = list(dates)
result = df.asof(dates)
assert result.notna().all(1).all()
mask = (result.index >= lb) & (result.index < ub)
rs = result[mask]
assert (rs == 14).all(1).all()
def test_subset(self, date_range_frame):
N = 10
df = date_range_frame.iloc[:N].copy()
df.loc[4:8, "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")
# with a subset of A should be the same
result = df.asof(dates, subset="A")
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# same with A/B
result = df.asof(dates, subset=["A", "B"])
expected = df.asof(dates)
tm.assert_frame_equal(result, expected)
# B gives df.asof
result = df.asof(dates, subset="B")
expected = df.resample("25s", closed="right").ffill().reindex(dates)
expected.iloc[20:] = 9
tm.assert_frame_equal(result, expected)
def test_missing(self, date_range_frame):
# GH 15118
# no match found - `where` value before earliest date in index
N = 10
df = date_range_frame.iloc[:N].copy()
result = df.asof("1989-12-31")
expected = Series(index=["A", "B"], name=Timestamp("1989-12-31"))
tm.assert_series_equal(result, expected)
result = df.asof(to_datetime(["1989-12-31"]))
expected = DataFrame(
index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64"
)
tm.assert_frame_equal(result, expected)
def test_all_nans(self, date_range_frame):
# GH 15713
# DataFrame is all nans
result = DataFrame([np.nan]).asof([0])
expected = DataFrame([np.nan])
tm.assert_frame_equal(result, expected)
# testing non-default indexes, multiple inputs
N = 150
rng = date_range_frame.index
dates = date_range("1/1/1990", periods=N, freq="25s")
result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=["A"])
tm.assert_frame_equal(result, expected)
# testing multiple columns
dates = date_range("1/1/1990", periods=N, freq="25s")
result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates)
expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
# testing scalar input
result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3])
expected = DataFrame(np.nan, index=[3], columns=["A", "B"])
tm.assert_frame_equal(result, expected)
result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3)
expected = Series(np.nan, index=["A", "B"], name=3)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"stamp,expected",
[
(
Timestamp("2018-01-01 23:22:43.325+00:00"),
Series(2.0, name=Timestamp("2018-01-01 23:22:43.325+00:00")),
),
(
Timestamp("2018-01-01 22:33:20.682+01:00"),
Series(1.0, name=Timestamp("2018-01-01 22:33:20.682+01:00")),
),
],
)
def test_time_zone_aware_index(self, stamp, expected):
# GH21194
# Testing awareness of DataFrame index considering different
# UTC and timezone
df = DataFrame(
data=[1, 2],
index=[
Timestamp("2018-01-01 21:00:05.001+00:00"),
Timestamp("2018-01-01 22:35:10.550+00:00"),
],
)
result = df.asof(stamp)
tm.assert_series_equal(result, expected)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,639 @@
from datetime import datetime, timedelta
from io import StringIO
import itertools
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
Timestamp,
compat,
date_range,
option_context,
)
from pandas.core.arrays import IntervalArray, integer_array
from pandas.core.internals import ObjectBlock
from pandas.core.internals.blocks import IntBlock
import pandas.util.testing as tm
from pandas.util.testing import (
assert_almost_equal,
assert_frame_equal,
assert_series_equal,
)
# Segregated collection of methods that require the BlockManager internal data
# structure
class TestDataFrameBlockInternals:
def test_setitem_invalidates_datetime_index_freq(self):
# GH#24096 altering a datetime64tz column inplace invalidates the
# `freq` attribute on the underlying DatetimeIndex
dti = date_range("20130101", periods=3, tz="US/Eastern")
ts = dti[1]
df = DataFrame({"B": dti})
assert df["B"]._values.freq == "D"
df.iloc[1, 0] = pd.NaT
assert df["B"]._values.freq is None
# check that the DatetimeIndex was not altered in place
assert dti.freq == "D"
assert dti[1] == ts
def test_cast_internals(self, float_frame):
casted = DataFrame(float_frame._data, dtype=int)
expected = DataFrame(float_frame._series, dtype=int)
assert_frame_equal(casted, expected)
casted = DataFrame(float_frame._data, dtype=np.int32)
expected = DataFrame(float_frame._series, dtype=np.int32)
assert_frame_equal(casted, expected)
def test_consolidate(self, float_frame):
float_frame["E"] = 7.0
consolidated = float_frame._consolidate()
assert len(consolidated._data.blocks) == 1
# Ensure copy, do I want this?
recons = consolidated._consolidate()
assert recons is not consolidated
tm.assert_frame_equal(recons, consolidated)
float_frame["F"] = 8.0
assert len(float_frame._data.blocks) == 3
float_frame._consolidate(inplace=True)
assert len(float_frame._data.blocks) == 1
def test_consolidate_inplace(self, float_frame):
frame = float_frame.copy() # noqa
# triggers in-place consolidation
for letter in range(ord("A"), ord("Z")):
float_frame[chr(letter)] = chr(letter)
def test_values_consolidate(self, float_frame):
float_frame["E"] = 7.0
assert not float_frame._data.is_consolidated()
_ = float_frame.values # noqa
assert float_frame._data.is_consolidated()
def test_modify_values(self, float_frame):
float_frame.values[5] = 5
assert (float_frame.values[5] == 5).all()
# unconsolidated
float_frame["E"] = 7.0
float_frame.values[6] = 6
assert (float_frame.values[6] == 6).all()
def test_boolean_set_uncons(self, float_frame):
float_frame["E"] = 7.0
expected = float_frame.values.copy()
expected[expected > 1] = 2
float_frame[float_frame > 1] = 2
assert_almost_equal(expected, float_frame.values)
def test_values_numeric_cols(self, float_frame):
float_frame["foo"] = "bar"
values = float_frame[["A", "B", "C", "D"]].values
assert values.dtype == np.float64
def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
# mixed lcd
values = mixed_float_frame[["A", "B", "C", "D"]].values
assert values.dtype == np.float64
values = mixed_float_frame[["A", "B", "C"]].values
assert values.dtype == np.float32
values = mixed_float_frame[["C"]].values
assert values.dtype == np.float16
# GH 10364
# B uint64 forces float because there are other signed int types
values = mixed_int_frame[["A", "B", "C", "D"]].values
assert values.dtype == np.float64
values = mixed_int_frame[["A", "D"]].values
assert values.dtype == np.int64
# B uint64 forces float because there are other signed int types
values = mixed_int_frame[["A", "B", "C"]].values
assert values.dtype == np.float64
# as B and C are both unsigned, no forcing to float is needed
values = mixed_int_frame[["B", "C"]].values
assert values.dtype == np.uint64
values = mixed_int_frame[["A", "C"]].values
assert values.dtype == np.int32
values = mixed_int_frame[["C", "D"]].values
assert values.dtype == np.int64
values = mixed_int_frame[["A"]].values
assert values.dtype == np.int32
values = mixed_int_frame[["C"]].values
assert values.dtype == np.uint8
def test_constructor_with_convert(self):
# this is actually mostly a test of lib.maybe_convert_objects
# #2845
df = DataFrame({"A": [2 ** 63 - 1]})
result = df["A"]
expected = Series(np.asarray([2 ** 63 - 1], np.int64), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [2 ** 63]})
result = df["A"]
expected = Series(np.asarray([2 ** 63], np.uint64), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [datetime(2005, 1, 1), True]})
result = df["A"]
expected = Series(
np.asarray([datetime(2005, 1, 1), True], np.object_), name="A"
)
assert_series_equal(result, expected)
df = DataFrame({"A": [None, 1]})
result = df["A"]
expected = Series(np.asarray([np.nan, 1], np.float_), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [1.0, 2]})
result = df["A"]
expected = Series(np.asarray([1.0, 2], np.float_), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [1.0 + 2.0j, 3]})
result = df["A"]
expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [1.0 + 2.0j, 3.0]})
result = df["A"]
expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [1.0 + 2.0j, True]})
result = df["A"]
expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [1.0, None]})
result = df["A"]
expected = Series(np.asarray([1.0, np.nan], np.float_), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [1.0 + 2.0j, None]})
result = df["A"]
expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex_), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [2.0, 1, True, None]})
result = df["A"]
expected = Series(np.asarray([2.0, 1, True, None], np.object_), name="A")
assert_series_equal(result, expected)
df = DataFrame({"A": [2.0, 1, datetime(2006, 1, 1), None]})
result = df["A"]
expected = Series(
np.asarray([2.0, 1, datetime(2006, 1, 1), None], np.object_), name="A"
)
assert_series_equal(result, expected)
def test_construction_with_mixed(self, float_string_frame):
# test construction edge cases with mixed types
# f7u12, this does not work without extensive workaround
data = [
[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
[datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
]
df = DataFrame(data)
# check dtypes
result = df.dtypes
expected = Series({"datetime64[ns]": 3})
# mixed-type frames
float_string_frame["datetime"] = datetime.now()
float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
assert float_string_frame["datetime"].dtype == "M8[ns]"
assert float_string_frame["timedelta"].dtype == "m8[ns]"
result = float_string_frame.dtypes
expected = Series(
[np.dtype("float64")] * 4
+ [
np.dtype("object"),
np.dtype("datetime64[ns]"),
np.dtype("timedelta64[ns]"),
],
index=list("ABCD") + ["foo", "datetime", "timedelta"],
)
assert_series_equal(result, expected)
def test_construction_with_conversions(self):
# convert from a numpy array of non-ns timedelta64
arr = np.array([1, 2, 3], dtype="timedelta64[s]")
df = DataFrame(index=range(3))
df["A"] = arr
expected = DataFrame(
{"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3)
)
assert_frame_equal(df, expected)
expected = DataFrame(
{
"dt1": Timestamp("20130101"),
"dt2": date_range("20130101", periods=3),
# 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
},
index=range(3),
)
df = DataFrame(index=range(3))
df["dt1"] = np.datetime64("2013-01-01")
df["dt2"] = np.array(
["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]"
)
# df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
# 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
assert_frame_equal(df, expected)
def test_constructor_compound_dtypes(self):
# GH 5191
# compound dtypes should raise not-implementederror
def f(dtype):
data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9))
return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype)
msg = "compound dtypes are not implemented in the DataFrame constructor"
with pytest.raises(NotImplementedError, match=msg):
f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])
# these work (though results may be unexpected)
f("int64")
f("float64")
# 10822
# invalid error message on dt inference
if not compat.is_platform_windows():
f("M8[ns]")
def test_equals_different_blocks(self):
# GH 9330
df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
df1 = df0.reset_index()[["A", "B", "C"]]
# this assert verifies that the above operations have
# induced a block rearrangement
assert df0._data.blocks[0].dtype != df1._data.blocks[0].dtype
# do the real tests
assert_frame_equal(df0, df1)
assert df0.equals(df1)
assert df1.equals(df0)
def test_copy_blocks(self, float_frame):
# API/ENH 9607
df = DataFrame(float_frame, copy=True)
column = df.columns[0]
# use the default copy=True, change a column
# deprecated 0.21.0
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
blocks = df.as_blocks()
for dtype, _df in blocks.items():
if column in _df:
_df.loc[:, column] = _df[column] + 1
# make sure we did not change the original DataFrame
assert not _df[column].equals(df[column])
def test_no_copy_blocks(self, float_frame):
# API/ENH 9607
df = DataFrame(float_frame, copy=True)
column = df.columns[0]
# use the copy=False, change a column
# deprecated 0.21.0
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
blocks = df.as_blocks(copy=False)
for dtype, _df in blocks.items():
if column in _df:
_df.loc[:, column] = _df[column] + 1
# make sure we did change the original DataFrame
assert _df[column].equals(df[column])
def test_copy(self, float_frame, float_string_frame):
cop = float_frame.copy()
cop["E"] = cop["A"]
assert "E" not in float_frame
# copy objects
copy = float_string_frame.copy()
assert copy._data is not float_string_frame._data
def test_pickle(self, float_string_frame, timezone_frame):
empty_frame = DataFrame()
unpickled = tm.round_trip_pickle(float_string_frame)
assert_frame_equal(float_string_frame, unpickled)
# buglet
float_string_frame._data.ndim
# empty
unpickled = tm.round_trip_pickle(empty_frame)
repr(unpickled)
# tz frame
unpickled = tm.round_trip_pickle(timezone_frame)
assert_frame_equal(timezone_frame, unpickled)
def test_consolidate_datetime64(self):
# numpy vstack bug
data = """\
starting,ending,measure
2012-06-21 00:00,2012-06-23 07:00,77
2012-06-23 07:00,2012-06-23 16:30,65
2012-06-23 16:30,2012-06-25 08:00,77
2012-06-25 08:00,2012-06-26 12:00,0
2012-06-26 12:00,2012-06-27 08:00,77
"""
df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
ser_starting = df.starting
ser_starting.index = ser_starting.values
ser_starting = ser_starting.tz_localize("US/Eastern")
ser_starting = ser_starting.tz_convert("UTC")
ser_starting.index.name = "starting"
ser_ending = df.ending
ser_ending.index = ser_ending.values
ser_ending = ser_ending.tz_localize("US/Eastern")
ser_ending = ser_ending.tz_convert("UTC")
ser_ending.index.name = "ending"
df.starting = ser_starting.index
df.ending = ser_ending.index
tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index)
tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
def test_is_mixed_type(self, float_frame, float_string_frame):
assert not float_frame._is_mixed_type
assert float_string_frame._is_mixed_type
def test_get_numeric_data(self):
# TODO(wesm): unused?
intname = np.dtype(np.int_).name # noqa
floatname = np.dtype(np.float_).name # noqa
datetime64name = np.dtype("M8[ns]").name
objectname = np.dtype(np.object_).name
df = DataFrame(
{"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")},
index=np.arange(10),
)
result = df.dtypes
expected = Series(
[
np.dtype("float64"),
np.dtype("int64"),
np.dtype(objectname),
np.dtype(datetime64name),
],
index=["a", "b", "c", "f"],
)
assert_series_equal(result, expected)
df = DataFrame(
{
"a": 1.0,
"b": 2,
"c": "foo",
"d": np.array([1.0] * 10, dtype="float32"),
"e": np.array([1] * 10, dtype="int32"),
"f": np.array([1] * 10, dtype="int16"),
"g": Timestamp("20010102"),
},
index=np.arange(10),
)
result = df._get_numeric_data()
expected = df.loc[:, ["a", "b", "d", "e", "f"]]
assert_frame_equal(result, expected)
only_obj = df.loc[:, ["c", "g"]]
result = only_obj._get_numeric_data()
expected = df.loc[:, []]
assert_frame_equal(result, expected)
df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]})
result = df._get_numeric_data()
expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]})
assert_frame_equal(result, expected)
df = result.copy()
result = df._get_numeric_data()
expected = df
assert_frame_equal(result, expected)
def test_get_numeric_data_extension_dtype(self):
# GH 22290
df = DataFrame(
{
"A": integer_array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"),
"B": Categorical(list("abcabc")),
"C": integer_array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"),
"D": IntervalArray.from_breaks(range(7)),
}
)
result = df._get_numeric_data()
expected = df.loc[:, ["A", "C"]]
assert_frame_equal(result, expected)
def test_convert_objects(self, float_string_frame):
oops = float_string_frame.T.T
converted = oops._convert(datetime=True)
assert_frame_equal(converted, float_string_frame)
assert converted["A"].dtype == np.float64
# force numeric conversion
float_string_frame["H"] = "1."
float_string_frame["I"] = "1"
# add in some items that will be nan
length = len(float_string_frame)
float_string_frame["J"] = "1."
float_string_frame["K"] = "1"
float_string_frame.loc[0:5, ["J", "K"]] = "garbled"
converted = float_string_frame._convert(datetime=True, numeric=True)
assert converted["H"].dtype == "float64"
assert converted["I"].dtype == "int64"
assert converted["J"].dtype == "float64"
assert converted["K"].dtype == "float64"
assert len(converted["J"].dropna()) == length - 5
assert len(converted["K"].dropna()) == length - 5
# via astype
converted = float_string_frame.copy()
converted["H"] = converted["H"].astype("float64")
converted["I"] = converted["I"].astype("int64")
assert converted["H"].dtype == "float64"
assert converted["I"].dtype == "int64"
# via astype, but errors
converted = float_string_frame.copy()
with pytest.raises(ValueError, match="invalid literal"):
converted["H"].astype("int32")
# mixed in a single column
df = DataFrame(dict(s=Series([1, "na", 3, 4])))
result = df._convert(datetime=True, numeric=True)
expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
assert_frame_equal(result, expected)
def test_convert_objects_no_conversion(self):
mixed1 = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]})
mixed2 = mixed1._convert(datetime=True)
assert_frame_equal(mixed1, mixed2)
def test_infer_objects(self):
# GH 11221
df = DataFrame(
{
"a": ["a", 1, 2, 3],
"b": ["b", 2.0, 3.0, 4.1],
"c": [
"c",
datetime(2016, 1, 1),
datetime(2016, 1, 2),
datetime(2016, 1, 3),
],
"d": [1, 2, 3, "d"],
},
columns=["a", "b", "c", "d"],
)
df = df.iloc[1:].infer_objects()
assert df["a"].dtype == "int64"
assert df["b"].dtype == "float64"
assert df["c"].dtype == "M8[ns]"
assert df["d"].dtype == "object"
expected = DataFrame(
{
"a": [1, 2, 3],
"b": [2.0, 3.0, 4.1],
"c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)],
"d": [2, 3, "d"],
},
columns=["a", "b", "c", "d"],
)
# reconstruct frame to verify inference is same
tm.assert_frame_equal(df.reset_index(drop=True), expected)
def test_stale_cached_series_bug_473(self):
# this is chained, but ok
with option_context("chained_assignment", None):
Y = DataFrame(
np.random.random((4, 4)),
index=("a", "b", "c", "d"),
columns=("e", "f", "g", "h"),
)
repr(Y)
Y["e"] = Y["e"].astype("object")
Y["g"]["c"] = np.NaN
repr(Y)
result = Y.sum() # noqa
exp = Y["g"].sum() # noqa
assert pd.isna(Y["g"]["c"])
def test_get_X_columns(self):
# numeric and object columns
df = DataFrame(
{
"a": [1, 2, 3],
"b": [True, False, True],
"c": ["foo", "bar", "baz"],
"d": [None, None, None],
"e": [3.14, 0.577, 2.773],
}
)
tm.assert_index_equal(df._get_numeric_data().columns, pd.Index(["a", "b", "e"]))
def test_strange_column_corruption_issue(self):
# (wesm) Unclear how exactly this is related to internal matters
df = DataFrame(index=[0, 1])
df[0] = np.nan
wasCol = {}
# uncommenting these makes the results match
# for col in xrange(100, 200):
# wasCol[col] = 1
# df[col] = np.nan
for i, dt in enumerate(df.index):
for col in range(100, 200):
if col not in wasCol:
wasCol[col] = 1
df[col] = np.nan
df[col][dt] = i
myid = 100
first = len(df.loc[pd.isna(df[myid]), [myid]])
second = len(df.loc[pd.isna(df[myid]), [myid]])
assert first == second == 0
def test_constructor_no_pandas_array(self):
# Ensure that PandasArray isn't allowed inside Series
# See https://github.com/pandas-dev/pandas/issues/23995 for more.
arr = pd.Series([1, 2, 3]).array
result = pd.DataFrame({"A": arr})
expected = pd.DataFrame({"A": [1, 2, 3]})
tm.assert_frame_equal(result, expected)
assert isinstance(result._data.blocks[0], IntBlock)
def test_add_column_with_pandas_array(self):
# GH 26390
df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
df["c"] = pd.array([1, 2, None, 3])
df2 = pd.DataFrame(
{
"a": [1, 2, 3, 4],
"b": ["a", "b", "c", "d"],
"c": pd.array([1, 2, None, 3]),
}
)
assert type(df["c"]._data.blocks[0]) == ObjectBlock
assert type(df2["c"]._data.blocks[0]) == ObjectBlock
assert_frame_equal(df, df2)

View File

@@ -0,0 +1,952 @@
from datetime import datetime
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, Series, Timestamp, date_range
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameConcatCommon:
def test_concat_multiple_frames_dtypes(self):
# GH 2759
A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64)
B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
results = pd.concat((A, B), axis=1).dtypes
expected = Series(
[np.dtype("float64")] * 2 + [np.dtype("float32")] * 2,
index=["foo", "bar", 0, 1],
)
assert_series_equal(results, expected)
@pytest.mark.parametrize(
"data",
[
pd.date_range("2000", periods=4),
pd.date_range("2000", periods=4, tz="US/Central"),
pd.period_range("2000", periods=4),
pd.timedelta_range(0, periods=4),
],
)
def test_combine_datetlike_udf(self, data):
# https://github.com/pandas-dev/pandas/issues/23079
df = pd.DataFrame({"A": data})
other = df.copy()
df.iloc[1, 0] = None
def combiner(a, b):
return b
result = df.combine(other, combiner)
tm.assert_frame_equal(result, other)
def test_concat_multiple_tzs(self):
# GH 12467
# combining datetime tz-aware and naive DataFrames
ts1 = Timestamp("2015-01-01", tz=None)
ts2 = Timestamp("2015-01-01", tz="UTC")
ts3 = Timestamp("2015-01-01", tz="EST")
df1 = DataFrame(dict(time=[ts1]))
df2 = DataFrame(dict(time=[ts2]))
df3 = DataFrame(dict(time=[ts3]))
results = pd.concat([df1, df2]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
assert_frame_equal(results, expected)
results = pd.concat([df1, df3]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
assert_frame_equal(results, expected)
results = pd.concat([df2, df3]).reset_index(drop=True)
expected = DataFrame(dict(time=[ts2, ts3]))
assert_frame_equal(results, expected)
@pytest.mark.parametrize(
"t1",
[
"2015-01-01",
pytest.param(
pd.NaT,
marks=pytest.mark.xfail(
reason="GH23037 incorrect dtype when concatenating"
),
),
],
)
def test_concat_tz_NaT(self, t1):
# GH 22796
# Concating tz-aware multicolumn DataFrames
ts1 = Timestamp(t1, tz="UTC")
ts2 = Timestamp("2015-01-01", tz="UTC")
ts3 = Timestamp("2015-01-01", tz="UTC")
df1 = DataFrame([[ts1, ts2]])
df2 = DataFrame([[ts3]])
result = pd.concat([df1, df2])
expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
assert_frame_equal(result, expected)
def test_concat_tz_not_aligned(self):
# GH 22796
ts = pd.to_datetime([1, 2]).tz_localize("UTC")
a = pd.DataFrame({"A": ts})
b = pd.DataFrame({"A": ts, "B": ts})
result = pd.concat([a, b], sort=True, ignore_index=True)
expected = pd.DataFrame(
{"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)}
)
assert_frame_equal(result, expected)
def test_concat_tuple_keys(self):
# GH 14438
df1 = pd.DataFrame(np.ones((2, 2)), columns=list("AB"))
df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))
results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
expected = pd.DataFrame(
{
"A": {
("bee", "bah", 0): 1.0,
("bee", "bah", 1): 1.0,
("bee", "boo", 0): 2.0,
("bee", "boo", 1): 2.0,
("bee", "boo", 2): 2.0,
},
"B": {
("bee", "bah", 0): 1.0,
("bee", "bah", 1): 1.0,
("bee", "boo", 0): 2.0,
("bee", "boo", 1): 2.0,
("bee", "boo", 2): 2.0,
},
}
)
assert_frame_equal(results, expected)
def test_append_series_dict(self):
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])
series = df.loc[4]
msg = "Indexes have overlapping values"
with pytest.raises(ValueError, match=msg):
df.append(series, verify_integrity=True)
series.name = None
msg = "Can only append a Series if ignore_index=True"
with pytest.raises(TypeError, match=msg):
df.append(series, verify_integrity=True)
result = df.append(series[::-1], ignore_index=True)
expected = df.append(
DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True
)
assert_frame_equal(result, expected)
# dict
result = df.append(series.to_dict(), ignore_index=True)
assert_frame_equal(result, expected)
result = df.append(series[::-1][:3], ignore_index=True)
expected = df.append(
DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True
)
assert_frame_equal(result, expected.loc[:, result.columns])
# can append when name set
row = df.loc[4]
row.name = 5
result = df.append(row)
expected = df.append(df[-1:], ignore_index=True)
assert_frame_equal(result, expected)
def test_append_list_of_series_dicts(self):
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])
dicts = [x.to_dict() for idx, x in df.iterrows()]
result = df.append(dicts, ignore_index=True)
expected = df.append(df, ignore_index=True)
assert_frame_equal(result, expected)
# different columns
dicts = [
{"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4},
{"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8},
]
result = df.append(dicts, ignore_index=True, sort=True)
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
assert_frame_equal(result, expected)
def test_append_missing_cols(self):
# GH22252
# exercise the conditional branch in append method where the data
# to be appended is a list and does not contain all columns that are in
# the target DataFrame
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])
dicts = [{"foo": 9}, {"bar": 10}]
with tm.assert_produces_warning(None):
result = df.append(dicts, ignore_index=True, sort=True)
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
assert_frame_equal(result, expected)
def test_append_empty_dataframe(self):
# Empty df append empty df
df1 = DataFrame()
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Non-empty df append empty df
df1 = DataFrame(np.random.randn(5, 2))
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Empty df with columns append empty df
df1 = DataFrame(columns=["bar", "foo"])
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
# Non-Empty df with columns append empty df
df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"])
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
def test_append_dtypes(self):
# GH 5754
# row appends of different dtypes (so need to do by-item)
# can sometimes infer the correct type
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5))
df2 = DataFrame()
result = df1.append(df2)
expected = df1.copy()
assert_frame_equal(result, expected)
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
df2 = DataFrame({"bar": "foo"}, index=range(1, 2))
result = df1.append(df2)
expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]})
assert_frame_equal(result, expected)
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
df2 = DataFrame({"bar": np.nan}, index=range(1, 2))
result = df1.append(df2)
expected = DataFrame(
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
)
assert_frame_equal(result, expected)
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object)
result = df1.append(df2)
expected = DataFrame(
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
)
assert_frame_equal(result, expected)
df1 = DataFrame({"bar": np.nan}, index=range(1))
df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2))
result = df1.append(df2)
expected = DataFrame(
{"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
)
assert_frame_equal(result, expected)
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object)
result = df1.append(df2)
expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])})
assert_frame_equal(result, expected)
def test_update(self):
df = DataFrame(
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
)
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
df.update(other)
expected = DataFrame(
[[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
)
assert_frame_equal(df, expected)
def test_update_dtypes(self):
# gh 3016
df = DataFrame(
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
columns=["A", "B", "bool1", "bool2"],
)
other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
df.update(other)
expected = DataFrame(
[[45.0, 45.0, False, True], [4.0, 5.0, True, False]],
columns=["A", "B", "bool1", "bool2"],
)
assert_frame_equal(df, expected)
def test_update_nooverwrite(self):
df = DataFrame(
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
)
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
df.update(other, overwrite=False)
expected = DataFrame(
[[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]]
)
assert_frame_equal(df, expected)
def test_update_filtered(self):
df = DataFrame(
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
)
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
df.update(other, filter_func=lambda x: x > 2)
expected = DataFrame(
[[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
)
assert_frame_equal(df, expected)
@pytest.mark.parametrize(
"bad_kwarg, exception, msg",
[
# errors must be 'ignore' or 'raise'
({"errors": "something"}, ValueError, "The parameter errors must.*"),
({"join": "inner"}, NotImplementedError, "Only left join is supported"),
],
)
def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
df = DataFrame([[1.5, 1, 3.0]])
with pytest.raises(exception, match=msg):
df.update(df, **bad_kwarg)
def test_update_raise_on_overlap(self):
df = DataFrame(
[[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
)
other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2])
with pytest.raises(ValueError, match="Data overlaps"):
df.update(other, errors="raise")
@pytest.mark.parametrize("raise_conflict", [True, False])
def test_update_deprecation(self, raise_conflict):
df = DataFrame([[1.5, 1, 3.0]])
other = DataFrame()
with tm.assert_produces_warning(FutureWarning):
df.update(other, raise_conflict=raise_conflict)
def test_update_from_non_df(self):
d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])}
df = DataFrame(d)
d["a"] = Series([5, 6, 7, 8])
df.update(d)
expected = DataFrame(d)
assert_frame_equal(df, expected)
d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}
df = DataFrame(d)
d["a"] = [5, 6, 7, 8]
df.update(d)
expected = DataFrame(d)
assert_frame_equal(df, expected)
def test_update_datetime_tz(self):
# GH 25807
result = DataFrame([pd.Timestamp("2019", tz="UTC")])
result.update(result)
expected = DataFrame([pd.Timestamp("2019", tz="UTC")])
assert_frame_equal(result, expected)
def test_join_str_datetime(self):
str_dates = ["20120209", "20120222"]
dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
A = DataFrame(str_dates, index=range(2), columns=["aa"])
C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
tst = A.join(C, on="aa")
assert len(tst.columns) == 3
def test_join_multiindex_leftright(self):
# GH 10741
df1 = pd.DataFrame(
[
["a", "x", 0.471780],
["a", "y", 0.774908],
["a", "z", 0.563634],
["b", "x", -0.353756],
["b", "y", 0.368062],
["b", "z", -1.721840],
["c", "x", 1],
["c", "y", 2],
["c", "z", 3],
],
columns=["first", "second", "value1"],
).set_index(["first", "second"])
df2 = pd.DataFrame(
[["a", 10], ["b", 20]], columns=["first", "value2"]
).set_index(["first"])
exp = pd.DataFrame(
[
[0.471780, 10],
[0.774908, 10],
[0.563634, 10],
[-0.353756, 20],
[0.368062, 20],
[-1.721840, 20],
[1.000000, np.nan],
[2.000000, np.nan],
[3.000000, np.nan],
],
index=df1.index,
columns=["value1", "value2"],
)
# these must be the same results (but columns are flipped)
assert_frame_equal(df1.join(df2, how="left"), exp)
assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]])
exp_idx = pd.MultiIndex.from_product(
[["a", "b"], ["x", "y", "z"]], names=["first", "second"]
)
exp = pd.DataFrame(
[
[0.471780, 10],
[0.774908, 10],
[0.563634, 10],
[-0.353756, 20],
[0.368062, 20],
[-1.721840, 20],
],
index=exp_idx,
columns=["value1", "value2"],
)
assert_frame_equal(df1.join(df2, how="right"), exp)
assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]])
def test_concat_named_keys(self):
# GH 14252
df = pd.DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
index = Index(["a", "b"], name="baz")
concatted_named_from_keys = pd.concat([df, df], keys=index)
expected_named = pd.DataFrame(
{"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]),
)
assert_frame_equal(concatted_named_from_keys, expected_named)
index_no_name = Index(["a", "b"], name=None)
concatted_named_from_names = pd.concat(
[df, df], keys=index_no_name, names=["baz"]
)
assert_frame_equal(concatted_named_from_names, expected_named)
concatted_unnamed = pd.concat([df, df], keys=index_no_name)
expected_unnamed = pd.DataFrame(
{"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]),
)
assert_frame_equal(concatted_unnamed, expected_unnamed)
def test_concat_axis_parameter(self):
# GH 14369
df1 = pd.DataFrame({"A": [0.1, 0.2]}, index=range(2))
df2 = pd.DataFrame({"A": [0.3, 0.4]}, index=range(2))
# Index/row/0 DataFrame
expected_index = pd.DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
concatted_index = pd.concat([df1, df2], axis="index")
assert_frame_equal(concatted_index, expected_index)
concatted_row = pd.concat([df1, df2], axis="rows")
assert_frame_equal(concatted_row, expected_index)
concatted_0 = pd.concat([df1, df2], axis=0)
assert_frame_equal(concatted_0, expected_index)
# Columns/1 DataFrame
expected_columns = pd.DataFrame(
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
)
concatted_columns = pd.concat([df1, df2], axis="columns")
assert_frame_equal(concatted_columns, expected_columns)
concatted_1 = pd.concat([df1, df2], axis=1)
assert_frame_equal(concatted_1, expected_columns)
series1 = pd.Series([0.1, 0.2])
series2 = pd.Series([0.3, 0.4])
# Index/row/0 Series
expected_index_series = pd.Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
concatted_index_series = pd.concat([series1, series2], axis="index")
assert_series_equal(concatted_index_series, expected_index_series)
concatted_row_series = pd.concat([series1, series2], axis="rows")
assert_series_equal(concatted_row_series, expected_index_series)
concatted_0_series = pd.concat([series1, series2], axis=0)
assert_series_equal(concatted_0_series, expected_index_series)
# Columns/1 Series
expected_columns_series = pd.DataFrame(
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
)
concatted_columns_series = pd.concat([series1, series2], axis="columns")
assert_frame_equal(concatted_columns_series, expected_columns_series)
concatted_1_series = pd.concat([series1, series2], axis=1)
assert_frame_equal(concatted_1_series, expected_columns_series)
# Testing ValueError
with pytest.raises(ValueError, match="No axis named"):
pd.concat([series1, series2], axis="something")
def test_concat_numerical_names(self):
# #15262 # #12223
df = pd.DataFrame(
{"col": range(9)},
dtype="int32",
index=(
pd.MultiIndex.from_product(
[["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2]
)
),
)
result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
expected = pd.DataFrame(
{"col": [0, 1, 7, 8]},
dtype="int32",
index=pd.MultiIndex.from_tuples(
[("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2]
),
)
tm.assert_frame_equal(result, expected)
def test_concat_astype_dup_col(self):
# gh 23049
df = pd.DataFrame([{"a": "b"}])
df = pd.concat([df, df], axis=1)
result = df.astype("category")
expected = pd.DataFrame(
np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"]
).astype("category")
tm.assert_frame_equal(result, expected)
class TestDataFrameCombineFirst:
def test_combine_first_mixed(self):
a = Series(["a", "b"], index=range(2))
b = Series(range(2), index=range(2))
f = DataFrame({"A": a, "B": b})
a = Series(["a", "b"], index=range(5, 7))
b = Series(range(2), index=range(5, 7))
g = DataFrame({"A": a, "B": b})
exp = pd.DataFrame(
{"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6]
)
combined = f.combine_first(g)
tm.assert_frame_equal(combined, exp)
def test_combine_first(self, float_frame):
# disjoint
head, tail = float_frame[:5], float_frame[5:]
combined = head.combine_first(tail)
reordered_frame = float_frame.reindex(combined.index)
assert_frame_equal(combined, reordered_frame)
assert tm.equalContents(combined.columns, float_frame.columns)
assert_series_equal(combined["A"], reordered_frame["A"])
# same index
fcopy = float_frame.copy()
fcopy["A"] = 1
del fcopy["C"]
fcopy2 = float_frame.copy()
fcopy2["B"] = 0
del fcopy2["D"]
combined = fcopy.combine_first(fcopy2)
assert (combined["A"] == 1).all()
assert_series_equal(combined["B"], fcopy["B"])
assert_series_equal(combined["C"], fcopy2["C"])
assert_series_equal(combined["D"], fcopy["D"])
# overlap
head, tail = reordered_frame[:10].copy(), reordered_frame
head["A"] = 1
combined = head.combine_first(tail)
assert (combined["A"][:10] == 1).all()
# reverse overlap
tail["A"][:10] = 0
combined = tail.combine_first(head)
assert (combined["A"][:10] == 0).all()
# no overlap
f = float_frame[:10]
g = float_frame[10:]
combined = f.combine_first(g)
assert_series_equal(combined["A"].reindex(f.index), f["A"])
assert_series_equal(combined["A"].reindex(g.index), g["A"])
# corner cases
comb = float_frame.combine_first(DataFrame())
assert_frame_equal(comb, float_frame)
comb = DataFrame().combine_first(float_frame)
assert_frame_equal(comb, float_frame)
comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
assert "faz" in comb.index
# #2525
df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
df2 = DataFrame(columns=["b"])
result = df.combine_first(df2)
assert "b" in result
def test_combine_first_mixed_bug(self):
idx = Index(["a", "b", "c", "e"])
ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
ser2 = Series(["a", "b", "c", "e"], index=idx)
ser3 = Series([12, 4, 5, 97], index=idx)
frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3})
idx = Index(["a", "b", "c", "f"])
ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx)
ser2 = Series(["a", "b", "c", "f"], index=idx)
ser3 = Series([12, 4, 5, 97], index=idx)
frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3})
combined = frame1.combine_first(frame2)
assert len(combined.columns) == 5
# gh 3016 (same as in update)
df = DataFrame(
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
columns=["A", "B", "bool1", "bool2"],
)
other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
result = df.combine_first(other)
assert_frame_equal(result, df)
df.loc[0, "A"] = np.nan
result = df.combine_first(other)
df.loc[0, "A"] = 45
assert_frame_equal(result, df)
# doc example
df1 = DataFrame(
{"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
)
df2 = DataFrame(
{
"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
"B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
}
)
result = df1.combine_first(df2)
expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
assert_frame_equal(result, expected)
# GH3552, return object dtype with bools
df1 = DataFrame(
[[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
)
df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
result = df1.combine_first(df2)[2]
expected = Series([True, True, False], name=2)
assert_series_equal(result, expected)
# GH 3593, converting datetime64[ns] incorrectly
df0 = DataFrame(
{"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
)
df1 = DataFrame({"a": [None, None, None]})
df2 = df1.combine_first(df0)
assert_frame_equal(df2, df0)
df2 = df0.combine_first(df1)
assert_frame_equal(df2, df0)
df0 = DataFrame(
{"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
)
df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
df2 = df1.combine_first(df0)
result = df0.copy()
result.iloc[0, :] = df1.iloc[0, :]
assert_frame_equal(df2, result)
df2 = df0.combine_first(df1)
assert_frame_equal(df2, df0)
def test_combine_first_align_nan(self):
# GH 7509 (not fixed)
dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
dfb = pd.DataFrame([[4], [5]], columns=["b"])
assert dfa["a"].dtype == "datetime64[ns]"
assert dfa["b"].dtype == "int64"
res = dfa.combine_first(dfb)
exp = pd.DataFrame(
{"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]},
columns=["a", "b"],
)
tm.assert_frame_equal(res, exp)
assert res["a"].dtype == "datetime64[ns]"
# ToDo: this must be int64
assert res["b"].dtype == "float64"
res = dfa.iloc[:0].combine_first(dfb)
exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
tm.assert_frame_equal(res, exp)
# ToDo: this must be datetime64
assert res["a"].dtype == "float64"
# ToDo: this must be int64
assert res["b"].dtype == "int64"
def test_combine_first_timezone(self):
# see gh-7630
data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC")
df1 = pd.DataFrame(
columns=["UTCdatetime", "abc"],
data=data1,
index=pd.date_range("20140627", periods=1),
)
data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC")
df2 = pd.DataFrame(
columns=["UTCdatetime", "xyz"],
data=data2,
index=pd.date_range("20140628", periods=1),
)
res = df2[["UTCdatetime"]].combine_first(df1)
exp = pd.DataFrame(
{
"UTCdatetime": [
pd.Timestamp("2010-01-01 01:01", tz="UTC"),
pd.Timestamp("2012-12-12 12:12", tz="UTC"),
],
"abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
},
columns=["UTCdatetime", "abc"],
index=pd.date_range("20140627", periods=2, freq="D"),
)
tm.assert_frame_equal(res, exp)
assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]"
assert res["abc"].dtype == "datetime64[ns, UTC]"
# see gh-10567
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC")
df1 = pd.DataFrame({"DATE": dts1})
dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC")
df2 = pd.DataFrame({"DATE": dts2})
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res["DATE"].dtype == "datetime64[ns, UTC]"
dts1 = pd.DatetimeIndex(
["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
)
df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
dts2 = pd.DatetimeIndex(
["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
)
df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.DatetimeIndex(
[
"2011-01-01",
"2012-01-01",
"NaT",
"2012-01-02",
"2011-01-03",
"2011-01-04",
],
tz="US/Eastern",
)
exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
# different tz
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")
df1 = pd.DataFrame({"DATE": dts1})
dts2 = pd.date_range("2015-01-03", "2015-01-05")
df2 = pd.DataFrame({"DATE": dts2})
# if df1 doesn't have NaN, keep its dtype
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res["DATE"].dtype == "datetime64[ns, US/Eastern]"
dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern")
df1 = pd.DataFrame({"DATE": dts1})
dts2 = pd.date_range("2015-01-01", "2015-01-03")
df2 = pd.DataFrame({"DATE": dts2})
res = df1.combine_first(df2)
exp_dts = [
pd.Timestamp("2015-01-01", tz="US/Eastern"),
pd.Timestamp("2015-01-02", tz="US/Eastern"),
pd.Timestamp("2015-01-03"),
]
exp = pd.DataFrame({"DATE": exp_dts})
tm.assert_frame_equal(res, exp)
assert res["DATE"].dtype == "object"
def test_combine_first_timedelta(self):
data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7])
data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.TimedeltaIndex(
["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
)
exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res["TD"].dtype == "timedelta64[ns]"
def test_combine_first_period(self):
data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7])
data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = pd.PeriodIndex(
["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
)
exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res["P"].dtype == data1.dtype
# different freq
dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5])
res = df1.combine_first(df2)
exp_dts = [
pd.Period("2011-01", freq="M"),
pd.Period("2012-01-01", freq="D"),
pd.NaT,
pd.Period("2012-01-02", freq="D"),
pd.Period("2011-03", freq="M"),
pd.Period("2011-04", freq="M"),
]
exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
tm.assert_frame_equal(res, exp)
assert res["P"].dtype == "object"
def test_combine_first_int(self):
# GH14687 - integer series that do no align exactly
df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64")
res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res["a"].dtype == "int64"
@pytest.mark.parametrize("val", [1, 1.0])
def test_combine_first_with_asymmetric_other(self, val):
# see gh-20699
df1 = pd.DataFrame({"isNum": [val]})
df2 = pd.DataFrame({"isBool": [True]})
res = df1.combine_first(df2)
exp = pd.DataFrame({"isBool": [True], "isNum": [val]})
tm.assert_frame_equal(res, exp)
def test_concat_datetime_datetime64_frame(self):
# #2624
rows = []
rows.append([datetime(2010, 1, 1), 1])
rows.append([datetime(2010, 1, 2), "hi"])
df2_obj = DataFrame.from_records(rows, columns=["date", "test"])
ind = date_range(start="2000/1/1", freq="D", periods=10)
df1 = DataFrame({"date": ind, "test": range(10)})
# it works!
pd.concat([df1, df2_obj])
class TestDataFrameUpdate:
def test_update_nan(self):
# #15593 #15617
# test 1
df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
df2 = DataFrame({"A": [None, 2, 3]})
expected = df1.copy()
df1.update(df2, overwrite=False)
tm.assert_frame_equal(df1, expected)
# test 2
df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)})
df2 = DataFrame({"A": [None, 2, 3]})
expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
df1.update(df2, overwrite=False)
tm.assert_frame_equal(df1, expected)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,615 @@
from collections import OrderedDict, abc, defaultdict
from datetime import datetime
import numpy as np
import pytest
import pytz
from pandas import (
CategoricalDtype,
DataFrame,
MultiIndex,
Series,
Timestamp,
date_range,
)
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
class TestDataFrameConvertTo(TestData):
def test_to_dict_timestamp(self):
# GH11247
# split/records producing np.datetime64 rather than Timestamps
# on datetime64[ns] dtypes only
tsmp = Timestamp("20130101")
test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})
expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]
assert test_data.to_dict(orient="records") == expected_records
assert test_data_mixed.to_dict(orient="records") == expected_records_mixed
expected_series = {
"A": Series([tsmp, tsmp], name="A"),
"B": Series([tsmp, tsmp], name="B"),
}
expected_series_mixed = {
"A": Series([tsmp, tsmp], name="A"),
"B": Series([1, 2], name="B"),
}
tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
tm.assert_dict_equal(
test_data_mixed.to_dict(orient="series"), expected_series_mixed
)
expected_split = {
"index": [0, 1],
"data": [[tsmp, tsmp], [tsmp, tsmp]],
"columns": ["A", "B"],
}
expected_split_mixed = {
"index": [0, 1],
"data": [[tsmp, 1], [tsmp, 2]],
"columns": ["A", "B"],
}
tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
tm.assert_dict_equal(
test_data_mixed.to_dict(orient="split"), expected_split_mixed
)
def test_to_dict_index_not_unique_with_index_orient(self):
# GH22801
# Data loss when indexes are not unique. Raise ValueError.
df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
msg = "DataFrame index must be unique for orient='index'"
with pytest.raises(ValueError, match=msg):
df.to_dict(orient="index")
def test_to_dict_invalid_orient(self):
df = DataFrame({"A": [0, 1]})
msg = "orient 'xinvalid' not understood"
with pytest.raises(ValueError, match=msg):
df.to_dict(orient="xinvalid")
def test_to_records_dt64(self):
df = DataFrame(
[["one", "two", "three"], ["four", "five", "six"]],
index=date_range("2012-01-01", "2012-01-02"),
)
# convert_datetime64 defaults to None
expected = df.index.values[0]
result = df.to_records()["index"][0]
assert expected == result
# check for FutureWarning if convert_datetime64=False is passed
with tm.assert_produces_warning(FutureWarning):
expected = df.index.values[0]
result = df.to_records(convert_datetime64=False)["index"][0]
assert expected == result
# check for FutureWarning if convert_datetime64=True is passed
with tm.assert_produces_warning(FutureWarning):
expected = df.index[0]
result = df.to_records(convert_datetime64=True)["index"][0]
assert expected == result
def test_to_records_with_multindex(self):
# GH3189
index = [
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
["one", "two", "one", "two", "one", "two", "one", "two"],
]
data = np.zeros((8, 4))
df = DataFrame(data, index=index)
r = df.to_records(index=True)["level_0"]
assert "bar" in r
assert "one" not in r
def test_to_records_with_Mapping_type(self):
import email
from email.parser import Parser
abc.Mapping.register(email.message.Message)
headers = Parser().parsestr(
"From: <user@example.com>\n"
"To: <someone_else@example.com>\n"
"Subject: Test message\n"
"\n"
"Body would go here\n"
)
frame = DataFrame.from_records([headers])
all(x in frame for x in ["Type", "Subject", "From"])
def test_to_records_floats(self):
df = DataFrame(np.random.rand(10, 10))
df.to_records()
def test_to_records_index_name(self):
df = DataFrame(np.random.randn(3, 3))
df.index.name = "X"
rs = df.to_records()
assert "X" in rs.dtype.fields
df = DataFrame(np.random.randn(3, 3))
rs = df.to_records()
assert "index" in rs.dtype.fields
df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
df.index.names = ["A", None]
rs = df.to_records()
assert "level_0" in rs.dtype.fields
def test_to_records_with_unicode_index(self):
# GH13172
# unicode_literals conflict with to_records
result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
tm.assert_almost_equal(result, expected)
def test_to_records_with_unicode_column_names(self):
# xref issue: https://github.com/numpy/numpy/issues/2407
# Issue #11879. to_records used to raise an exception when used
# with column names containing non-ascii characters in Python 2
result = DataFrame(data={"accented_name_é": [1.0]}).to_records()
# Note that numpy allows for unicode field names but dtypes need
# to be specified using dictionary instead of list of tuples.
expected = np.rec.array(
[(0, 1.0)],
dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]},
)
tm.assert_almost_equal(result, expected)
def test_to_records_with_categorical(self):
# GH8626
# dict creation
df = DataFrame({"A": list("abc")}, dtype="category")
expected = Series(list("abc"), dtype="category", name="A")
tm.assert_series_equal(df["A"], expected)
# list-like creation
df = DataFrame(list("abc"), dtype="category")
expected = Series(list("abc"), dtype="category", name=0)
tm.assert_series_equal(df[0], expected)
# to record array
# this coerces
result = df.to_records()
expected = np.rec.array(
[(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")]
)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,expected",
[
# No dtypes --> default to array dtypes.
(
dict(),
np.rec.array(
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
),
),
# Should have no effect in this case.
(
dict(index=True),
np.rec.array(
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
),
),
# Column dtype applied across the board. Index unaffected.
(
dict(column_dtypes="<U4"),
np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"), ("C", "<U4")],
),
),
# Index dtype applied across the board. Columns unaffected.
(
dict(index_dtypes="<U1"),
np.rec.array(
[("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
),
),
# Pass in a type instance.
(
dict(column_dtypes=np.unicode),
np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"), ("C", "<U")],
),
),
# Pass in a dtype instance.
(
dict(column_dtypes=np.dtype("unicode")),
np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"), ("C", "<U")],
),
),
# Pass in a dictionary (name-only).
(
dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"), ("C", "<U2")],
),
),
# Pass in a dictionary (indices-only).
(
dict(index_dtypes={0: "int16"}),
np.rec.array(
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
),
),
# Ignore index mappings if index is not True.
(
dict(index=False, index_dtypes="<U2"),
np.rec.array(
[(1, 0.2, "a"), (2, 1.5, "bc")],
dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
),
),
# Non-existent names / indices in mapping should not error.
(
dict(index_dtypes={0: "int16", "not-there": "float32"}),
np.rec.array(
[(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
),
),
# Names / indices not in mapping default to array dtype.
(
dict(column_dtypes={"A": np.int8, "B": np.float32}),
np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
),
),
# Names / indices not in dtype mapping default to array dtype.
(
dict(column_dtypes={"A": np.dtype("int8"), "B": np.dtype("float32")}),
np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
),
),
# Mixture of everything.
(
dict(column_dtypes={"A": np.int8, "B": np.float32}, index_dtypes="<U2"),
np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
),
),
# Invalid dype values.
(
dict(index=False, column_dtypes=list()),
(ValueError, "Invalid dtype \\[\\] specified for column A"),
),
(
dict(index=False, column_dtypes={"A": "int32", "B": 5}),
(ValueError, "Invalid dtype 5 specified for column B"),
),
# Numpy can't handle EA types, so check error is raised
(
dict(
index=False,
column_dtypes={"A": "int32", "B": CategoricalDtype(["a", "b"])},
),
(ValueError, "Invalid dtype category specified for column B"),
),
# Check that bad types raise
(
dict(index=False, column_dtypes={"A": "int32", "B": "foo"}),
(TypeError, 'data type "foo" not understood'),
),
],
)
def test_to_records_dtype(self, kwargs, expected):
# see gh-18146
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
if not isinstance(expected, np.recarray):
with pytest.raises(expected[0], match=expected[1]):
df.to_records(**kwargs)
else:
result = df.to_records(**kwargs)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize(
"df,kwargs,expected",
[
# MultiIndex in the index.
(
DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("abc")
).set_index(["a", "b"]),
dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
np.rec.array(
[(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
),
),
# MultiIndex in the columns.
(
DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=MultiIndex.from_tuples(
[("a", "d"), ("b", "e"), ("c", "f")]
),
),
dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
np.rec.array(
[(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0), (2.0, "7", 8, 9.0)],
dtype=[
("index", "<f4"),
("('a', 'd')", "<U1"),
("('b', 'e')", "<i8"),
("('c', 'f')", "<f4"),
],
),
),
# MultiIndex in both the columns and index.
(
DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=MultiIndex.from_tuples(
[("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")
),
index=MultiIndex.from_tuples(
[("d", -4), ("d", -5), ("f", -6)], names=list("cd")
),
),
dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
np.rec.array(
[
("d", -4, 1.0, 2.0, 3.0),
("d", -5, 4.0, 5.0, 6.0),
("f", -6, 7, 8, 9.0),
],
dtype=[
("c", "<U2"),
("d", "i1"),
("('a', 'd')", "<f8"),
("('b', 'e')", "<f8"),
("('c', 'f')", "<f8"),
],
),
),
],
)
def test_to_records_dtype_mi(self, df, kwargs, expected):
# see gh-18146
result = df.to_records(**kwargs)
tm.assert_almost_equal(result, expected)
def test_to_records_dict_like(self):
# see gh-18146
class DictLike:
def __init__(self, **kwargs):
self.d = kwargs.copy()
def __getitem__(self, key):
return self.d.__getitem__(key)
def __contains__(self, key):
return key in self.d
def keys(self):
return self.d.keys()
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
dtype_mappings = dict(
column_dtypes=DictLike(**{"A": np.int8, "B": np.float32}),
index_dtypes="<U2",
)
result = df.to_records(**dtype_mappings)
expected = np.rec.array(
[("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
def test_to_dict(self, mapping):
test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
# GH16122
recons_data = DataFrame(test_data).to_dict(into=mapping)
for k, v in test_data.items():
for k2, v2 in v.items():
assert v2 == recons_data[k][k2]
recons_data = DataFrame(test_data).to_dict("l", mapping)
for k, v in test_data.items():
for k2, v2 in v.items():
assert v2 == recons_data[k][int(k2) - 1]
recons_data = DataFrame(test_data).to_dict("s", mapping)
for k, v in test_data.items():
for k2, v2 in v.items():
assert v2 == recons_data[k][k2]
recons_data = DataFrame(test_data).to_dict("sp", mapping)
expected_split = {
"columns": ["A", "B"],
"index": ["1", "2", "3"],
"data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
}
tm.assert_dict_equal(recons_data, expected_split)
recons_data = DataFrame(test_data).to_dict("r", mapping)
expected_records = [
{"A": 1.0, "B": "1"},
{"A": 2.0, "B": "2"},
{"A": np.nan, "B": "3"},
]
assert isinstance(recons_data, list)
assert len(recons_data) == 3
for l, r in zip(recons_data, expected_records):
tm.assert_dict_equal(l, r)
# GH10844
recons_data = DataFrame(test_data).to_dict("i")
for k, v in test_data.items():
for k2, v2 in v.items():
assert v2 == recons_data[k2][k]
df = DataFrame(test_data)
df["duped"] = df[df.columns[0]]
recons_data = df.to_dict("i")
comp_data = test_data.copy()
comp_data["duped"] = comp_data[df.columns[0]]
for k, v in comp_data.items():
for k2, v2 in v.items():
assert v2 == recons_data[k2][k]
@pytest.mark.parametrize("mapping", [list, defaultdict, []])
def test_to_dict_errors(self, mapping):
# GH16122
df = DataFrame(np.random.randn(3, 3))
with pytest.raises(TypeError):
df.to_dict(into=mapping)
def test_to_dict_not_unique_warning(self):
# GH16927: When converting to a dict, if a column has a non-unique name
# it will be dropped, throwing a warning.
df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
with tm.assert_produces_warning(UserWarning):
df.to_dict()
@pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
def test_to_records_datetimeindex_with_tz(self, tz):
# GH13937
dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)
df = DataFrame({"datetime": dr}, index=dr)
expected = df.to_records()
result = df.tz_convert("UTC").to_records()
# both converted to UTC, so they are equal
tm.assert_numpy_array_equal(result, expected)
# orient - orient argument to to_dict function
# item_getter - function for extracting value from
# the resulting dict using column name and index
@pytest.mark.parametrize(
"orient,item_getter",
[
("dict", lambda d, col, idx: d[col][idx]),
("records", lambda d, col, idx: d[idx][col]),
("list", lambda d, col, idx: d[col][idx]),
("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
("index", lambda d, col, idx: d[idx][col]),
],
)
def test_to_dict_box_scalars(self, orient, item_getter):
# 14216, 23753
# make sure that we are boxing properly
df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
result = df.to_dict(orient=orient)
assert isinstance(item_getter(result, "a", 0), int)
assert isinstance(item_getter(result, "b", 0), float)
def test_frame_to_dict_tz(self):
# GH18372 When converting to dict with orient='records' columns of
# datetime that are tz-aware were not converted to required arrays
data = [
(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
(datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
]
df = DataFrame(list(data), columns=["d"])
result = df.to_dict(orient="records")
expected = [
{"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
{"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
]
tm.assert_dict_equal(result[0], expected[0])
tm.assert_dict_equal(result[1], expected[1])
@pytest.mark.parametrize(
"into, expected",
[
(
dict,
{
0: {"int_col": 1, "float_col": 1.0},
1: {"int_col": 2, "float_col": 2.0},
2: {"int_col": 3, "float_col": 3.0},
},
),
(
OrderedDict,
OrderedDict(
[
(0, {"int_col": 1, "float_col": 1.0}),
(1, {"int_col": 2, "float_col": 2.0}),
(2, {"int_col": 3, "float_col": 3.0}),
]
),
),
(
defaultdict(list),
defaultdict(
list,
{
0: {"int_col": 1, "float_col": 1.0},
1: {"int_col": 2, "float_col": 2.0},
2: {"int_col": 3, "float_col": 3.0},
},
),
),
],
)
def test_to_dict_index_dtypes(self, into, expected):
# GH 18580
# When using to_dict(orient='index') on a dataframe with int
# and float columns only the int columns were cast to float
df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})
result = df.to_dict(orient="index", into=into)
cols = ["int_col", "float_col"]
result = DataFrame.from_dict(result, orient="index")[cols]
expected = DataFrame.from_dict(expected, orient="index")[cols]
tm.assert_frame_equal(result, expected)
def test_to_dict_numeric_names(self):
# https://github.com/pandas-dev/pandas/issues/24940
df = DataFrame({str(i): [i] for i in range(5)})
result = set(df.to_dict("records")[0].keys())
expected = set(df.columns)
assert result == expected
def test_to_dict_wide(self):
# https://github.com/pandas-dev/pandas/issues/24939
df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)})
result = df.to_dict("records")[0]
expected = {"A_{:d}".format(i): i for i in range(256)}
assert result == expected

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,479 @@
import re
import numpy as np
import pytest
from pandas import DataFrame, Series
import pandas.util.testing as tm
@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_duplicated_with_misspelled_column_name(subset):
# GH 19730
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
msg = re.escape("Index(['a'], dtype='object')")
with pytest.raises(KeyError, match=msg):
df.duplicated(subset)
with pytest.raises(KeyError, match=msg):
df.drop_duplicates(subset)
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
# gh-21524
# Given the wide dataframe with a lot of columns
# with different (important!) values
data = {
"col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100)
}
df = DataFrame(data).T
result = df.duplicated()
# Then duplicates produce the bool Series as a result and don't fail during
# calculation. Actual values doesn't matter here, though usually it's all
# False in this case
assert isinstance(result, Series)
assert result.dtype == np.bool
@pytest.mark.parametrize(
"keep, expected",
[
("first", Series([False, False, True, False, True])),
("last", Series([True, True, False, False, False])),
(False, Series([True, True, True, False, True])),
],
)
def test_duplicated_keep(keep, expected):
df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]})
result = df.duplicated(keep=keep)
tm.assert_series_equal(result, expected)
@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
@pytest.mark.parametrize(
"keep, expected",
[
("first", Series([False, False, True, False, True])),
("last", Series([True, True, False, False, False])),
(False, Series([True, True, True, False, True])),
],
)
def test_duplicated_nan_none(keep, expected):
df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object)
result = df.duplicated(keep=keep)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("keep", ["first", "last", False])
@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
def test_duplicated_subset(subset, keep):
df = DataFrame(
{
"A": [0, 1, 1, 2, 0],
"B": ["a", "b", "b", "c", "a"],
"C": [np.nan, 3, 3, None, np.nan],
}
)
if subset is None:
subset = list(df.columns)
elif isinstance(subset, str):
# need to have a DataFrame, not a Series
# -> select columns with singleton list, not string
subset = [subset]
expected = df[subset].duplicated(keep=keep)
result = df.duplicated(keep=keep, subset=subset)
tm.assert_series_equal(result, expected)
def test_drop_duplicates():
df = DataFrame(
{
"AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": [1, 1, 2, 2, 2, 2, 1, 2],
"D": range(8),
}
)
# single column
result = df.drop_duplicates("AAA")
expected = df[:2]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("AAA", keep="last")
expected = df.loc[[6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("AAA", keep=False)
expected = df.loc[[]]
tm.assert_frame_equal(result, expected)
assert len(result) == 0
# multi column
expected = df.loc[[0, 1, 2, 3]]
result = df.drop_duplicates(np.array(["AAA", "B"]))
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(["AAA", "B"])
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(("AAA", "B"), keep="last")
expected = df.loc[[0, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(("AAA", "B"), keep=False)
expected = df.loc[[0]]
tm.assert_frame_equal(result, expected)
# consider everything
df2 = df.loc[:, ["AAA", "B", "C"]]
result = df2.drop_duplicates()
# in this case only
expected = df2.drop_duplicates(["AAA", "B"])
tm.assert_frame_equal(result, expected)
result = df2.drop_duplicates(keep="last")
expected = df2.drop_duplicates(["AAA", "B"], keep="last")
tm.assert_frame_equal(result, expected)
result = df2.drop_duplicates(keep=False)
expected = df2.drop_duplicates(["AAA", "B"], keep=False)
tm.assert_frame_equal(result, expected)
# integers
result = df.drop_duplicates("C")
expected = df.iloc[[0, 2]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("C", keep="last")
expected = df.iloc[[-2, -1]]
tm.assert_frame_equal(result, expected)
df["E"] = df["C"].astype("int8")
result = df.drop_duplicates("E")
expected = df.iloc[[0, 2]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("E", keep="last")
expected = df.iloc[[-2, -1]]
tm.assert_frame_equal(result, expected)
# GH 11376
df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
expected = df.loc[df.index != 3]
tm.assert_frame_equal(df.drop_duplicates(), expected)
df = DataFrame([[1, 0], [0, 2]])
tm.assert_frame_equal(df.drop_duplicates(), df)
df = DataFrame([[-2, 0], [0, -4]])
tm.assert_frame_equal(df.drop_duplicates(), df)
x = np.iinfo(np.int64).max / 3 * 2
df = DataFrame([[-x, x], [0, x + 4]])
tm.assert_frame_equal(df.drop_duplicates(), df)
df = DataFrame([[-x, x], [x, x + 4]])
tm.assert_frame_equal(df.drop_duplicates(), df)
# GH 11864
df = DataFrame([i] * 9 for i in range(16))
df = df.append([[1] + [0] * 8], ignore_index=True)
for keep in ["first", "last", False]:
assert df.duplicated(keep=keep).sum() == 0
def test_duplicated_on_empty_frame():
# GH 25184
df = DataFrame(columns=["a", "b"])
dupes = df.duplicated("a")
result = df[dupes]
expected = df.copy()
tm.assert_frame_equal(result, expected)
def test_drop_duplicates_with_duplicate_column_names():
# GH17836
df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])
result0 = df.drop_duplicates()
tm.assert_frame_equal(result0, df)
result1 = df.drop_duplicates("a")
expected1 = df[:2]
tm.assert_frame_equal(result1, expected1)
def test_drop_duplicates_for_take_all():
df = DataFrame(
{
"AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": [1, 1, 2, 2, 2, 2, 1, 2],
"D": range(8),
}
)
# single column
result = df.drop_duplicates("AAA")
expected = df.iloc[[0, 1, 2, 6]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("AAA", keep="last")
expected = df.iloc[[2, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("AAA", keep=False)
expected = df.iloc[[2, 6]]
tm.assert_frame_equal(result, expected)
# multiple columns
result = df.drop_duplicates(["AAA", "B"])
expected = df.iloc[[0, 1, 2, 3, 4, 6]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(["AAA", "B"], keep="last")
expected = df.iloc[[0, 1, 2, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(["AAA", "B"], keep=False)
expected = df.iloc[[0, 1, 2, 6]]
tm.assert_frame_equal(result, expected)
def test_drop_duplicates_tuple():
df = DataFrame(
{
("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": [1, 1, 2, 2, 2, 2, 1, 2],
"D": range(8),
}
)
# single column
result = df.drop_duplicates(("AA", "AB"))
expected = df[:2]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(("AA", "AB"), keep="last")
expected = df.loc[[6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(("AA", "AB"), keep=False)
expected = df.loc[[]] # empty df
assert len(result) == 0
tm.assert_frame_equal(result, expected)
# multi column
expected = df.loc[[0, 1, 2, 3]]
result = df.drop_duplicates((("AA", "AB"), "B"))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"df",
[
DataFrame(),
DataFrame(columns=[]),
DataFrame(columns=["A", "B", "C"]),
DataFrame(index=[]),
DataFrame(index=["A", "B", "C"]),
],
)
def test_drop_duplicates_empty(df):
# GH 20516
result = df.drop_duplicates()
tm.assert_frame_equal(result, df)
result = df.copy()
result.drop_duplicates(inplace=True)
tm.assert_frame_equal(result, df)
def test_drop_duplicates_NA():
# none
df = DataFrame(
{
"A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
"D": range(8),
}
)
# single column
result = df.drop_duplicates("A")
expected = df.loc[[0, 2, 3]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("A", keep="last")
expected = df.loc[[1, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("A", keep=False)
expected = df.loc[[]] # empty df
tm.assert_frame_equal(result, expected)
assert len(result) == 0
# multi column
result = df.drop_duplicates(["A", "B"])
expected = df.loc[[0, 2, 3, 6]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(["A", "B"], keep="last")
expected = df.loc[[1, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(["A", "B"], keep=False)
expected = df.loc[[6]]
tm.assert_frame_equal(result, expected)
# nan
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
"D": range(8),
}
)
# single column
result = df.drop_duplicates("C")
expected = df[:2]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("C", keep="last")
expected = df.loc[[3, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("C", keep=False)
expected = df.loc[[]] # empty df
tm.assert_frame_equal(result, expected)
assert len(result) == 0
# multi column
result = df.drop_duplicates(["C", "B"])
expected = df.loc[[0, 1, 2, 4]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(["C", "B"], keep="last")
expected = df.loc[[1, 3, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates(["C", "B"], keep=False)
expected = df.loc[[1]]
tm.assert_frame_equal(result, expected)
def test_drop_duplicates_NA_for_take_all():
# none
df = DataFrame(
{
"A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
"C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
}
)
# single column
result = df.drop_duplicates("A")
expected = df.iloc[[0, 2, 3, 5, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("A", keep="last")
expected = df.iloc[[1, 4, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("A", keep=False)
expected = df.iloc[[5, 7]]
tm.assert_frame_equal(result, expected)
# nan
# single column
result = df.drop_duplicates("C")
expected = df.iloc[[0, 1, 5, 6]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("C", keep="last")
expected = df.iloc[[3, 5, 6, 7]]
tm.assert_frame_equal(result, expected)
result = df.drop_duplicates("C", keep=False)
expected = df.iloc[[5, 6]]
tm.assert_frame_equal(result, expected)
def test_drop_duplicates_inplace():
orig = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": [1, 1, 2, 2, 2, 2, 1, 2],
"D": range(8),
}
)
# single column
df = orig.copy()
df.drop_duplicates("A", inplace=True)
expected = orig[:2]
result = df
tm.assert_frame_equal(result, expected)
df = orig.copy()
df.drop_duplicates("A", keep="last", inplace=True)
expected = orig.loc[[6, 7]]
result = df
tm.assert_frame_equal(result, expected)
df = orig.copy()
df.drop_duplicates("A", keep=False, inplace=True)
expected = orig.loc[[]]
result = df
tm.assert_frame_equal(result, expected)
assert len(df) == 0
# multi column
df = orig.copy()
df.drop_duplicates(["A", "B"], inplace=True)
expected = orig.loc[[0, 1, 2, 3]]
result = df
tm.assert_frame_equal(result, expected)
df = orig.copy()
df.drop_duplicates(["A", "B"], keep="last", inplace=True)
expected = orig.loc[[0, 5, 6, 7]]
result = df
tm.assert_frame_equal(result, expected)
df = orig.copy()
df.drop_duplicates(["A", "B"], keep=False, inplace=True)
expected = orig.loc[[0]]
result = df
tm.assert_frame_equal(result, expected)
# consider everything
orig2 = orig.loc[:, ["A", "B", "C"]].copy()
df2 = orig2.copy()
df2.drop_duplicates(inplace=True)
# in this case only
expected = orig2.drop_duplicates(["A", "B"])
result = df2
tm.assert_frame_equal(result, expected)
df2 = orig2.copy()
df2.drop_duplicates(keep="last", inplace=True)
expected = orig2.drop_duplicates(["A", "B"], keep="last")
result = df2
tm.assert_frame_equal(result, expected)
df2 = orig2.copy()
df2.drop_duplicates(keep=False, inplace=True)
expected = orig2.drop_duplicates(["A", "B"], keep=False)
result = df2
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,120 @@
import numpy as np
import pytest
import pandas as pd
from pandas.util import testing as tm
def test_error():
df = pd.DataFrame(
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
)
with pytest.raises(ValueError):
df.explode(list("AA"))
df.columns = list("AA")
with pytest.raises(ValueError):
df.explode("A")
def test_basic():
df = pd.DataFrame(
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
)
result = df.explode("A")
expected = pd.DataFrame(
{
"A": pd.Series(
[0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
),
"B": 1,
}
)
tm.assert_frame_equal(result, expected)
def test_multi_index_rows():
df = pd.DataFrame(
{"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1},
index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]),
)
result = df.explode("A")
expected = pd.DataFrame(
{
"A": pd.Series(
[0, 1, 2, np.nan, np.nan, 3, 4],
index=pd.MultiIndex.from_tuples(
[
("a", 1),
("a", 1),
("a", 1),
("a", 2),
("b", 1),
("b", 2),
("b", 2),
]
),
dtype=object,
),
"B": 1,
}
)
tm.assert_frame_equal(result, expected)
def test_multi_index_columns():
df = pd.DataFrame(
{("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1}
)
result = df.explode(("A", 1))
expected = pd.DataFrame(
{
("A", 1): pd.Series(
[0, 1, 2, np.nan, np.nan, 3, 4],
index=pd.Index([0, 0, 0, 1, 2, 3, 3]),
dtype=object,
),
("A", 2): 1,
}
)
tm.assert_frame_equal(result, expected)
def test_usecase():
# explode a single column
# gh-10511
df = pd.DataFrame(
[[11, range(5), 10], [22, range(3), 20]], columns=list("ABC")
).set_index("C")
result = df.explode("B")
expected = pd.DataFrame(
{
"A": [11, 11, 11, 11, 11, 22, 22, 22],
"B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
"C": [10, 10, 10, 10, 10, 20, 20, 20],
},
columns=list("ABC"),
).set_index("C")
tm.assert_frame_equal(result, expected)
# gh-8517
df = pd.DataFrame(
[["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
columns=["dt", "name", "text"],
)
result = df.assign(text=df.text.str.split(" ")).explode("text")
expected = pd.DataFrame(
[
["2014-01-01", "Alice", "A"],
["2014-01-01", "Alice", "B"],
["2014-01-02", "Bob", "C"],
["2014-01-02", "Bob", "D"],
],
columns=["dt", "name", "text"],
index=[0, 0, 1, 1],
)
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,195 @@
import numpy as np
import pytest
from pandas import DataFrame, Index, period_range
import pandas.util.testing as tm
@pytest.fixture
def frame_with_period_index():
return DataFrame(
data=np.arange(20).reshape(4, 5),
columns=list("abcde"),
index=period_range(start="2000", freq="A", periods=4),
)
@pytest.fixture
def left():
return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0])
@pytest.fixture
def right():
return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2])
@pytest.mark.parametrize(
"how, sort, expected",
[
("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
(
"left",
False,
DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
),
(
"left",
True,
DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
),
(
"right",
False,
DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]),
),
(
"right",
True,
DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]),
),
(
"outer",
False,
DataFrame(
{"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
index=[0, 1, 2, 3],
),
),
(
"outer",
True,
DataFrame(
{"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
index=[0, 1, 2, 3],
),
),
],
)
def test_join(left, right, how, sort, expected):
result = left.join(right, how=how, sort=sort)
tm.assert_frame_equal(result, expected)
def test_join_index(float_frame):
# left / right
f = float_frame.loc[float_frame.index[:10], ["A", "B"]]
f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1]
joined = f.join(f2)
tm.assert_index_equal(f.index, joined.index)
expected_columns = Index(["A", "B", "C", "D"])
tm.assert_index_equal(joined.columns, expected_columns)
joined = f.join(f2, how="left")
tm.assert_index_equal(joined.index, f.index)
tm.assert_index_equal(joined.columns, expected_columns)
joined = f.join(f2, how="right")
tm.assert_index_equal(joined.index, f2.index)
tm.assert_index_equal(joined.columns, expected_columns)
# inner
joined = f.join(f2, how="inner")
tm.assert_index_equal(joined.index, f.index[5:10])
tm.assert_index_equal(joined.columns, expected_columns)
# outer
joined = f.join(f2, how="outer")
tm.assert_index_equal(joined.index, float_frame.index.sort_values())
tm.assert_index_equal(joined.columns, expected_columns)
with pytest.raises(ValueError, match="join method"):
f.join(f2, how="foo")
# corner case - overlapping columns
msg = "columns overlap but no suffix"
for how in ("outer", "left", "inner"):
with pytest.raises(ValueError, match=msg):
float_frame.join(float_frame, how=how)
def test_join_index_more(float_frame):
af = float_frame.loc[:, ["A", "B"]]
bf = float_frame.loc[::2, ["C", "D"]]
expected = af.copy()
expected["C"] = float_frame["C"][::2]
expected["D"] = float_frame["D"][::2]
result = af.join(bf)
tm.assert_frame_equal(result, expected)
result = af.join(bf, how="right")
tm.assert_frame_equal(result, expected[::2])
result = bf.join(af, how="right")
tm.assert_frame_equal(result, expected.loc[:, result.columns])
def test_join_index_series(float_frame):
df = float_frame.copy()
s = df.pop(float_frame.columns[-1])
joined = df.join(s)
# TODO should this check_names ?
tm.assert_frame_equal(joined, float_frame, check_names=False)
s.name = None
with pytest.raises(ValueError, match="must have a name"):
df.join(s)
def test_join_overlap(float_frame):
df1 = float_frame.loc[:, ["A", "B", "C"]]
df2 = float_frame.loc[:, ["B", "C", "D"]]
joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2")
df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1")
df2_suf = df2.loc[:, ["B", "C"]].add_suffix("_df2")
no_overlap = float_frame.loc[:, ["A", "D"]]
expected = df1_suf.join(df2_suf).join(no_overlap)
# column order not necessarily sorted
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
def test_join_period_index(frame_with_period_index):
other = frame_with_period_index.rename(columns=lambda x: "{key}{key}".format(key=x))
joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1)
joined_cols = frame_with_period_index.columns.append(other.columns)
joined = frame_with_period_index.join(other)
expected = DataFrame(
data=joined_values, columns=joined_cols, index=frame_with_period_index.index
)
tm.assert_frame_equal(joined, expected)
def test_join_left_sequence_non_unique_index():
# https://github.com/pandas-dev/pandas/issues/19607
df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3])
df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2])
df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4])
joined = df1.join([df2, df3], how="left")
expected = DataFrame(
{
"a": [0, 10, 10, 20],
"b": [np.nan, 300, 300, 200],
"c": [np.nan, 400, 500, np.nan],
},
index=[1, 2, 2, 3],
)
tm.assert_frame_equal(joined, expected)

View File

@@ -0,0 +1,972 @@
import datetime
import dateutil
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import Categorical, DataFrame, Series, Timestamp, date_range
from pandas.tests.frame.common import _check_mixed_float
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
def _skip_if_no_pchip():
try:
from scipy.interpolate import pchip_interpolate # noqa
except ImportError:
import pytest
pytest.skip("scipy.interpolate.pchip missing")
class TestDataFrameMissingData:
def test_dropEmptyRows(self, float_frame):
N = len(float_frame.index)
mat = np.random.randn(N)
mat[:5] = np.nan
frame = DataFrame({"foo": mat}, index=float_frame.index)
original = Series(mat, index=float_frame.index, name="foo")
expected = original.dropna()
inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
smaller_frame = frame.dropna(how="all")
# check that original was preserved
assert_series_equal(frame["foo"], original)
inplace_frame1.dropna(how="all", inplace=True)
assert_series_equal(smaller_frame["foo"], expected)
assert_series_equal(inplace_frame1["foo"], expected)
smaller_frame = frame.dropna(how="all", subset=["foo"])
inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
assert_series_equal(smaller_frame["foo"], expected)
assert_series_equal(inplace_frame2["foo"], expected)
def test_dropIncompleteRows(self, float_frame):
N = len(float_frame.index)
mat = np.random.randn(N)
mat[:5] = np.nan
frame = DataFrame({"foo": mat}, index=float_frame.index)
frame["bar"] = 5
original = Series(mat, index=float_frame.index, name="foo")
inp_frame1, inp_frame2 = frame.copy(), frame.copy()
smaller_frame = frame.dropna()
assert_series_equal(frame["foo"], original)
inp_frame1.dropna(inplace=True)
exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
tm.assert_series_equal(smaller_frame["foo"], exp)
tm.assert_series_equal(inp_frame1["foo"], exp)
samesize_frame = frame.dropna(subset=["bar"])
assert_series_equal(frame["foo"], original)
assert (frame["bar"] == 5).all()
inp_frame2.dropna(subset=["bar"], inplace=True)
tm.assert_index_equal(samesize_frame.index, float_frame.index)
tm.assert_index_equal(inp_frame2.index, float_frame.index)
def test_dropna(self):
df = DataFrame(np.random.randn(6, 4))
df[2][:2] = np.nan
dropped = df.dropna(axis=1)
expected = df.loc[:, [0, 1, 3]]
inp = df.copy()
inp.dropna(axis=1, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=0)
expected = df.loc[list(range(2, 6))]
inp = df.copy()
inp.dropna(axis=0, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
# threshold
dropped = df.dropna(axis=1, thresh=5)
expected = df.loc[:, [0, 1, 3]]
inp = df.copy()
inp.dropna(axis=1, thresh=5, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=0, thresh=4)
expected = df.loc[range(2, 6)]
inp = df.copy()
inp.dropna(axis=0, thresh=4, inplace=True)
assert_frame_equal(dropped, expected)
assert_frame_equal(inp, expected)
dropped = df.dropna(axis=1, thresh=4)
assert_frame_equal(dropped, df)
dropped = df.dropna(axis=1, thresh=3)
assert_frame_equal(dropped, df)
# subset
dropped = df.dropna(axis=0, subset=[0, 1, 3])
inp = df.copy()
inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
assert_frame_equal(dropped, df)
assert_frame_equal(inp, df)
# all
dropped = df.dropna(axis=1, how="all")
assert_frame_equal(dropped, df)
df[2] = np.nan
dropped = df.dropna(axis=1, how="all")
expected = df.loc[:, [0, 1, 3]]
assert_frame_equal(dropped, expected)
# bad input
msg = "No axis named 3 for object type <class 'pandas.core.frame.DataFrame'>"
with pytest.raises(ValueError, match=msg):
df.dropna(axis=3)
def test_drop_and_dropna_caching(self):
# tst that cacher updates
original = Series([1, 2, np.nan], name="A")
expected = Series([1, 2], dtype=original.dtype, name="A")
df = pd.DataFrame({"A": original.values.copy()})
df2 = df.copy()
df["A"].dropna()
assert_series_equal(df["A"], original)
df["A"].dropna(inplace=True)
assert_series_equal(df["A"], expected)
df2["A"].drop([1])
assert_series_equal(df2["A"], original)
df2["A"].drop([1], inplace=True)
assert_series_equal(df2["A"], original.drop([1]))
def test_dropna_corner(self, float_frame):
# bad input
msg = "invalid how option: foo"
with pytest.raises(ValueError, match=msg):
float_frame.dropna(how="foo")
msg = "must specify how or thresh"
with pytest.raises(TypeError, match=msg):
float_frame.dropna(how=None)
# non-existent column - 8303
with pytest.raises(KeyError, match=r"^\['X'\]$"):
float_frame.dropna(subset=["A", "X"])
def test_dropna_multiple_axes(self):
df = DataFrame(
[
[1, np.nan, 2, 3],
[4, np.nan, 5, 6],
[np.nan, np.nan, np.nan, np.nan],
[7, np.nan, 8, 9],
]
)
cp = df.copy()
# GH20987
with tm.assert_produces_warning(FutureWarning):
result = df.dropna(how="all", axis=[0, 1])
with tm.assert_produces_warning(FutureWarning):
result2 = df.dropna(how="all", axis=(0, 1))
expected = df.dropna(how="all").dropna(how="all", axis=1)
assert_frame_equal(result, expected)
assert_frame_equal(result2, expected)
assert_frame_equal(df, cp)
inp = df.copy()
with tm.assert_produces_warning(FutureWarning):
inp.dropna(how="all", axis=(0, 1), inplace=True)
assert_frame_equal(inp, expected)
def test_dropna_tz_aware_datetime(self):
# GH13407
df = DataFrame()
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
df["Time"] = [dt1]
result = df.dropna(axis=0)
expected = DataFrame({"Time": [dt1]})
assert_frame_equal(result, expected)
# Ex2
df = DataFrame({"Time": [dt1, None, np.nan, dt2]})
result = df.dropna(axis=0)
expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3])
assert_frame_equal(result, expected)
def test_dropna_categorical_interval_index(self):
# GH 25087
ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28])
ci = pd.CategoricalIndex(ii)
df = pd.DataFrame({"A": list("abc")}, index=ci)
expected = df
result = df.dropna()
tm.assert_frame_equal(result, expected)
def test_fillna_datetime(self, datetime_frame):
tf = datetime_frame
tf.loc[tf.index[:5], "A"] = np.nan
tf.loc[tf.index[-5:], "A"] = np.nan
zero_filled = datetime_frame.fillna(0)
assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all()
padded = datetime_frame.fillna(method="pad")
assert np.isnan(padded.loc[padded.index[:5], "A"]).all()
assert (
padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"]
).all()
msg = "Must specify a fill 'value' or 'method'"
with pytest.raises(ValueError, match=msg):
datetime_frame.fillna()
msg = "Cannot specify both 'value' and 'method'"
with pytest.raises(ValueError, match=msg):
datetime_frame.fillna(5, method="ffill")
def test_fillna_mixed_type(self, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
mf.loc[mf.index[-10:], "A"] = np.nan
# TODO: make stronger assertion here, GH 25640
mf.fillna(value=0)
mf.fillna(method="pad")
def test_fillna_mixed_float(self, mixed_float_frame):
# mixed numeric (but no float16)
mf = mixed_float_frame.reindex(columns=["A", "B", "D"])
mf.loc[mf.index[-10:], "A"] = np.nan
result = mf.fillna(value=0)
_check_mixed_float(result, dtype=dict(C=None))
result = mf.fillna(method="pad")
_check_mixed_float(result, dtype=dict(C=None))
def test_fillna_empty(self):
# empty frame (GH #2778)
df = DataFrame(columns=["x"])
for m in ["pad", "backfill"]:
df.x.fillna(method=m, inplace=True)
df.x.fillna(method=m)
def test_fillna_different_dtype(self):
# with different dtype (GH#3386)
df = DataFrame(
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
)
result = df.fillna({2: "foo"})
expected = DataFrame(
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
)
assert_frame_equal(result, expected)
df.fillna({2: "foo"}, inplace=True)
assert_frame_equal(df, expected)
def test_fillna_limit_and_value(self):
# limit and value
df = DataFrame(np.random.randn(10, 3))
df.iloc[2:7, 0] = np.nan
df.iloc[3:5, 2] = np.nan
expected = df.copy()
expected.iloc[2, 0] = 999
expected.iloc[3, 2] = 999
result = df.fillna(999, limit=1)
assert_frame_equal(result, expected)
def test_fillna_datelike(self):
# with datelike
# GH#6344
df = DataFrame(
{
"Date": [pd.NaT, Timestamp("2014-1-1")],
"Date2": [Timestamp("2013-1-1"), pd.NaT],
}
)
expected = df.copy()
expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"])
result = df.fillna(value={"Date": df["Date2"]})
assert_frame_equal(result, expected)
def test_fillna_tzaware(self):
# with timezone
# GH#15855
df = pd.DataFrame({"A": [pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]})
exp = pd.DataFrame(
{
"A": [
pd.Timestamp("2012-11-11 00:00:00+01:00"),
pd.Timestamp("2012-11-11 00:00:00+01:00"),
]
}
)
assert_frame_equal(df.fillna(method="pad"), exp)
df = pd.DataFrame({"A": [pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]})
exp = pd.DataFrame(
{
"A": [
pd.Timestamp("2012-11-11 00:00:00+01:00"),
pd.Timestamp("2012-11-11 00:00:00+01:00"),
]
}
)
assert_frame_equal(df.fillna(method="bfill"), exp)
def test_fillna_tzaware_different_column(self):
# with timezone in another column
# GH#15522
df = pd.DataFrame(
{
"A": pd.date_range("20130101", periods=4, tz="US/Eastern"),
"B": [1, 2, np.nan, np.nan],
}
)
result = df.fillna(method="pad")
expected = pd.DataFrame(
{
"A": pd.date_range("20130101", periods=4, tz="US/Eastern"),
"B": [1.0, 2.0, 2.0, 2.0],
}
)
assert_frame_equal(result, expected)
def test_na_actions_categorical(self):
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
vals = ["a", "b", np.nan, "d"]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
vals2 = ["a", "b", "b", "d"]
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
vals3 = ["a", "b", np.nan]
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
cat4 = Categorical([1, 2], categories=[1, 2, 3])
vals4 = ["a", "b"]
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
# fillna
res = df.fillna(value={"cats": 3, "vals": "b"})
tm.assert_frame_equal(res, df_exp_fill)
with pytest.raises(ValueError, match=("fill value must be in categories")):
df.fillna(value={"cats": 4, "vals": "c"})
res = df.fillna(method="pad")
tm.assert_frame_equal(res, df_exp_fill)
# dropna
res = df.dropna(subset=["cats"])
tm.assert_frame_equal(res, df_exp_drop_cats)
res = df.dropna()
tm.assert_frame_equal(res, df_exp_drop_all)
# make sure that fillna takes missing values into account
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
res = df.fillna("a")
tm.assert_frame_equal(res, df_exp)
def test_fillna_categorical_nan(self):
# GH 14021
# np.nan should always be a valid filler
cat = Categorical([np.nan, 2, np.nan])
val = Categorical([np.nan, np.nan, np.nan])
df = DataFrame({"cats": cat, "vals": val})
res = df.fillna(df.median())
v_exp = [np.nan, np.nan, np.nan]
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category")
tm.assert_frame_equal(res, df_exp)
result = df.cats.fillna(np.nan)
tm.assert_series_equal(result, df.cats)
result = df.vals.fillna(np.nan)
tm.assert_series_equal(result, df.vals)
idx = pd.DatetimeIndex(
["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT]
)
df = DataFrame({"a": Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
idx = pd.PeriodIndex(
["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M"
)
df = DataFrame({"a": Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT])
df = DataFrame({"a": Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
def test_fillna_downcast(self):
# GH 15277
# infer int64 from float64
df = pd.DataFrame({"a": [1.0, np.nan]})
result = df.fillna(0, downcast="infer")
expected = pd.DataFrame({"a": [1, 0]})
assert_frame_equal(result, expected)
# infer int64 from float64 when fillna value is a dict
df = pd.DataFrame({"a": [1.0, np.nan]})
result = df.fillna({"a": 0}, downcast="infer")
expected = pd.DataFrame({"a": [1, 0]})
assert_frame_equal(result, expected)
def test_fillna_dtype_conversion(self):
# make sure that fillna on an empty frame works
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
result = df.dtypes
expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5])
assert_series_equal(result, expected)
result = df.fillna(1)
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
assert_frame_equal(result, expected)
# empty block
df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
result = df.fillna("nan")
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
assert_frame_equal(result, expected)
# equiv of replace
df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0]))
for v in ["", 1, np.nan, 1.0]:
expected = df.replace(np.nan, v)
result = df.fillna(v)
assert_frame_equal(result, expected)
def test_fillna_datetime_columns(self):
# GH 7095
df = pd.DataFrame(
{
"A": [-1, -2, np.nan],
"B": date_range("20130101", periods=3),
"C": ["foo", "bar", None],
"D": ["foo2", "bar2", None],
},
index=date_range("20130110", periods=3),
)
result = df.fillna("?")
expected = pd.DataFrame(
{
"A": [-1, -2, "?"],
"B": date_range("20130101", periods=3),
"C": ["foo", "bar", "?"],
"D": ["foo2", "bar2", "?"],
},
index=date_range("20130110", periods=3),
)
tm.assert_frame_equal(result, expected)
df = pd.DataFrame(
{
"A": [-1, -2, np.nan],
"B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), pd.NaT],
"C": ["foo", "bar", None],
"D": ["foo2", "bar2", None],
},
index=date_range("20130110", periods=3),
)
result = df.fillna("?")
expected = pd.DataFrame(
{
"A": [-1, -2, "?"],
"B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), "?"],
"C": ["foo", "bar", "?"],
"D": ["foo2", "bar2", "?"],
},
index=pd.date_range("20130110", periods=3),
)
tm.assert_frame_equal(result, expected)
def test_ffill(self, datetime_frame):
datetime_frame["A"][:5] = np.nan
datetime_frame["A"][-5:] = np.nan
assert_frame_equal(
datetime_frame.ffill(), datetime_frame.fillna(method="ffill")
)
def test_bfill(self, datetime_frame):
datetime_frame["A"][:5] = np.nan
datetime_frame["A"][-5:] = np.nan
assert_frame_equal(
datetime_frame.bfill(), datetime_frame.fillna(method="bfill")
)
def test_frame_pad_backfill_limit(self):
index = np.arange(10)
df = DataFrame(np.random.randn(10, 4), index=index)
result = df[:2].reindex(index, method="pad", limit=5)
expected = df[:2].reindex(index).fillna(method="pad")
expected.values[-3:] = np.nan
tm.assert_frame_equal(result, expected)
result = df[-2:].reindex(index, method="backfill", limit=5)
expected = df[-2:].reindex(index).fillna(method="backfill")
expected.values[:3] = np.nan
tm.assert_frame_equal(result, expected)
def test_frame_fillna_limit(self):
index = np.arange(10)
df = DataFrame(np.random.randn(10, 4), index=index)
result = df[:2].reindex(index)
result = result.fillna(method="pad", limit=5)
expected = df[:2].reindex(index).fillna(method="pad")
expected.values[-3:] = np.nan
tm.assert_frame_equal(result, expected)
result = df[-2:].reindex(index)
result = result.fillna(method="backfill", limit=5)
expected = df[-2:].reindex(index).fillna(method="backfill")
expected.values[:3] = np.nan
tm.assert_frame_equal(result, expected)
def test_fillna_skip_certain_blocks(self):
# don't try to fill boolean, int blocks
df = DataFrame(np.random.randn(10, 4).astype(int))
# it works!
df.fillna(np.nan)
@pytest.mark.parametrize("type", [int, float])
def test_fillna_positive_limit(self, type):
df = DataFrame(np.random.randn(10, 4)).astype(type)
msg = "Limit must be greater than 0"
with pytest.raises(ValueError, match=msg):
df.fillna(0, limit=-5)
@pytest.mark.parametrize("type", [int, float])
def test_fillna_integer_limit(self, type):
df = DataFrame(np.random.randn(10, 4)).astype(type)
msg = "Limit must be an integer"
with pytest.raises(ValueError, match=msg):
df.fillna(0, limit=0.5)
def test_fillna_inplace(self):
df = DataFrame(np.random.randn(10, 4))
df[1][:4] = np.nan
df[3][-4:] = np.nan
expected = df.fillna(value=0)
assert expected is not df
df.fillna(value=0, inplace=True)
tm.assert_frame_equal(df, expected)
expected = df.fillna(value={0: 0}, inplace=True)
assert expected is None
df[1][:4] = np.nan
df[3][-4:] = np.nan
expected = df.fillna(method="ffill")
assert expected is not df
df.fillna(method="ffill", inplace=True)
tm.assert_frame_equal(df, expected)
def test_fillna_dict_series(self):
df = DataFrame(
{
"a": [np.nan, 1, 2, np.nan, np.nan],
"b": [1, 2, 3, np.nan, np.nan],
"c": [np.nan, 1, 2, 3, 4],
}
)
result = df.fillna({"a": 0, "b": 5})
expected = df.copy()
expected["a"] = expected["a"].fillna(0)
expected["b"] = expected["b"].fillna(5)
assert_frame_equal(result, expected)
# it works
result = df.fillna({"a": 0, "b": 5, "d": 7})
# Series treated same as dict
result = df.fillna(df.max())
expected = df.fillna(df.max().to_dict())
assert_frame_equal(result, expected)
# disable this for now
with pytest.raises(NotImplementedError, match="column by column"):
df.fillna(df.max(1), axis=1)
def test_fillna_dataframe(self):
# GH 8377
df = DataFrame(
{
"a": [np.nan, 1, 2, np.nan, np.nan],
"b": [1, 2, 3, np.nan, np.nan],
"c": [np.nan, 1, 2, 3, 4],
},
index=list("VWXYZ"),
)
# df2 may have different index and columns
df2 = DataFrame(
{
"a": [np.nan, 10, 20, 30, 40],
"b": [50, 60, 70, 80, 90],
"foo": ["bar"] * 5,
},
index=list("VWXuZ"),
)
result = df.fillna(df2)
# only those columns and indices which are shared get filled
expected = DataFrame(
{
"a": [np.nan, 1, 2, np.nan, 40],
"b": [1, 2, 3, np.nan, 90],
"c": [np.nan, 1, 2, 3, 4],
},
index=list("VWXYZ"),
)
assert_frame_equal(result, expected)
def test_fillna_columns(self):
df = DataFrame(np.random.randn(10, 10))
df.values[:, ::2] = np.nan
result = df.fillna(method="ffill", axis=1)
expected = df.T.fillna(method="pad").T
assert_frame_equal(result, expected)
df.insert(6, "foo", 5)
result = df.fillna(method="ffill", axis=1)
expected = df.astype(float).fillna(method="ffill", axis=1)
assert_frame_equal(result, expected)
def test_fillna_invalid_method(self, float_frame):
with pytest.raises(ValueError, match="ffil"):
float_frame.fillna(method="ffil")
def test_fillna_invalid_value(self, float_frame):
# list
msg = '"value" parameter must be a scalar or dict, but you passed' ' a "{}"'
with pytest.raises(TypeError, match=msg.format("list")):
float_frame.fillna([1, 2])
# tuple
with pytest.raises(TypeError, match=msg.format("tuple")):
float_frame.fillna((1, 2))
# frame with series
msg = (
'"value" parameter must be a scalar, dict or Series, but you'
' passed a "DataFrame"'
)
with pytest.raises(TypeError, match=msg):
float_frame.iloc[:, 0].fillna(float_frame)
def test_fillna_col_reordering(self):
cols = ["COL." + str(i) for i in range(5, 0, -1)]
data = np.random.rand(20, 5)
df = DataFrame(index=range(20), columns=cols, data=data)
filled = df.fillna(method="ffill")
assert df.columns.tolist() == filled.columns.tolist()
def test_fill_corner(self, float_frame, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
mf.loc[mf.index[-10:], "A"] = np.nan
filled = float_string_frame.fillna(value=0)
assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
del float_string_frame["foo"]
empty_float = float_frame.reindex(columns=[])
# TODO(wesm): unused?
result = empty_float.fillna(value=0) # noqa
def test_fill_value_when_combine_const(self):
# GH12723
dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float")
df = DataFrame({"foo": dat}, index=range(6))
exp = df.fillna(0).add(2)
res = df.add(2, fill_value=0)
assert_frame_equal(res, exp)
class TestDataFrameInterpolate:
def test_interp_basic(self):
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": [1, 4, 9, np.nan],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
expected = DataFrame(
{
"A": [1.0, 2.0, 3.0, 4.0],
"B": [1.0, 4.0, 9.0, 9.0],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
result = df.interpolate()
assert_frame_equal(result, expected)
result = df.set_index("C").interpolate()
expected = df.set_index("C")
expected.loc[3, "A"] = 3
expected.loc[5, "B"] = 9
assert_frame_equal(result, expected)
def test_interp_bad_method(self):
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": [1, 4, 9, np.nan],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
with pytest.raises(ValueError):
df.interpolate(method="not_a_method")
def test_interp_combo(self):
df = DataFrame(
{
"A": [1.0, 2.0, np.nan, 4.0],
"B": [1, 4, 9, np.nan],
"C": [1, 2, 3, 5],
"D": list("abcd"),
}
)
result = df["A"].interpolate()
expected = Series([1.0, 2.0, 3.0, 4.0], name="A")
assert_series_equal(result, expected)
result = df["A"].interpolate(downcast="infer")
expected = Series([1, 2, 3, 4], name="A")
assert_series_equal(result, expected)
def test_interp_nan_idx(self):
df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]})
df = df.set_index("A")
with pytest.raises(NotImplementedError):
df.interpolate(method="values")
@td.skip_if_no_scipy
def test_interp_various(self):
df = DataFrame(
{"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
)
df = df.set_index("C")
expected = df.copy()
result = df.interpolate(method="polynomial", order=1)
expected.A.loc[3] = 2.66666667
expected.A.loc[13] = 5.76923076
assert_frame_equal(result, expected)
result = df.interpolate(method="cubic")
# GH #15662.
expected.A.loc[3] = 2.81547781
expected.A.loc[13] = 5.52964175
assert_frame_equal(result, expected)
result = df.interpolate(method="nearest")
expected.A.loc[3] = 2
expected.A.loc[13] = 5
assert_frame_equal(result, expected, check_dtype=False)
result = df.interpolate(method="quadratic")
expected.A.loc[3] = 2.82150771
expected.A.loc[13] = 6.12648668
assert_frame_equal(result, expected)
result = df.interpolate(method="slinear")
expected.A.loc[3] = 2.66666667
expected.A.loc[13] = 5.76923077
assert_frame_equal(result, expected)
result = df.interpolate(method="zero")
expected.A.loc[3] = 2.0
expected.A.loc[13] = 5
assert_frame_equal(result, expected, check_dtype=False)
@td.skip_if_no_scipy
def test_interp_alt_scipy(self):
df = DataFrame(
{"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
)
result = df.interpolate(method="barycentric")
expected = df.copy()
expected.loc[2, "A"] = 3
expected.loc[5, "A"] = 6
assert_frame_equal(result, expected)
result = df.interpolate(method="barycentric", downcast="infer")
assert_frame_equal(result, expected.astype(np.int64))
result = df.interpolate(method="krogh")
expectedk = df.copy()
expectedk["A"] = expected["A"]
assert_frame_equal(result, expectedk)
_skip_if_no_pchip()
result = df.interpolate(method="pchip")
expected.loc[2, "A"] = 3
expected.loc[5, "A"] = 6.0
assert_frame_equal(result, expected)
def test_interp_rowwise(self):
df = DataFrame(
{
0: [1, 2, np.nan, 4],
1: [2, 3, 4, np.nan],
2: [np.nan, 4, 5, 6],
3: [4, np.nan, 6, 7],
4: [1, 2, 3, 4],
}
)
result = df.interpolate(axis=1)
expected = df.copy()
expected.loc[3, 1] = 5
expected.loc[0, 2] = 3
expected.loc[1, 3] = 3
expected[4] = expected[4].astype(np.float64)
assert_frame_equal(result, expected)
result = df.interpolate(axis=1, method="values")
assert_frame_equal(result, expected)
result = df.interpolate(axis=0)
expected = df.interpolate()
assert_frame_equal(result, expected)
def test_rowwise_alt(self):
df = DataFrame(
{
0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64],
1: [1, 2, 3, 4, 3, 2, 1, 0, -1],
}
)
df.interpolate(axis=0)
@pytest.mark.parametrize(
"check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)]
)
def test_interp_leading_nans(self, check_scipy):
df = DataFrame(
{"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]}
)
result = df.interpolate()
expected = df.copy()
expected["B"].loc[3] = -3.75
assert_frame_equal(result, expected)
if check_scipy:
result = df.interpolate(method="polynomial", order=1)
assert_frame_equal(result, expected)
def test_interp_raise_on_only_mixed(self):
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": ["a", "b", "c", "d"],
"C": [np.nan, 2, 5, 7],
"D": [np.nan, np.nan, 9, 9],
"E": [1, 2, 3, 4],
}
)
with pytest.raises(TypeError):
df.interpolate(axis=1)
def test_interp_raise_on_all_object_dtype(self):
# GH 22985
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object")
msg = (
"Cannot interpolate with all object-dtype columns "
"in the DataFrame. Try setting at least one "
"column to a numeric dtype."
)
with pytest.raises(TypeError, match=msg):
df.interpolate()
def test_interp_inplace(self):
df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]})
expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
result = df.copy()
result["a"].interpolate(inplace=True)
assert_frame_equal(result, expected)
result = df.copy()
result["a"].interpolate(inplace=True, downcast="infer")
assert_frame_equal(result, expected.astype("int64"))
def test_interp_inplace_row(self):
# GH 10395
result = DataFrame(
{"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]}
)
expected = result.interpolate(method="linear", axis=1, inplace=False)
result.interpolate(method="linear", axis=1, inplace=True)
assert_frame_equal(result, expected)
def test_interp_ignore_all_good(self):
# GH
df = DataFrame(
{
"A": [1, 2, np.nan, 4],
"B": [1, 2, 3, 4],
"C": [1.0, 2.0, np.nan, 4.0],
"D": [1.0, 2.0, 3.0, 4.0],
}
)
expected = DataFrame(
{
"A": np.array([1, 2, 3, 4], dtype="float64"),
"B": np.array([1, 2, 3, 4], dtype="int64"),
"C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"),
"D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"),
}
)
result = df.interpolate(downcast=None)
assert_frame_equal(result, expected)
# all good
result = df[["B", "D"]].interpolate(downcast=None)
assert_frame_equal(result, df[["B", "D"]])

View File

@@ -0,0 +1,287 @@
import re
import numpy as np
import pytest
from pandas.compat import PY36
from pandas import DataFrame, Index, MultiIndex, Series
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal
# Column add, remove, delete.
class TestDataFrameMutateColumns:
def test_assign(self):
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
original = df.copy()
result = df.assign(C=df.B / df.A)
expected = df.copy()
expected["C"] = [4, 2.5, 2]
assert_frame_equal(result, expected)
# lambda syntax
result = df.assign(C=lambda x: x.B / x.A)
assert_frame_equal(result, expected)
# original is unmodified
assert_frame_equal(df, original)
# Non-Series array-like
result = df.assign(C=[4, 2.5, 2])
assert_frame_equal(result, expected)
# original is unmodified
assert_frame_equal(df, original)
result = df.assign(B=df.B / df.A)
expected = expected.drop("B", axis=1).rename(columns={"C": "B"})
assert_frame_equal(result, expected)
# overwrite
result = df.assign(A=df.A + df.B)
expected = df.copy()
expected["A"] = [5, 7, 9]
assert_frame_equal(result, expected)
# lambda
result = df.assign(A=lambda x: x.A + x.B)
assert_frame_equal(result, expected)
def test_assign_multiple(self):
df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"])
result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
expected = DataFrame(
[[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE")
)
assert_frame_equal(result, expected)
def test_assign_order(self):
# GH 9818
df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
result = df.assign(D=df.A + df.B, C=df.A - df.B)
if PY36:
expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC"))
else:
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
assert_frame_equal(result, expected)
result = df.assign(C=df.A - df.B, D=df.A + df.B)
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
assert_frame_equal(result, expected)
def test_assign_bad(self):
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
# non-keyword argument
with pytest.raises(TypeError):
df.assign(lambda x: x.A)
with pytest.raises(AttributeError):
df.assign(C=df.A, D=df.A + df.C)
@pytest.mark.skipif(
PY36,
reason="""Issue #14207: valid for python
3.6 and above""",
)
def test_assign_dependent_old_python(self):
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
# Key C does not exist at definition time of df
with pytest.raises(KeyError, match="^'C'$"):
df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
with pytest.raises(KeyError, match="^'C'$"):
df.assign(C=df.A, D=lambda x: x["A"] + x["C"])
@pytest.mark.skipif(
not PY36,
reason="""Issue #14207: not valid for
python 3.5 and below""",
)
def test_assign_dependent(self):
df = DataFrame({"A": [1, 2], "B": [3, 4]})
result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"])
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
assert_frame_equal(result, expected)
result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
assert_frame_equal(result, expected)
def test_insert_error_msmgs(self):
# GH 7432
df = DataFrame(
{"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": ["d", "e", "f"]}
).set_index("foo")
s = DataFrame(
{"foo": ["a", "b", "c", "a"], "fiz": ["g", "h", "i", "j"]}
).set_index("foo")
msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
df["newcol"] = s
# GH 4107, more descriptive error message
df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"])
msg = "incompatible index of inserted column with frame index"
with pytest.raises(TypeError, match=msg):
df["gr"] = df.groupby(["b", "c"]).count()
def test_insert_benchmark(self):
# from the vb_suite/frame_methods/frame_insert_columns
N = 10
K = 5
df = DataFrame(index=range(N))
new_col = np.random.randn(N)
for i in range(K):
df[i] = new_col
expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N))
assert_frame_equal(df, expected)
def test_insert(self):
df = DataFrame(
np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
)
df.insert(0, "foo", df["a"])
tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"]))
tm.assert_series_equal(df["a"], df["foo"], check_names=False)
df.insert(2, "bar", df["c"])
tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"]))
tm.assert_almost_equal(df["c"], df["bar"], check_names=False)
# diff dtype
# new item
df["x"] = df["a"].astype("float32")
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 5 + [np.dtype("float32")],
index=["foo", "c", "bar", "b", "a", "x"],
)
tm.assert_series_equal(result, expected)
# replacing current (in different block)
df["a"] = df["a"].astype("float32")
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 4 + [np.dtype("float32")] * 2,
index=["foo", "c", "bar", "b", "a", "x"],
)
tm.assert_series_equal(result, expected)
df["y"] = df["a"].astype("int32")
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")],
index=["foo", "c", "bar", "b", "a", "x", "y"],
)
tm.assert_series_equal(result, expected)
with pytest.raises(ValueError, match="already exists"):
df.insert(1, "a", df["b"])
msg = "cannot insert c, already exists"
with pytest.raises(ValueError, match=msg):
df.insert(1, "c", df["b"])
df.columns.name = "some_name"
# preserve columns name field
df.insert(0, "baz", df["c"])
assert df.columns.name == "some_name"
# GH 13522
df = DataFrame(index=["A", "B", "C"])
df["X"] = df.index
df["X"] = ["x", "y", "z"]
exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"])
assert_frame_equal(df, exp)
def test_delitem(self, float_frame):
del float_frame["A"]
assert "A" not in float_frame
def test_delitem_multiindex(self):
midx = MultiIndex.from_product([["A", "B"], [1, 2]])
df = DataFrame(np.random.randn(4, 4), columns=midx)
assert len(df.columns) == 4
assert ("A",) in df.columns
assert "A" in df.columns
result = df["A"]
assert isinstance(result, DataFrame)
del df["A"]
assert len(df.columns) == 2
# A still in the levels, BUT get a KeyError if trying
# to delete
assert ("A",) not in df.columns
with pytest.raises(KeyError, match=re.escape("('A',)")):
del df[("A",)]
# behavior of dropped/deleted MultiIndex levels changed from
# GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
# levels which are dropped/deleted
assert "A" not in df.columns
with pytest.raises(KeyError, match=re.escape("('A',)")):
del df["A"]
def test_pop(self, float_frame):
float_frame.columns.name = "baz"
float_frame.pop("A")
assert "A" not in float_frame
float_frame["foo"] = "bar"
float_frame.pop("foo")
assert "foo" not in float_frame
assert float_frame.columns.name == "baz"
# gh-10912: inplace ops cause caching issue
a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"])
b = a.pop("B")
b += 1
# original frame
expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"])
tm.assert_frame_equal(a, expected)
# result
expected = Series([2, 5], index=["X", "Y"], name="B") + 1
tm.assert_series_equal(b, expected)
def test_pop_non_unique_cols(self):
df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
df.columns = ["a", "b", "a"]
res = df.pop("a")
assert type(res) == DataFrame
assert len(res) == 2
assert len(df.columns) == 1
assert "b" in df.columns
assert "a" not in df.columns
assert len(df.index) == 2
def test_insert_column_bug_4032(self):
# GH4032, inserting a column and renaming causing errors
df = DataFrame({"b": [1.1, 2.2]})
df = df.rename(columns={})
df.insert(0, "a", [1, 2])
result = df.rename(columns={})
str(result)
expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"])
assert_frame_equal(result, expected)
df.insert(0, "c", [1.3, 2.3])
result = df.rename(columns={})
str(result)
expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"])
assert_frame_equal(result, expected)

View File

@@ -0,0 +1,528 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, MultiIndex, Series, date_range
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameNonuniqueIndexes(TestData):
def test_column_dups_operations(self):
def check(result, expected=None):
if expected is not None:
assert_frame_equal(result, expected)
result.dtypes
str(result)
# assignment
# GH 3687
arr = np.random.randn(3, 2)
idx = list(range(2))
df = DataFrame(arr, columns=["A", "A"])
df.columns = idx
expected = DataFrame(arr, columns=idx)
check(df, expected)
idx = date_range("20130101", periods=4, freq="Q-NOV")
df = DataFrame(
[[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
)
df.columns = idx
expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
check(df, expected)
# insert
df = DataFrame(
[[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
columns=["foo", "bar", "foo", "hello"],
)
df["string"] = "bah"
expected = DataFrame(
[[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
columns=["foo", "bar", "foo", "hello", "string"],
)
check(df, expected)
with pytest.raises(ValueError, match="Length of value"):
df.insert(0, "AnotherColumn", range(len(df.index) - 1))
# insert same dtype
df["foo2"] = 3
expected = DataFrame(
[[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
columns=["foo", "bar", "foo", "hello", "string", "foo2"],
)
check(df, expected)
# set (non-dup)
df["foo2"] = 4
expected = DataFrame(
[[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
columns=["foo", "bar", "foo", "hello", "string", "foo2"],
)
check(df, expected)
df["foo2"] = 3
# delete (non dup)
del df["bar"]
expected = DataFrame(
[[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
columns=["foo", "foo", "hello", "string", "foo2"],
)
check(df, expected)
# try to delete again (its not consolidated)
del df["hello"]
expected = DataFrame(
[[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
columns=["foo", "foo", "string", "foo2"],
)
check(df, expected)
# consolidate
df = df._consolidate()
expected = DataFrame(
[[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
columns=["foo", "foo", "string", "foo2"],
)
check(df, expected)
# insert
df.insert(2, "new_col", 5.0)
expected = DataFrame(
[[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
columns=["foo", "foo", "new_col", "string", "foo2"],
)
check(df, expected)
# insert a dup
with pytest.raises(ValueError, match="cannot insert"):
df.insert(2, "new_col", 4.0)
df.insert(2, "new_col", 4.0, allow_duplicates=True)
expected = DataFrame(
[
[1, 1, 4.0, 5.0, "bah", 3],
[1, 2, 4.0, 5.0, "bah", 3],
[2, 3, 4.0, 5.0, "bah", 3],
],
columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
)
check(df, expected)
# delete (dup)
del df["foo"]
expected = DataFrame(
[[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
columns=["new_col", "new_col", "string", "foo2"],
)
assert_frame_equal(df, expected)
# dup across dtypes
df = DataFrame(
[[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
columns=["foo", "bar", "foo", "hello"],
)
check(df)
df["foo2"] = 7.0
expected = DataFrame(
[[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
columns=["foo", "bar", "foo", "hello", "foo2"],
)
check(df, expected)
result = df["foo"]
expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
check(result, expected)
# multiple replacements
df["foo"] = "string"
expected = DataFrame(
[
["string", 1, "string", 5, 7.0],
["string", 1, "string", 5, 7.0],
["string", 1, "string", 5, 7.0],
],
columns=["foo", "bar", "foo", "hello", "foo2"],
)
check(df, expected)
del df["foo"]
expected = DataFrame(
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
)
check(df, expected)
# values
df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
result = df.values
expected = np.array([[1, 2.5], [3, 4.5]])
assert (result == expected).all().all()
# rename, GH 4403
df4 = DataFrame(
{"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
index=MultiIndex.from_tuples(
[(600809, 20130331)], names=["STK_ID", "RPT_Date"]
),
)
df5 = DataFrame(
{
"RPT_Date": [20120930, 20121231, 20130331],
"STK_ID": [600809] * 3,
"STK_Name": ["饡驦", "饡驦", "饡驦"],
"TClose": [38.05, 41.66, 30.01],
},
index=MultiIndex.from_tuples(
[(600809, 20120930), (600809, 20121231), (600809, 20130331)],
names=["STK_ID", "RPT_Date"],
),
)
k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
str(result)
result.dtypes
expected = DataFrame(
[[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
columns=[
"RT",
"TClose",
"TExg",
"RPT_Date",
"STK_ID",
"STK_Name",
"QT_Close",
],
).set_index(["STK_ID", "RPT_Date"], drop=False)
assert_frame_equal(result, expected)
# reindex is invalid!
df = DataFrame(
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
)
msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
df.reindex(columns=["bar"])
with pytest.raises(ValueError, match=msg):
df.reindex(columns=["bar", "foo"])
# drop
df = DataFrame(
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
)
result = df.drop(["a"], axis=1)
expected = DataFrame([[1], [1], [1]], columns=["bar"])
check(result, expected)
result = df.drop("a", axis=1)
check(result, expected)
# describe
df = DataFrame(
[[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=["bar", "a", "a"],
dtype="float64",
)
result = df.describe()
s = df.iloc[:, 0].describe()
expected = pd.concat([s, s, s], keys=df.columns, axis=1)
check(result, expected)
# check column dups with index equal and not equal to df's index
df = DataFrame(
np.random.randn(5, 3),
index=["a", "b", "c", "d", "e"],
columns=["A", "B", "A"],
)
for index in [df.index, pd.Index(list("edcba"))]:
this_df = df.copy()
expected_ser = pd.Series(index.values, index=this_df.index)
expected_df = DataFrame(
{"A": expected_ser, "B": this_df["B"], "A": expected_ser},
columns=["A", "B", "A"],
)
this_df["A"] = index
check(this_df, expected_df)
# operations
for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
expected = getattr(df, op)(df)
expected.columns = ["A", "A"]
df.columns = ["A", "A"]
result = getattr(df, op)(df)
check(result, expected)
# multiple assignments that change dtypes
# the location indexer is a slice
# GH 6120
df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
expected = DataFrame(1.0, index=range(5), columns=["that", "that"])
df["that"] = 1.0
check(df, expected)
df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
expected = DataFrame(1, index=range(5), columns=["that", "that"])
df["that"] = 1
check(df, expected)
def test_column_dups2(self):
# drop buggy GH 6240
df = DataFrame(
{
"A": np.random.randn(5),
"B": np.random.randn(5),
"C": np.random.randn(5),
"D": ["a", "b", "c", "d", "e"],
}
)
expected = df.take([0, 1, 1], axis=1)
df2 = df.take([2, 0, 1, 2, 1], axis=1)
result = df2.drop("C", axis=1)
assert_frame_equal(result, expected)
# dropna
df = DataFrame(
{
"A": np.random.randn(5),
"B": np.random.randn(5),
"C": np.random.randn(5),
"D": ["a", "b", "c", "d", "e"],
}
)
df.iloc[2, [0, 1, 2]] = np.nan
df.iloc[0, 0] = np.nan
df.iloc[1, 1] = np.nan
df.iloc[:, 3] = np.nan
expected = df.dropna(subset=["A", "B", "C"], how="all")
expected.columns = ["A", "A", "B", "C"]
df.columns = ["A", "A", "B", "C"]
result = df.dropna(subset=["A", "C"], how="all")
assert_frame_equal(result, expected)
def test_column_dups_indexing(self):
def check(result, expected=None):
if expected is not None:
assert_frame_equal(result, expected)
result.dtypes
str(result)
# boolean indexing
# GH 4879
dups = ["A", "A", "C", "D"]
df = DataFrame(
np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
)
expected = df[df.C > 6]
expected.columns = dups
df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
result = df[df.C > 6]
check(result, expected)
# where
df = DataFrame(
np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
)
expected = df[df > 6]
expected.columns = dups
df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
result = df[df > 6]
check(result, expected)
# boolean with the duplicate raises
df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
msg = "cannot reindex from a duplicate axis"
with pytest.raises(ValueError, match=msg):
df[df.A > 6]
# dup aligning operations should work
# GH 5185
df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
result = df1.sub(df2)
assert_frame_equal(result, expected)
# equality
df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])
# not-comparing like-labelled
msg = "Can only compare identically-labeled DataFrame objects"
with pytest.raises(ValueError, match=msg):
df1 == df2
df1r = df1.reindex_like(df2)
result = df1r == df2
expected = DataFrame(
[[False, True], [True, False], [False, False], [True, False]],
columns=["A", "A"],
)
assert_frame_equal(result, expected)
# mixed column selection
# GH 5639
dfbool = DataFrame(
{
"one": Series([True, True, False], index=["a", "b", "c"]),
"two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
"three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
}
)
expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
result = dfbool[["one", "three", "one"]]
check(result, expected)
# multi-axis dups
# GH 6121
df = DataFrame(
np.arange(25.0).reshape(5, 5),
index=["a", "b", "c", "d", "e"],
columns=["A", "B", "C", "D", "E"],
)
z = df[["A", "C", "A"]].copy()
expected = z.loc[["a", "c", "a"]]
df = DataFrame(
np.arange(25.0).reshape(5, 5),
index=["a", "b", "c", "d", "e"],
columns=["A", "B", "C", "D", "E"],
)
z = df[["A", "C", "A"]]
result = z.loc[["a", "c", "a"]]
check(result, expected)
def test_column_dups_indexing2(self):
# GH 8363
# datetime ops with a non-unique index
df = DataFrame(
{"A": np.arange(5, dtype="int64"), "B": np.arange(1, 6, dtype="int64")},
index=[2, 2, 3, 3, 4],
)
result = df.B - df.A
expected = Series(1, index=[2, 2, 3, 3, 4])
assert_series_equal(result, expected)
df = DataFrame(
{
"A": date_range("20130101", periods=5),
"B": date_range("20130101 09:00:00", periods=5),
},
index=[2, 2, 3, 3, 4],
)
result = df.B - df.A
expected = Series(pd.Timedelta("9 hours"), index=[2, 2, 3, 3, 4])
assert_series_equal(result, expected)
def test_columns_with_dups(self):
# GH 3468 related
# basic
df = DataFrame([[1, 2]], columns=["a", "a"])
df.columns = ["a", "a.1"]
str(df)
expected = DataFrame([[1, 2]], columns=["a", "a.1"])
assert_frame_equal(df, expected)
df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"])
df.columns = ["b", "a", "a.1"]
str(df)
expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"])
assert_frame_equal(df, expected)
# with a dup index
df = DataFrame([[1, 2]], columns=["a", "a"])
df.columns = ["b", "b"]
str(df)
expected = DataFrame([[1, 2]], columns=["b", "b"])
assert_frame_equal(df, expected)
# multi-dtype
df = DataFrame(
[[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
columns=["a", "a", "b", "b", "d", "c", "c"],
)
df.columns = list("ABCDEFG")
str(df)
expected = DataFrame(
[[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG")
)
assert_frame_equal(df, expected)
df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"])
df.columns = ["a", "a.1", "a.2", "a.3"]
str(df)
expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"])
assert_frame_equal(df, expected)
# dups across blocks
df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
df_dt = DataFrame(
pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns
)
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
assert len(df._data._blknos) == len(df.columns)
assert len(df._data._blklocs) == len(df.columns)
# testing iloc
for i in range(len(df.columns)):
df.iloc[:, i]
# dup columns across dtype GH 2079/2194
vals = [[1, -1, 2.0], [2, -2, 3.0]]
rs = DataFrame(vals, columns=["A", "A", "B"])
xp = DataFrame(vals)
xp.columns = ["A", "A", "B"]
assert_frame_equal(rs, xp)
def test_values_duplicates(self):
df = DataFrame(
[[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"]
)
result = df.values
expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
tm.assert_numpy_array_equal(result, expected)
def test_set_value_by_index(self):
# See gh-12344
df = DataFrame(np.arange(9).reshape(3, 3).T)
df.columns = list("AAA")
expected = df.iloc[:, 2]
df.iloc[:, 0] = 3
assert_series_equal(df.iloc[:, 2], expected)
df = DataFrame(np.arange(9).reshape(3, 3).T)
df.columns = [2, float(2), str(2)]
expected = df.iloc[:, 1]
df.iloc[:, 0] = 3
assert_series_equal(df.iloc[:, 1], expected)
def test_insert_with_columns_dups(self):
# GH 14291
df = pd.DataFrame()
df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True)
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
exp = pd.DataFrame(
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
)
assert_frame_equal(df, exp)

View File

@@ -0,0 +1,884 @@
from decimal import Decimal
import operator
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, MultiIndex, Series
import pandas.core.common as com
from pandas.tests.frame.common import _check_mixed_float
import pandas.util.testing as tm
from pandas.util.testing import (
assert_frame_equal,
assert_numpy_array_equal,
assert_series_equal,
)
class TestDataFrameUnaryOperators:
# __pos__, __neg__, __inv__
@pytest.mark.parametrize(
"df,expected",
[
(pd.DataFrame({"a": [-1, 1]}), pd.DataFrame({"a": [1, -1]})),
(pd.DataFrame({"a": [False, True]}), pd.DataFrame({"a": [True, False]})),
(
pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}),
pd.DataFrame({"a": pd.Series(pd.to_timedelta([1, -1]))}),
),
],
)
def test_neg_numeric(self, df, expected):
assert_frame_equal(-df, expected)
assert_series_equal(-df["a"], expected["a"])
@pytest.mark.parametrize(
"df, expected",
[
(np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
([Decimal("1.0"), Decimal("2.0")], [Decimal("-1.0"), Decimal("-2.0")]),
],
)
def test_neg_object(self, df, expected):
# GH#21380
df = pd.DataFrame({"a": df})
expected = pd.DataFrame({"a": expected})
assert_frame_equal(-df, expected)
assert_series_equal(-df["a"], expected["a"])
@pytest.mark.parametrize(
"df",
[
pd.DataFrame({"a": ["a", "b"]}),
pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}),
],
)
def test_neg_raises(self, df):
with pytest.raises(TypeError):
(-df)
with pytest.raises(TypeError):
(-df["a"])
def test_invert(self, float_frame):
df = float_frame
assert_frame_equal(-(df < 0), ~(df < 0))
@pytest.mark.parametrize(
"df",
[
pd.DataFrame({"a": [-1, 1]}),
pd.DataFrame({"a": [False, True]}),
pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}),
],
)
def test_pos_numeric(self, df):
# GH#16073
assert_frame_equal(+df, df)
assert_series_equal(+df["a"], df["a"])
@pytest.mark.parametrize(
"df",
[
# numpy changing behavior in the future
pytest.param(
pd.DataFrame({"a": ["a", "b"]}),
marks=[pytest.mark.filterwarnings("ignore")],
),
pd.DataFrame({"a": np.array([-1, 2], dtype=object)}),
pd.DataFrame({"a": [Decimal("-1.0"), Decimal("2.0")]}),
],
)
def test_pos_object(self, df):
# GH#21380
assert_frame_equal(+df, df)
assert_series_equal(+df["a"], df["a"])
@pytest.mark.parametrize(
"df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})]
)
def test_pos_raises(self, df):
with pytest.raises(TypeError):
(+df)
with pytest.raises(TypeError):
(+df["a"])
class TestDataFrameLogicalOperators:
# &, |, ^
def test_logical_ops_empty_frame(self):
# GH#5808
# empty frames, non-mixed dtype
df = DataFrame(index=[1])
result = df & df
assert_frame_equal(result, df)
result = df | df
assert_frame_equal(result, df)
df2 = DataFrame(index=[1, 2])
result = df & df2
assert_frame_equal(result, df2)
dfa = DataFrame(index=[1], columns=["A"])
result = dfa & dfa
assert_frame_equal(result, dfa)
def test_logical_ops_bool_frame(self):
# GH#5808
df1a_bool = DataFrame(True, index=[1], columns=["A"])
result = df1a_bool & df1a_bool
assert_frame_equal(result, df1a_bool)
result = df1a_bool | df1a_bool
assert_frame_equal(result, df1a_bool)
def test_logical_ops_int_frame(self):
# GH#5808
df1a_int = DataFrame(1, index=[1], columns=["A"])
df1a_bool = DataFrame(True, index=[1], columns=["A"])
result = df1a_int | df1a_bool
assert_frame_equal(result, df1a_int)
def test_logical_ops_invalid(self):
# GH#5808
df1 = DataFrame(1.0, index=[1], columns=["A"])
df2 = DataFrame(True, index=[1], columns=["A"])
with pytest.raises(TypeError):
df1 | df2
df1 = DataFrame("foo", index=[1], columns=["A"])
df2 = DataFrame(True, index=[1], columns=["A"])
with pytest.raises(TypeError):
df1 | df2
def test_logical_operators(self):
def _check_bin_op(op):
result = op(df1, df2)
expected = DataFrame(
op(df1.values, df2.values), index=df1.index, columns=df1.columns
)
assert result.values.dtype == np.bool_
assert_frame_equal(result, expected)
def _check_unary_op(op):
result = op(df1)
expected = DataFrame(op(df1.values), index=df1.index, columns=df1.columns)
assert result.values.dtype == np.bool_
assert_frame_equal(result, expected)
df1 = {
"a": {"a": True, "b": False, "c": False, "d": True, "e": True},
"b": {"a": False, "b": True, "c": False, "d": False, "e": False},
"c": {"a": False, "b": False, "c": True, "d": False, "e": False},
"d": {"a": True, "b": False, "c": False, "d": True, "e": True},
"e": {"a": True, "b": False, "c": False, "d": True, "e": True},
}
df2 = {
"a": {"a": True, "b": False, "c": True, "d": False, "e": False},
"b": {"a": False, "b": True, "c": False, "d": False, "e": False},
"c": {"a": True, "b": False, "c": True, "d": False, "e": False},
"d": {"a": False, "b": False, "c": False, "d": True, "e": False},
"e": {"a": False, "b": False, "c": False, "d": False, "e": True},
}
df1 = DataFrame(df1)
df2 = DataFrame(df2)
_check_bin_op(operator.and_)
_check_bin_op(operator.or_)
_check_bin_op(operator.xor)
_check_unary_op(operator.inv) # TODO: belongs elsewhere
def test_logical_with_nas(self):
d = DataFrame({"a": [np.nan, False], "b": [True, True]})
# GH4947
# bool comparisons should return bool
result = d["a"] | d["b"]
expected = Series([False, True])
assert_series_equal(result, expected)
# GH4604, automatic casting here
result = d["a"].fillna(False) | d["b"]
expected = Series([True, True])
assert_series_equal(result, expected)
result = d["a"].fillna(False, downcast=False) | d["b"]
expected = Series([True, True])
assert_series_equal(result, expected)
class TestDataFrameOperators:
@pytest.mark.parametrize(
"op", [operator.add, operator.sub, operator.mul, operator.truediv]
)
def test_operators_none_as_na(self, op):
df = DataFrame(
{"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object
)
# since filling converts dtypes from object, changed expected to be
# object
filled = df.fillna(np.nan)
result = op(df, 3)
expected = op(filled, 3).astype(object)
expected[com.isna(expected)] = None
assert_frame_equal(result, expected)
result = op(df, df)
expected = op(filled, filled).astype(object)
expected[com.isna(expected)] = None
assert_frame_equal(result, expected)
result = op(df, df.fillna(7))
assert_frame_equal(result, expected)
result = op(df.fillna(7), df)
assert_frame_equal(result, expected, check_dtype=False)
@pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)])
# TODO: not sure what's correct here.
@pytest.mark.filterwarnings("ignore:elementwise:FutureWarning")
def test_logical_typeerror_with_non_valid(self, op, res, float_frame):
# we are comparing floats vs a string
result = getattr(float_frame, op)("foo")
assert bool(result.all().all()) is res
def test_binary_ops_align(self):
# test aligning binary ops
# GH 6681
index = MultiIndex.from_product(
[list("abc"), ["one", "two", "three"], [1, 2, 3]],
names=["first", "second", "third"],
)
df = DataFrame(
np.arange(27 * 3).reshape(27, 3),
index=index,
columns=["value1", "value2", "value3"],
).sort_index()
idx = pd.IndexSlice
for op in ["add", "sub", "mul", "div", "truediv"]:
opa = getattr(operator, op, None)
if opa is None:
continue
x = Series([1.0, 10.0, 100.0], [1, 2, 3])
result = getattr(df, op)(x, level="third", axis=0)
expected = pd.concat(
[opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()]
).sort_index()
assert_frame_equal(result, expected)
x = Series([1.0, 10.0], ["two", "three"])
result = getattr(df, op)(x, level="second", axis=0)
expected = (
pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.items()])
.reindex_like(df)
.sort_index()
)
assert_frame_equal(result, expected)
# GH9463 (alignment level of dataframe with series)
midx = MultiIndex.from_product([["A", "B"], ["a", "b"]])
df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx)
s = pd.Series({"a": 1, "b": 2})
df2 = df.copy()
df2.columns.names = ["lvl0", "lvl1"]
s2 = s.copy()
s2.index.name = "lvl1"
# different cases of integer/string level names:
res1 = df.mul(s, axis=1, level=1)
res2 = df.mul(s2, axis=1, level=1)
res3 = df2.mul(s, axis=1, level=1)
res4 = df2.mul(s2, axis=1, level=1)
res5 = df2.mul(s, axis=1, level="lvl1")
res6 = df2.mul(s2, axis=1, level="lvl1")
exp = DataFrame(
np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx
)
for res in [res1, res2]:
assert_frame_equal(res, exp)
exp.columns.names = ["lvl0", "lvl1"]
for res in [res3, res4, res5, res6]:
assert_frame_equal(res, exp)
def test_dti_tz_convert_to_utc(self):
base = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC")
idx1 = base.tz_convert("Asia/Tokyo")[:2]
idx2 = base.tz_convert("US/Eastern")[1:]
df1 = DataFrame({"A": [1, 2]}, index=idx1)
df2 = DataFrame({"A": [1, 1]}, index=idx2)
exp = DataFrame({"A": [np.nan, 3, np.nan]}, index=base)
assert_frame_equal(df1 + df2, exp)
def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame):
frame_copy = float_frame.reindex(float_frame.index[::2])
del frame_copy["D"]
frame_copy["C"][:5] = np.nan
added = float_frame + frame_copy
indexer = added["A"].dropna().index
exp = (float_frame["A"] * 2).copy()
tm.assert_series_equal(added["A"].dropna(), exp.loc[indexer])
exp.loc[~exp.index.isin(indexer)] = np.nan
tm.assert_series_equal(added["A"], exp.loc[added["A"].index])
assert np.isnan(added["C"].reindex(frame_copy.index)[:5]).all()
# assert(False)
assert np.isnan(added["D"]).all()
self_added = float_frame + float_frame
tm.assert_index_equal(self_added.index, float_frame.index)
added_rev = frame_copy + float_frame
assert np.isnan(added["D"]).all()
assert np.isnan(added_rev["D"]).all()
# corner cases
# empty
plus_empty = float_frame + DataFrame()
assert np.isnan(plus_empty.values).all()
empty_plus = DataFrame() + float_frame
assert np.isnan(empty_plus.values).all()
empty_empty = DataFrame() + DataFrame()
assert empty_empty.empty
# out of order
reverse = float_frame.reindex(columns=float_frame.columns[::-1])
assert_frame_equal(reverse + float_frame, float_frame * 2)
# mix vs float64, upcast
added = float_frame + mixed_float_frame
_check_mixed_float(added, dtype="float64")
added = mixed_float_frame + float_frame
_check_mixed_float(added, dtype="float64")
# mix vs mix
added = mixed_float_frame + mixed_float_frame
_check_mixed_float(added, dtype=dict(C=None))
# with int
added = float_frame + mixed_int_frame
_check_mixed_float(added, dtype="float64")
def test_combineSeries(
self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame
):
# Series
series = float_frame.xs(float_frame.index[0])
added = float_frame + series
for key, s in added.items():
assert_series_equal(s, float_frame[key] + series[key])
larger_series = series.to_dict()
larger_series["E"] = 1
larger_series = Series(larger_series)
larger_added = float_frame + larger_series
for key, s in float_frame.items():
assert_series_equal(larger_added[key], s + series[key])
assert "E" in larger_added
assert np.isnan(larger_added["E"]).all()
# no upcast needed
added = mixed_float_frame + series
_check_mixed_float(added)
# vs mix (upcast) as needed
added = mixed_float_frame + series.astype("float32")
_check_mixed_float(added, dtype=dict(C=None))
added = mixed_float_frame + series.astype("float16")
_check_mixed_float(added, dtype=dict(C=None))
# these raise with numexpr.....as we are adding an int64 to an
# uint64....weird vs int
# added = mixed_int_frame + (100*series).astype('int64')
# _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C =
# 'int64', D = 'int64'))
# added = mixed_int_frame + (100*series).astype('int32')
# _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C =
# 'int32', D = 'int64'))
# TimeSeries
ts = datetime_frame["A"]
# 10890
# we no longer allow auto timeseries broadcasting
# and require explicit broadcasting
added = datetime_frame.add(ts, axis="index")
for key, col in datetime_frame.items():
result = col + ts
assert_series_equal(added[key], result, check_names=False)
assert added[key].name == key
if col.name == ts.name:
assert result.name == "A"
else:
assert result.name is None
smaller_frame = datetime_frame[:-5]
smaller_added = smaller_frame.add(ts, axis="index")
tm.assert_index_equal(smaller_added.index, datetime_frame.index)
smaller_ts = ts[:-5]
smaller_added2 = datetime_frame.add(smaller_ts, axis="index")
assert_frame_equal(smaller_added, smaller_added2)
# length 0, result is all-nan
result = datetime_frame.add(ts[:0], axis="index")
expected = DataFrame(
np.nan, index=datetime_frame.index, columns=datetime_frame.columns
)
assert_frame_equal(result, expected)
# Frame is all-nan
result = datetime_frame[:0].add(ts, axis="index")
expected = DataFrame(
np.nan, index=datetime_frame.index, columns=datetime_frame.columns
)
assert_frame_equal(result, expected)
# empty but with non-empty index
frame = datetime_frame[:1].reindex(columns=[])
result = frame.mul(ts, axis="index")
assert len(result) == len(ts)
def test_combineFunc(self, float_frame, mixed_float_frame):
result = float_frame * 2
tm.assert_numpy_array_equal(result.values, float_frame.values * 2)
# vs mix
result = mixed_float_frame * 2
for c, s in result.items():
tm.assert_numpy_array_equal(s.values, mixed_float_frame[c].values * 2)
_check_mixed_float(result, dtype=dict(C=None))
result = DataFrame() * 2
assert result.index.equals(DataFrame().index)
assert len(result.columns) == 0
def test_comparisons(self, simple_frame, float_frame):
df1 = tm.makeTimeDataFrame()
df2 = tm.makeTimeDataFrame()
row = simple_frame.xs("a")
ndim_5 = np.ones(df1.shape + (1, 1, 1))
def test_comp(func):
result = func(df1, df2)
tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values))
with pytest.raises(ValueError, match="dim must be <= 2"):
func(df1, ndim_5)
result2 = func(simple_frame, row)
tm.assert_numpy_array_equal(
result2.values, func(simple_frame.values, row.values)
)
result3 = func(float_frame, 0)
tm.assert_numpy_array_equal(result3.values, func(float_frame.values, 0))
msg = "Can only compare identically-labeled DataFrame"
with pytest.raises(ValueError, match=msg):
func(simple_frame, simple_frame[:2])
test_comp(operator.eq)
test_comp(operator.ne)
test_comp(operator.lt)
test_comp(operator.gt)
test_comp(operator.ge)
test_comp(operator.le)
def test_comparison_protected_from_errstate(self):
missing_df = tm.makeDataFrame()
missing_df.iloc[0]["A"] = np.nan
with np.errstate(invalid="ignore"):
expected = missing_df.values < 0
with np.errstate(invalid="raise"):
result = (missing_df < 0).values
tm.assert_numpy_array_equal(result, expected)
def test_boolean_comparison(self):
# GH 4576
# boolean comparisons with a tuple/list give unexpected results
df = DataFrame(np.arange(6).reshape((3, 2)))
b = np.array([2, 2])
b_r = np.atleast_2d([2, 2])
b_c = b_r.T
lst = [2, 2, 2]
tup = tuple(lst)
# gt
expected = DataFrame([[False, False], [False, True], [True, True]])
result = df > b
assert_frame_equal(result, expected)
result = df.values > b
assert_numpy_array_equal(result, expected.values)
msg1d = "Unable to coerce to Series, length must be 2: given 3"
msg2d = "Unable to coerce to DataFrame, shape must be"
msg2db = "operands could not be broadcast together with shapes"
with pytest.raises(ValueError, match=msg1d):
# wrong shape
df > lst
with pytest.raises(ValueError, match=msg1d):
# wrong shape
result = df > tup
# broadcasts like ndarray (GH#23000)
result = df > b_r
assert_frame_equal(result, expected)
result = df.values > b_r
assert_numpy_array_equal(result, expected.values)
with pytest.raises(ValueError, match=msg2d):
df > b_c
with pytest.raises(ValueError, match=msg2db):
df.values > b_c
# ==
expected = DataFrame([[False, False], [True, False], [False, False]])
result = df == b
assert_frame_equal(result, expected)
with pytest.raises(ValueError, match=msg1d):
result = df == lst
with pytest.raises(ValueError, match=msg1d):
result = df == tup
# broadcasts like ndarray (GH#23000)
result = df == b_r
assert_frame_equal(result, expected)
result = df.values == b_r
assert_numpy_array_equal(result, expected.values)
with pytest.raises(ValueError, match=msg2d):
df == b_c
assert df.values.shape != b_c.shape
# with alignment
df = DataFrame(
np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc")
)
expected.index = df.index
expected.columns = df.columns
with pytest.raises(ValueError, match=msg1d):
result = df == lst
with pytest.raises(ValueError, match=msg1d):
result = df == tup
def test_combine_generic(self, float_frame):
df1 = float_frame
df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]]
combined = df1.combine(df2, np.add)
combined2 = df2.combine(df1, np.add)
assert combined["D"].isna().all()
assert combined2["D"].isna().all()
chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]]
chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]]
exp = (
float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk)
* 2
)
assert_frame_equal(chunk, exp)
assert_frame_equal(chunk2, exp)
def test_inplace_ops_alignment(self):
# inplace ops / ops alignment
# GH 8511
columns = list("abcdefg")
X_orig = DataFrame(
np.arange(10 * len(columns)).reshape(-1, len(columns)),
columns=columns,
index=range(10),
)
Z = 100 * X_orig.iloc[:, 1:-1].copy()
block1 = list("bedcf")
subs = list("bcdef")
# add
X = X_orig.copy()
result1 = (X[block1] + Z).reindex(columns=subs)
X[block1] += Z
result2 = X.reindex(columns=subs)
X = X_orig.copy()
result3 = (X[block1] + Z[block1]).reindex(columns=subs)
X[block1] += Z[block1]
result4 = X.reindex(columns=subs)
assert_frame_equal(result1, result2)
assert_frame_equal(result1, result3)
assert_frame_equal(result1, result4)
# sub
X = X_orig.copy()
result1 = (X[block1] - Z).reindex(columns=subs)
X[block1] -= Z
result2 = X.reindex(columns=subs)
X = X_orig.copy()
result3 = (X[block1] - Z[block1]).reindex(columns=subs)
X[block1] -= Z[block1]
result4 = X.reindex(columns=subs)
assert_frame_equal(result1, result2)
assert_frame_equal(result1, result3)
assert_frame_equal(result1, result4)
def test_inplace_ops_identity(self):
# GH 5104
# make sure that we are actually changing the object
s_orig = Series([1, 2, 3])
df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))
# no dtype change
s = s_orig.copy()
s2 = s
s += 1
assert_series_equal(s, s2)
assert_series_equal(s_orig + 1, s)
assert s is s2
assert s._data is s2._data
df = df_orig.copy()
df2 = df
df += 1
assert_frame_equal(df, df2)
assert_frame_equal(df_orig + 1, df)
assert df is df2
assert df._data is df2._data
# dtype change
s = s_orig.copy()
s2 = s
s += 1.5
assert_series_equal(s, s2)
assert_series_equal(s_orig + 1.5, s)
df = df_orig.copy()
df2 = df
df += 1.5
assert_frame_equal(df, df2)
assert_frame_equal(df_orig + 1.5, df)
assert df is df2
assert df._data is df2._data
# mixed dtype
arr = np.random.randint(0, 10, size=5)
df_orig = DataFrame({"A": arr.copy(), "B": "foo"})
df = df_orig.copy()
df2 = df
df["A"] += 1
expected = DataFrame({"A": arr.copy() + 1, "B": "foo"})
assert_frame_equal(df, expected)
assert_frame_equal(df2, expected)
assert df._data is df2._data
df = df_orig.copy()
df2 = df
df["A"] += 1.5
expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"})
assert_frame_equal(df, expected)
assert_frame_equal(df2, expected)
assert df._data is df2._data
@pytest.mark.parametrize(
"op",
[
"add",
"and",
"div",
"floordiv",
"mod",
"mul",
"or",
"pow",
"sub",
"truediv",
"xor",
],
)
def test_inplace_ops_identity2(self, op):
if op == "div":
return
df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]})
operand = 2
if op in ("and", "or", "xor"):
# cannot use floats for boolean ops
df["a"] = [True, False, True]
df_copy = df.copy()
iop = "__i{}__".format(op)
op = "__{}__".format(op)
# no id change and value is correct
getattr(df, iop)(operand)
expected = getattr(df_copy, op)(operand)
assert_frame_equal(df, expected)
expected = id(df)
assert id(df) == expected
def test_alignment_non_pandas(self):
index = ["A", "B", "C"]
columns = ["X", "Y", "Z"]
df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns)
align = pd.core.ops._align_method_FRAME
for val in [
[1, 2, 3],
(1, 2, 3),
np.array([1, 2, 3], dtype=np.int64),
range(1, 4),
]:
tm.assert_series_equal(
align(df, val, "index"), Series([1, 2, 3], index=df.index)
)
tm.assert_series_equal(
align(df, val, "columns"), Series([1, 2, 3], index=df.columns)
)
# length mismatch
msg = "Unable to coerce to Series, length must be 3: given 2"
for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]:
with pytest.raises(ValueError, match=msg):
align(df, val, "index")
with pytest.raises(ValueError, match=msg):
align(df, val, "columns")
val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
tm.assert_frame_equal(
align(df, val, "index"), DataFrame(val, index=df.index, columns=df.columns)
)
tm.assert_frame_equal(
align(df, val, "columns"),
DataFrame(val, index=df.index, columns=df.columns),
)
# shape mismatch
msg = "Unable to coerce to DataFrame, shape must be"
val = np.array([[1, 2, 3], [4, 5, 6]])
with pytest.raises(ValueError, match=msg):
align(df, val, "index")
with pytest.raises(ValueError, match=msg):
align(df, val, "columns")
val = np.zeros((3, 3, 3))
with pytest.raises(ValueError):
align(df, val, "index")
with pytest.raises(ValueError):
align(df, val, "columns")
def test_no_warning(self, all_arithmetic_operators):
df = pd.DataFrame({"A": [0.0, 0.0], "B": [0.0, None]})
b = df["B"]
with tm.assert_produces_warning(None):
getattr(df, all_arithmetic_operators)(b, 0)
class TestTranspose:
def test_transpose_tzaware_1col_single_tz(self):
# GH#26825
dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")
df = pd.DataFrame(dti)
assert (df.dtypes == dti.dtype).all()
res = df.T
assert (res.dtypes == dti.dtype).all()
def test_transpose_tzaware_2col_single_tz(self):
# GH#26825
dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")
df3 = pd.DataFrame({"A": dti, "B": dti})
assert (df3.dtypes == dti.dtype).all()
res3 = df3.T
assert (res3.dtypes == dti.dtype).all()
def test_transpose_tzaware_2col_mixed_tz(self):
# GH#26825
dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")
dti2 = dti.tz_convert("US/Pacific")
df4 = pd.DataFrame({"A": dti, "B": dti2})
assert (df4.dtypes == [dti.dtype, dti2.dtype]).all()
assert (df4.T.dtypes == object).all()
tm.assert_frame_equal(df4.T.T, df4)
def test_transpose_object_to_tzaware_mixed_tz(self):
# GH#26825
dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")
dti2 = dti.tz_convert("US/Pacific")
# mixed all-tzaware dtypes
df2 = pd.DataFrame([dti, dti2])
assert (df2.dtypes == object).all()
res2 = df2.T
assert (res2.dtypes == [dti.dtype, dti2.dtype]).all()

View File

@@ -0,0 +1,156 @@
from datetime import timedelta
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
PeriodIndex,
Timedelta,
date_range,
period_range,
to_datetime,
)
import pandas.util.testing as tm
def _permute(obj):
return obj.take(np.random.permutation(len(obj)))
class TestPeriodIndex:
def test_as_frame_columns(self):
rng = period_range("1/1/2000", periods=5)
df = DataFrame(np.random.randn(10, 5), columns=rng)
ts = df[rng[0]]
tm.assert_series_equal(ts, df.iloc[:, 0])
# GH # 1211
repr(df)
ts = df["1/1/2000"]
tm.assert_series_equal(ts, df.iloc[:, 0])
def test_frame_setitem(self):
rng = period_range("1/1/2000", periods=5, name="index")
df = DataFrame(np.random.randn(5, 3), index=rng)
df["Index"] = rng
rs = Index(df["Index"])
tm.assert_index_equal(rs, rng, check_names=False)
assert rs.name == "Index"
assert rng.name == "index"
rs = df.reset_index().set_index("index")
assert isinstance(rs.index, PeriodIndex)
tm.assert_index_equal(rs.index, rng)
def test_frame_to_time_stamp(self):
K = 5
index = period_range(freq="A", start="1/1/2001", end="12/1/2009")
df = DataFrame(np.random.randn(len(index), K), index=index)
df["mix"] = "a"
exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC")
exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns")
result = df.to_timestamp("D", "end")
tm.assert_index_equal(result.index, exp_index)
tm.assert_numpy_array_equal(result.values, df.values)
exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN")
result = df.to_timestamp("D", "start")
tm.assert_index_equal(result.index, exp_index)
def _get_with_delta(delta, freq="A-DEC"):
return date_range(
to_datetime("1/1/2001") + delta,
to_datetime("12/31/2009") + delta,
freq=freq,
)
delta = timedelta(hours=23)
result = df.to_timestamp("H", "end")
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns")
tm.assert_index_equal(result.index, exp_index)
delta = timedelta(hours=23, minutes=59)
result = df.to_timestamp("T", "end")
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns")
tm.assert_index_equal(result.index, exp_index)
result = df.to_timestamp("S", "end")
delta = timedelta(hours=23, minutes=59, seconds=59)
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
tm.assert_index_equal(result.index, exp_index)
# columns
df = df.T
exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC")
exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns")
result = df.to_timestamp("D", "end", axis=1)
tm.assert_index_equal(result.columns, exp_index)
tm.assert_numpy_array_equal(result.values, df.values)
exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN")
result = df.to_timestamp("D", "start", axis=1)
tm.assert_index_equal(result.columns, exp_index)
delta = timedelta(hours=23)
result = df.to_timestamp("H", "end", axis=1)
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns")
tm.assert_index_equal(result.columns, exp_index)
delta = timedelta(hours=23, minutes=59)
result = df.to_timestamp("T", "end", axis=1)
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns")
tm.assert_index_equal(result.columns, exp_index)
result = df.to_timestamp("S", "end", axis=1)
delta = timedelta(hours=23, minutes=59, seconds=59)
exp_index = _get_with_delta(delta)
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
tm.assert_index_equal(result.columns, exp_index)
# invalid axis
with pytest.raises(ValueError, match="axis"):
df.to_timestamp(axis=2)
result1 = df.to_timestamp("5t", axis=1)
result2 = df.to_timestamp("t", axis=1)
expected = pd.date_range("2001-01-01", "2009-01-01", freq="AS")
assert isinstance(result1.columns, DatetimeIndex)
assert isinstance(result2.columns, DatetimeIndex)
tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
# PeriodIndex.to_timestamp always use 'infer'
assert result1.columns.freqstr == "AS-JAN"
assert result2.columns.freqstr == "AS-JAN"
def test_frame_index_to_string(self):
index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M")
frame = DataFrame(np.random.randn(3, 4), index=index)
# it works!
frame.to_string()
def test_align_frame(self):
rng = period_range("1/1/2000", "1/1/2010", freq="A")
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
result = ts + ts[::2]
expected = ts + ts
expected.values[1::2] = np.nan
tm.assert_frame_equal(result, expected)
result = ts + _permute(ts[::2])
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,469 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameQuantile:
def test_quantile(self, datetime_frame):
from numpy import percentile
df = datetime_frame
q = df.quantile(0.1, axis=0)
assert q["A"] == percentile(df["A"], 10)
tm.assert_index_equal(q.index, df.columns)
q = df.quantile(0.9, axis=1)
assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90)
tm.assert_index_equal(q.index, df.index)
# test degenerate case
q = DataFrame({"x": [], "y": []}).quantile(0.1, axis=0)
assert np.isnan(q["x"]) and np.isnan(q["y"])
# non-numeric exclusion
df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
rs = df.quantile(0.5)
xp = df.median().rename(0.5)
assert_series_equal(rs, xp)
# axis
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(0.5, axis=1)
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
assert_series_equal(result, expected)
result = df.quantile([0.5, 0.75], axis=1)
expected = DataFrame(
{1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75]
)
assert_frame_equal(result, expected, check_index_type=True)
# We may want to break API in the future to change this
# so that we exclude non-numeric along the same axis
# See GH #7312
df = DataFrame([[1, 2, 3], ["a", "b", 4]])
result = df.quantile(0.5, axis=1)
expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
assert_series_equal(result, expected)
def test_quantile_axis_mixed(self):
# mixed on axis=1
df = DataFrame(
{
"A": [1, 2, 3],
"B": [2.0, 3.0, 4.0],
"C": pd.date_range("20130101", periods=3),
"D": ["foo", "bar", "baz"],
}
)
result = df.quantile(0.5, axis=1)
expected = Series([1.5, 2.5, 3.5], name=0.5)
assert_series_equal(result, expected)
# must raise
with pytest.raises(TypeError):
df.quantile(0.5, axis=1, numeric_only=False)
def test_quantile_axis_parameter(self):
# GH 9543/9544
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(0.5, axis=0)
expected = Series([2.0, 3.0], index=["A", "B"], name=0.5)
assert_series_equal(result, expected)
expected = df.quantile(0.5, axis="index")
assert_series_equal(result, expected)
result = df.quantile(0.5, axis=1)
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
assert_series_equal(result, expected)
result = df.quantile(0.5, axis="columns")
assert_series_equal(result, expected)
msg = "No axis named -1 for object type <class 'pandas.core.frame.DataFrame'>"
with pytest.raises(ValueError, match=msg):
df.quantile(0.1, axis=-1)
msg = (
"No axis named column for object type"
" <class 'pandas.core.frame.DataFrame'>"
)
with pytest.raises(ValueError, match=msg):
df.quantile(0.1, axis="column")
def test_quantile_interpolation(self):
# see gh-10174
# interpolation method other than default linear
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(0.5, axis=1, interpolation="nearest")
expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
tm.assert_series_equal(result, expected)
# cross-check interpolation=nearest results in original dtype
exp = np.percentile(
np.array([[1, 2, 3], [2, 3, 4]]), 0.5, axis=0, interpolation="nearest"
)
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64")
tm.assert_series_equal(result, expected)
# float
df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3])
result = df.quantile(0.5, axis=1, interpolation="nearest")
expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5)
tm.assert_series_equal(result, expected)
exp = np.percentile(
np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]),
0.5,
axis=0,
interpolation="nearest",
)
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64")
assert_series_equal(result, expected)
# axis
result = df.quantile([0.5, 0.75], axis=1, interpolation="lower")
expected = DataFrame(
{1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75]
)
assert_frame_equal(result, expected)
# test degenerate case
df = DataFrame({"x": [], "y": []})
q = df.quantile(0.1, axis=0, interpolation="higher")
assert np.isnan(q["x"]) and np.isnan(q["y"])
# multi
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
result = df.quantile([0.25, 0.5], interpolation="midpoint")
# https://github.com/numpy/numpy/issues/7163
expected = DataFrame(
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
index=[0.25, 0.5],
columns=["a", "b", "c"],
)
assert_frame_equal(result, expected)
def test_quantile_interpolation_datetime(self, datetime_frame):
# see gh-10174
# interpolation = linear (default case)
df = datetime_frame
q = df.quantile(0.1, axis=0, interpolation="linear")
assert q["A"] == np.percentile(df["A"], 10)
def test_quantile_interpolation_int(self, int_frame):
# see gh-10174
df = int_frame
# interpolation = linear (default case)
q = df.quantile(0.1)
assert q["A"] == np.percentile(df["A"], 10)
# test with and without interpolation keyword
q1 = df.quantile(0.1, axis=0, interpolation="linear")
assert q1["A"] == np.percentile(df["A"], 10)
tm.assert_series_equal(q, q1)
def test_quantile_multi(self):
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
result = df.quantile([0.25, 0.5])
expected = DataFrame(
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
index=[0.25, 0.5],
columns=["a", "b", "c"],
)
assert_frame_equal(result, expected)
# axis = 1
result = df.quantile([0.25, 0.5], axis=1)
expected = DataFrame(
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2]
)
# empty
result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0)
expected = DataFrame(
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
)
assert_frame_equal(result, expected)
def test_quantile_datetime(self):
df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]})
# exclude datetime
result = df.quantile(0.5)
expected = Series([2.5], index=["b"])
# datetime
result = df.quantile(0.5, numeric_only=False)
expected = Series(
[Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5
)
assert_series_equal(result, expected)
# datetime w/ multi
result = df.quantile([0.5], numeric_only=False)
expected = DataFrame(
[[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"]
)
assert_frame_equal(result, expected)
# axis = 1
df["c"] = pd.to_datetime(["2011", "2012"])
result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False)
expected = Series(
[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")],
index=[0, 1],
name=0.5,
)
assert_series_equal(result, expected)
result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False)
expected = DataFrame(
[[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]],
index=[0.5],
columns=[0, 1],
)
assert_frame_equal(result, expected)
# empty when numeric_only=True
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# result = df[['a', 'c']].quantile(.5)
# result = df[['a', 'c']].quantile([.5])
def test_quantile_invalid(self, datetime_frame):
msg = "percentiles should all be in the interval \\[0, 1\\]"
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
with pytest.raises(ValueError, match=msg):
datetime_frame.quantile(invalid)
def test_quantile_box(self):
df = DataFrame(
{
"A": [
pd.Timestamp("2011-01-01"),
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-03"),
],
"B": [
pd.Timestamp("2011-01-01", tz="US/Eastern"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timestamp("2011-01-03", tz="US/Eastern"),
],
"C": [
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
],
}
)
res = df.quantile(0.5, numeric_only=False)
exp = pd.Series(
[
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timedelta("2 days"),
],
name=0.5,
index=["A", "B", "C"],
)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = pd.DataFrame(
[
[
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timedelta("2 days"),
]
],
index=[0.5],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(res, exp)
# DatetimeBlock may be consolidated and contain NaT in different loc
df = DataFrame(
{
"A": [
pd.Timestamp("2011-01-01"),
pd.NaT,
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-03"),
],
"a": [
pd.Timestamp("2011-01-01"),
pd.Timestamp("2011-01-02"),
pd.NaT,
pd.Timestamp("2011-01-03"),
],
"B": [
pd.Timestamp("2011-01-01", tz="US/Eastern"),
pd.NaT,
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timestamp("2011-01-03", tz="US/Eastern"),
],
"b": [
pd.Timestamp("2011-01-01", tz="US/Eastern"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.NaT,
pd.Timestamp("2011-01-03", tz="US/Eastern"),
],
"C": [
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
pd.NaT,
],
"c": [
pd.NaT,
pd.Timedelta("1 days"),
pd.Timedelta("2 days"),
pd.Timedelta("3 days"),
],
},
columns=list("AaBbCc"),
)
res = df.quantile(0.5, numeric_only=False)
exp = pd.Series(
[
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timedelta("2 days"),
pd.Timedelta("2 days"),
],
name=0.5,
index=list("AaBbCc"),
)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = pd.DataFrame(
[
[
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-02"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timestamp("2011-01-02", tz="US/Eastern"),
pd.Timedelta("2 days"),
pd.Timedelta("2 days"),
]
],
index=[0.5],
columns=list("AaBbCc"),
)
tm.assert_frame_equal(res, exp)
def test_quantile_nan(self):
# GH 14357 - float block where some cols have missing values
df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
df.iloc[-1, 1] = np.nan
res = df.quantile(0.5)
exp = Series([3.0, 2.5], index=["a", "b"], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75])
exp = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]}, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
res = df.quantile(0.5, axis=1)
exp = Series(np.arange(1.0, 6.0), name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75], axis=1)
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
# full-nan column
df["b"] = np.nan
res = df.quantile(0.5)
exp = Series([3.0, np.nan], index=["a", "b"], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5, 0.75])
exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)
def test_quantile_nat(self):
# full NaT column
df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]})
res = df.quantile(0.5, numeric_only=False)
exp = Series([pd.NaT], index=["a"], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = DataFrame({"a": [pd.NaT]}, index=[0.5])
tm.assert_frame_equal(res, exp)
# mixed non-null / full null column
df = DataFrame(
{
"a": [
pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-02"),
pd.Timestamp("2012-01-03"),
],
"b": [pd.NaT, pd.NaT, pd.NaT],
}
)
res = df.quantile(0.5, numeric_only=False)
exp = Series([pd.Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5], numeric_only=False)
exp = DataFrame(
[[pd.Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"]
)
tm.assert_frame_equal(res, exp)
def test_quantile_empty(self):
# floats
df = DataFrame(columns=["a", "b"], dtype="float64")
res = df.quantile(0.5)
exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
tm.assert_series_equal(res, exp)
res = df.quantile([0.5])
exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5])
tm.assert_frame_equal(res, exp)
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# res = df.quantile(0.5, axis=1)
# res = df.quantile([0.5], axis=1)
# ints
df = DataFrame(columns=["a", "b"], dtype="int64")
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# res = df.quantile(0.5)
# datetimes
df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]")
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
# res = df.quantile(0.5, numeric_only=False)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,316 @@
from datetime import datetime, timedelta
import numpy as np
import pytest
from pandas import DataFrame, Series
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal
class TestRank:
s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
df = DataFrame({"A": s, "B": s})
results = {
"average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
"min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
"max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
"first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
"dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
}
@pytest.fixture(params=["average", "min", "max", "first", "dense"])
def method(self, request):
"""
Fixture for trying all rank methods
"""
return request.param
def test_rank(self, float_frame):
rankdata = pytest.importorskip("scipy.stats.rankdata")
float_frame["A"][::2] = np.nan
float_frame["B"][::3] = np.nan
float_frame["C"][::4] = np.nan
float_frame["D"][::5] = np.nan
ranks0 = float_frame.rank()
ranks1 = float_frame.rank(1)
mask = np.isnan(float_frame.values)
fvals = float_frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fvals)
exp0[mask] = np.nan
exp1 = np.apply_along_axis(rankdata, 1, fvals)
exp1[mask] = np.nan
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# integers
df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
result = df.rank()
exp = df.astype(float).rank()
tm.assert_frame_equal(result, exp)
result = df.rank(1)
exp = df.astype(float).rank(1)
tm.assert_frame_equal(result, exp)
def test_rank2(self):
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
result = df.rank(1, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([[1, 3, 2], [1, 2, 3]])
expected = df.rank(0) / 2.0
result = df.rank(0, pct=True)
tm.assert_frame_equal(result, expected)
df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
result = df.rank(1, numeric_only=False)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
result = df.rank(0, numeric_only=False)
tm.assert_frame_equal(result, expected)
# f7u12, this does not work without extensive workaround
data = [
[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
[datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
]
df = DataFrame(data)
# check the rank
expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
result = df.rank(1, numeric_only=False, ascending=True)
tm.assert_frame_equal(result, expected)
expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
result = df.rank(1, numeric_only=False, ascending=False)
tm.assert_frame_equal(result, expected)
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
tm.assert_frame_equal(df.rank(), exp)
def test_rank_mixed_frame(self, float_string_frame):
float_string_frame["datetime"] = datetime.now()
float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
result = float_string_frame.rank(1)
expected = float_string_frame.rank(1, numeric_only=True)
tm.assert_frame_equal(result, expected)
def test_rank_na_option(self, float_frame):
rankdata = pytest.importorskip("scipy.stats.rankdata")
float_frame["A"][::2] = np.nan
float_frame["B"][::3] = np.nan
float_frame["C"][::4] = np.nan
float_frame["D"][::5] = np.nan
# bottom
ranks0 = float_frame.rank(na_option="bottom")
ranks1 = float_frame.rank(1, na_option="bottom")
fvals = float_frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fvals)
exp1 = np.apply_along_axis(rankdata, 1, fvals)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# top
ranks0 = float_frame.rank(na_option="top")
ranks1 = float_frame.rank(1, na_option="top")
fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
fval1 = float_frame.T
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
fval1 = fval1.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, fval0)
exp1 = np.apply_along_axis(rankdata, 1, fval1)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# descending
# bottom
ranks0 = float_frame.rank(na_option="top", ascending=False)
ranks1 = float_frame.rank(1, na_option="top", ascending=False)
fvals = float_frame.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, -fvals)
exp1 = np.apply_along_axis(rankdata, 1, -fvals)
tm.assert_almost_equal(ranks0.values, exp0)
tm.assert_almost_equal(ranks1.values, exp1)
# descending
# top
ranks0 = float_frame.rank(na_option="bottom", ascending=False)
ranks1 = float_frame.rank(1, na_option="bottom", ascending=False)
fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
fval1 = float_frame.T
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
fval1 = fval1.fillna(np.inf).values
exp0 = np.apply_along_axis(rankdata, 0, -fval0)
exp1 = np.apply_along_axis(rankdata, 1, -fval1)
tm.assert_numpy_array_equal(ranks0.values, exp0)
tm.assert_numpy_array_equal(ranks1.values, exp1)
# bad values throw error
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
with pytest.raises(ValueError, match=msg):
float_frame.rank(na_option="bad", ascending=False)
# invalid type
with pytest.raises(ValueError, match=msg):
float_frame.rank(na_option=True, ascending=False)
def test_rank_axis(self):
# check if using axes' names gives the same result
df = DataFrame([[2, 1], [4, 3]])
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index"))
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns"))
def test_rank_methods_frame(self):
pytest.importorskip("scipy.stats.special")
rankdata = pytest.importorskip("scipy.stats.rankdata")
xs = np.random.randint(0, 21, (100, 26))
xs = (xs - 10.0) / 10.0
cols = [chr(ord("z") - i) for i in range(xs.shape[1])]
for vals in [xs, xs + 1e6, xs * 1e-6]:
df = DataFrame(vals, columns=cols)
for ax in [0, 1]:
for m in ["average", "min", "max", "first", "dense"]:
result = df.rank(axis=ax, method=m)
sprank = np.apply_along_axis(
rankdata, ax, vals, m if m != "first" else "ordinal"
)
sprank = sprank.astype(np.float64)
expected = DataFrame(sprank, columns=cols).astype("float64")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
def test_rank_descending(self, method, dtype):
if "i" in dtype:
df = self.df.dropna()
else:
df = self.df.astype(dtype)
res = df.rank(ascending=False)
expected = (df.max() - df).rank()
assert_frame_equal(res, expected)
if method == "first" and dtype == "O":
return
expected = (df.max() - df).rank(method=method)
if dtype != "O":
res2 = df.rank(method=method, ascending=False, numeric_only=True)
assert_frame_equal(res2, expected)
res3 = df.rank(method=method, ascending=False, numeric_only=False)
assert_frame_equal(res3, expected)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("dtype", [None, object])
def test_rank_2d_tie_methods(self, method, axis, dtype):
df = self.df
def _check2d(df, expected, method="average", axis=0):
exp_df = DataFrame({"A": expected, "B": expected})
if axis == 1:
df = df.T
exp_df = exp_df.T
result = df.rank(method=method, axis=axis)
assert_frame_equal(result, exp_df)
disabled = {(object, "first")}
if (dtype, method) in disabled:
return
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, self.results[method], method=method, axis=axis)
@pytest.mark.parametrize(
"method,exp",
[
("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]),
(
"min",
[
[1.0 / 3, 1.0, 1.0],
[1.0 / 3, 1.0 / 3, 2.0 / 3],
[1.0 / 3, 1.0 / 3, 1.0 / 3],
],
),
(
"max",
[[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]],
),
(
"average",
[[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]],
),
(
"first",
[
[1.0 / 3, 1.0, 1.0],
[2.0 / 3, 1.0 / 3, 2.0 / 3],
[3.0 / 3, 2.0 / 3, 1.0 / 3],
],
),
],
)
def test_rank_pct_true(self, method, exp):
# see gh-15630.
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
result = df.rank(method=method, pct=True)
expected = DataFrame(exp)
tm.assert_frame_equal(result, expected)
@pytest.mark.single
@pytest.mark.high_memory
def test_pct_max_many_rows(self):
# GH 18271
df = DataFrame(
{"A": np.arange(2 ** 24 + 1), "B": np.arange(2 ** 24 + 1, 0, -1)}
)
result = df.rank(pct=True).max()
assert (result == 1).all()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,549 @@
from datetime import datetime, timedelta
from io import StringIO
import re
import sys
import textwrap
import numpy as np
import pytest
from pandas.compat import PYPY
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
date_range,
option_context,
period_range,
)
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
import pandas.io.formats.format as fmt
# Segregated collection of methods that require the BlockManager internal data
# structure
class TestDataFrameReprInfoEtc(TestData):
def test_repr_empty(self):
# empty
foo = repr(self.empty) # noqa
# empty with index
frame = DataFrame(index=np.arange(1000))
foo = repr(frame) # noqa
def test_repr_mixed(self):
buf = StringIO()
# mixed
foo = repr(self.mixed_frame) # noqa
self.mixed_frame.info(verbose=False, buf=buf)
@pytest.mark.slow
def test_repr_mixed_big(self):
# big mixed
biggie = DataFrame(
{"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200)
)
biggie.loc[:20, "A"] = np.nan
biggie.loc[:20, "B"] = np.nan
foo = repr(biggie) # noqa
def test_repr(self):
buf = StringIO()
# small one
foo = repr(self.frame)
self.frame.info(verbose=False, buf=buf)
# even smaller
self.frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
self.frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)
# exhausting cases in DataFrame.info
# columns but no index
no_index = DataFrame(columns=[0, 1, 3])
foo = repr(no_index) # noqa
# no columns or index
self.empty.info(buf=buf)
df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
assert "\t" not in repr(df)
assert "\r" not in repr(df)
assert "a\n" not in repr(df)
def test_repr_dimensions(self):
df = DataFrame([[1, 2], [3, 4]])
with option_context("display.show_dimensions", True):
assert "2 rows x 2 columns" in repr(df)
with option_context("display.show_dimensions", False):
assert "2 rows x 2 columns" not in repr(df)
with option_context("display.show_dimensions", "truncate"):
assert "2 rows x 2 columns" not in repr(df)
@pytest.mark.slow
def test_repr_big(self):
# big one
biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200))
repr(biggie)
def test_repr_unsortable(self):
# columns are not sortable
import warnings
warn_filters = warnings.filters
warnings.filterwarnings("ignore", category=FutureWarning, module=".*format")
unsortable = DataFrame(
{
"foo": [1] * 50,
datetime.today(): [1] * 50,
"bar": ["bar"] * 50,
datetime.today() + timedelta(1): ["bar"] * 50,
},
index=np.arange(50),
)
repr(unsortable)
fmt.set_option("display.precision", 3, "display.column_space", 10)
repr(self.frame)
fmt.set_option("display.max_rows", 10, "display.max_columns", 2)
repr(self.frame)
fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000)
repr(self.frame)
tm.reset_display_options()
warnings.filters = warn_filters
def test_repr_unicode(self):
uval = "\u03c3\u03c3\u03c3\u03c3"
# TODO(wesm): is this supposed to be used?
bval = uval.encode("utf-8") # noqa
df = DataFrame({"A": [uval, uval]})
result = repr(df)
ex_top = " A"
assert result.split("\n")[0].rstrip() == ex_top
df = DataFrame({"A": [uval, uval]})
result = repr(df)
assert result.split("\n")[0].rstrip() == ex_top
def test_unicode_string_with_unicode(self):
df = DataFrame({"A": ["\u05d0"]})
str(df)
def test_str_to_bytes_raises(self):
# GH 26447
df = DataFrame({"A": ["abc"]})
msg = "^'str' object cannot be interpreted as an integer$"
with pytest.raises(TypeError, match=msg):
bytes(df)
def test_very_wide_info_repr(self):
df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20))
repr(df)
def test_repr_column_name_unicode_truncation_bug(self):
# #1906
df = DataFrame(
{
"Id": [7117434],
"StringCol": (
"Is it possible to modify drop plot code"
" so that the output graph is displayed "
"in iphone simulator, Is it possible to "
"modify drop plot code so that the "
"output graph is \xe2\x80\xa8displayed "
"in iphone simulator.Now we are adding "
"the CSV file externally. I want to Call"
" the File through the code.."
),
}
)
with option_context("display.max_columns", 20):
assert "StringCol" in repr(df)
def test_latex_repr(self):
result = r"""\begin{tabular}{llll}
\toprule
{} & 0 & 1 & 2 \\
\midrule
0 & $\alpha$ & b & c \\
1 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""
with option_context("display.latex.escape", False, "display.latex.repr", True):
df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]])
assert result == df._repr_latex_()
# GH 12182
assert df._repr_latex_() is None
def test_info(self):
io = StringIO()
self.frame.info(buf=io)
self.tsframe.info(buf=io)
frame = DataFrame(np.random.randn(5, 3))
frame.info()
frame.info(verbose=False)
def test_info_memory(self):
# https://github.com/pandas-dev/pandas/issues/21056
df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")})
buf = StringIO()
df.info(buf=buf)
result = buf.getvalue()
bytes = float(df.memory_usage().sum())
expected = textwrap.dedent(
"""\
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
a 2 non-null int64
dtypes: int64(1)
memory usage: {} bytes
""".format(
bytes
)
)
assert result == expected
def test_info_wide(self):
from pandas import set_option, reset_option
io = StringIO()
df = DataFrame(np.random.randn(5, 101))
df.info(buf=io)
io = StringIO()
df.info(buf=io, max_cols=101)
rs = io.getvalue()
assert len(rs.splitlines()) > 100
xp = rs
set_option("display.max_info_columns", 101)
io = StringIO()
df.info(buf=io)
assert rs == xp
reset_option("display.max_info_columns")
def test_info_duplicate_columns(self):
io = StringIO()
# it works!
frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
frame.info(buf=io)
def test_info_duplicate_columns_shows_correct_dtypes(self):
# GH11761
io = StringIO()
frame = DataFrame([[1, 2.0]], columns=["a", "a"])
frame.info(buf=io)
io.seek(0)
lines = io.readlines()
assert "a 1 non-null int64\n" == lines[3]
assert "a 1 non-null float64\n" == lines[4]
def test_info_shows_column_dtypes(self):
dtypes = [
"int64",
"float64",
"datetime64[ns]",
"timedelta64[ns]",
"complex128",
"object",
"bool",
]
data = {}
n = 10
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
buf = StringIO()
df.info(buf=buf)
res = buf.getvalue()
for i, dtype in enumerate(dtypes):
name = "{i:d} {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype)
assert name in res
def test_info_max_cols(self):
df = DataFrame(np.random.randn(10, 5))
for len_, verbose in [(5, None), (5, False), (10, True)]:
# For verbose always ^ setting ^ summarize ^ full output
with option_context("max_info_columns", 4):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_
for len_, verbose in [(10, None), (5, False), (10, True)]:
# max_cols no exceeded
with option_context("max_info_columns", 5):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_
for len_, max_cols in [(10, 5), (5, 4)]:
# setting truncates
with option_context("max_info_columns", 4):
buf = StringIO()
df.info(buf=buf, max_cols=max_cols)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_
# setting wouldn't truncate
with option_context("max_info_columns", 5):
buf = StringIO()
df.info(buf=buf, max_cols=max_cols)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_
def test_info_memory_usage(self):
# Ensure memory usage is displayed, when asserted, on the last line
dtypes = [
"int64",
"float64",
"datetime64[ns]",
"timedelta64[ns]",
"complex128",
"object",
"bool",
]
data = {}
n = 10
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
buf = StringIO()
# display memory usage case
df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert "memory usage: " in res[-1]
# do not display memory usage case
df.info(buf=buf, memory_usage=False)
res = buf.getvalue().splitlines()
assert "memory usage: " not in res[-1]
df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# memory usage is a lower bound, so print it as XYZ+ MB
assert re.match(r"memory usage: [^+]+\+", res[-1])
df.iloc[:, :5].info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# excluded column with object dtype, so estimate is accurate
assert not re.match(r"memory usage: [^+]+\+", res[-1])
# Test a DataFrame with duplicate columns
dtypes = ["int64", "int64", "int64", "float64"]
data = {}
n = 100
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
df.columns = dtypes
df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])
df_with_object_index.info(buf=buf, memory_usage="deep")
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+$", res[-1])
# Ensure df size is as expected
# (cols * rows * bytes) + index size
df_size = df.memory_usage().sum()
exp_size = len(dtypes) * n * 8 + df.index.nbytes
assert df_size == exp_size
# Ensure number of cols in memory_usage is the same as df
size_df = np.size(df.columns.values) + 1 # index=True; default
assert size_df == np.size(df.memory_usage())
# assert deep works only on object
assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
# test for validity
DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
DataFrame(1, index=["a"], columns=["A"]).index.nbytes
df = DataFrame(
data=1,
index=pd.MultiIndex.from_product([["a"], range(1000)]),
columns=["A"],
)
df.index.nbytes
df.memory_usage(index=True)
df.index.values.nbytes
mem = df.memory_usage(deep=True).sum()
assert mem > 0
@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy(self):
df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
assert (
df_with_object_index.memory_usage(index=True, deep=True).sum()
> df_with_object_index.memory_usage(index=True).sum()
)
df_object = pd.DataFrame({"a": ["a"]})
assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy(self):
df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
assert (
df_with_object_index.memory_usage(index=True, deep=True).sum()
== df_with_object_index.memory_usage(index=True).sum()
)
df_object = pd.DataFrame({"a": ["a"]})
assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof(self):
df = DataFrame(
data=1,
index=pd.MultiIndex.from_product([["a"], range(1000)]),
columns=["A"],
)
mem = df.memory_usage(deep=True).sum()
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = mem - sys.getsizeof(df)
assert abs(diff) < 100
def test_info_memory_usage_qualified(self):
buf = StringIO()
df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
df.info(buf=buf)
assert "+" not in buf.getvalue()
buf = StringIO()
df = DataFrame(1, columns=list("ab"), index=list("ABC"))
df.info(buf=buf)
assert "+" in buf.getvalue()
buf = StringIO()
df = DataFrame(
1,
columns=list("ab"),
index=pd.MultiIndex.from_product([range(3), range(3)]),
)
df.info(buf=buf)
assert "+" not in buf.getvalue()
buf = StringIO()
df = DataFrame(
1,
columns=list("ab"),
index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]),
)
df.info(buf=buf)
assert "+" in buf.getvalue()
def test_info_memory_usage_bug_on_multiindex(self):
# GH 14308
# memory usage introspection should not materialize .values
from string import ascii_uppercase as uppercase
def memory_usage(f):
return f.memory_usage(deep=True).sum()
N = 100
M = len(uppercase)
index = pd.MultiIndex.from_product(
[list(uppercase), pd.date_range("20160101", periods=N)],
names=["id", "date"],
)
df = DataFrame({"value": np.random.randn(N * M)}, index=index)
unstacked = df.unstack("id")
assert df.values.nbytes == unstacked.values.nbytes
assert memory_usage(df) > memory_usage(unstacked)
# high upper bound
assert memory_usage(unstacked) - memory_usage(df) < 2000
def test_info_categorical(self):
# GH14298
idx = pd.CategoricalIndex(["a", "b"])
df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
buf = StringIO()
df.info(buf=buf)
def test_info_categorical_column(self):
# make sure it works
n = 2500
df = DataFrame({"int64": np.random.randint(100, size=n)})
df["category"] = Series(
np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
).astype("category")
df.isna()
buf = StringIO()
df.info(buf=buf)
df2 = df[df["category"] == "d"]
buf = StringIO()
df2.info(buf=buf)
def test_repr_categorical_dates_periods(self):
# normal DataFrame
dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
p = period_range("2011-01", freq="M", periods=5)
df = DataFrame({"dt": dt, "p": p})
exp = """ dt p
0 2011-01-01 09:00:00-05:00 2011-01
1 2011-01-01 10:00:00-05:00 2011-02
2 2011-01-01 11:00:00-05:00 2011-03
3 2011-01-01 12:00:00-05:00 2011-04
4 2011-01-01 13:00:00-05:00 2011-05"""
assert repr(df) == exp
df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)})
assert repr(df2) == exp
@pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64])
@pytest.mark.parametrize(
"box, expected",
[[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]],
)
def test_repr_np_nat_with_object(self, arg, box, expected):
# GH 25445
result = repr(box([arg("NaT")], dtype=object))
assert result == expected

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,93 @@
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
from pandas import DataFrame
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal
@pytest.fixture
def df_none():
return DataFrame(
{
"outer": ["a", "a", "a", "b", "b", "b"],
"inner": [1, 2, 2, 2, 1, 1],
"A": np.arange(6, 0, -1),
("B", 5): ["one", "one", "two", "two", "one", "one"],
}
)
@pytest.fixture(params=[["outer"], ["outer", "inner"]])
def df_idx(request, df_none):
levels = request.param
return df_none.set_index(levels)
@pytest.fixture(
params=[
"inner", # index level
["outer"], # list of index level
"A", # column
[("B", 5)], # list of column
["inner", "outer"], # two index levels
[("B", 5), "outer"], # index level and column
["A", ("B", 5)], # Two columns
["inner", "outer"], # two index levels and column
]
)
def sort_names(request):
return request.param
@pytest.fixture(params=[True, False])
def ascending(request):
return request.param
def test_sort_index_level_and_column_label(df_none, df_idx, sort_names, ascending):
# GH 14353
# Get index levels from df_idx
levels = df_idx.index.names
# Compute expected by sorting on columns and the setting index
expected = df_none.sort_values(
by=sort_names, ascending=ascending, axis=0
).set_index(levels)
# Compute result sorting on mix on columns and index levels
result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0)
assert_frame_equal(result, expected)
def test_sort_column_level_and_index_label(df_none, df_idx, sort_names, ascending):
# GH 14353
# Get levels from df_idx
levels = df_idx.index.names
# Compute expected by sorting on axis=0, setting index levels, and then
# transposing. For some cases this will result in a frame with
# multiple column levels
expected = (
df_none.sort_values(by=sort_names, ascending=ascending, axis=0)
.set_index(levels)
.T
)
# Compute result by transposing and sorting on axis=1.
result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1)
if len(levels) > 1:
# Accessing multi-level columns that are not lexsorted raises a
# performance warning
with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False):
assert_frame_equal(result, expected)
else:
assert_frame_equal(result, expected)

View File

@@ -0,0 +1,739 @@
import random
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
IntervalIndex,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
)
from pandas.api.types import CategoricalDtype
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestDataFrameSorting(TestData):
def test_sort_values(self):
frame = DataFrame(
[[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC")
)
# by column (axis=0)
sorted_df = frame.sort_values(by="A")
indexer = frame["A"].argsort().values
expected = frame.loc[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by="A", ascending=False)
indexer = indexer[::-1]
expected = frame.loc[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by="A", ascending=False)
assert_frame_equal(sorted_df, expected)
# GH4839
sorted_df = frame.sort_values(by=["A"], ascending=[False])
assert_frame_equal(sorted_df, expected)
# multiple bys
sorted_df = frame.sort_values(by=["B", "C"])
expected = frame.loc[[2, 1, 3]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=["B", "C"], ascending=False)
assert_frame_equal(sorted_df, expected[::-1])
sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False])
assert_frame_equal(sorted_df, expected)
msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>"
with pytest.raises(ValueError, match=msg):
frame.sort_values(by=["A", "B"], axis=2, inplace=True)
# by row (axis=1): GH 10806
sorted_df = frame.sort_values(by=3, axis=1)
expected = frame
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
expected = frame.reindex(columns=["C", "B", "A"])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 2], axis="columns")
expected = frame.reindex(columns=["B", "A", "C"])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
expected = frame.reindex(columns=["C", "B", "A"])
assert_frame_equal(sorted_df, expected)
msg = r"Length of ascending \(5\) != length of by \(2\)"
with pytest.raises(ValueError, match=msg):
frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
def test_sort_values_inplace(self):
frame = DataFrame(
np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]
)
sorted_df = frame.copy()
sorted_df.sort_values(by="A", inplace=True)
expected = frame.sort_values(by="A")
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by=1, axis=1, inplace=True)
expected = frame.sort_values(by=1, axis=1)
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by="A", ascending=False, inplace=True)
expected = frame.sort_values(by="A", ascending=False)
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True)
expected = frame.sort_values(by=["A", "B"], ascending=False)
assert_frame_equal(sorted_df, expected)
def test_sort_nan(self):
# GH3917
nan = np.nan
df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]})
# sort one column only
expected = DataFrame(
{"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5],
)
sorted_df = df.sort_values(["A"], na_position="first")
assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3],
)
sorted_df = df.sort_values(["A"], na_position="first", ascending=False)
assert_frame_equal(sorted_df, expected)
expected = df.reindex(columns=["B", "A"])
sorted_df = df.sort_values(by=1, axis=1, na_position="first")
assert_frame_equal(sorted_df, expected)
# na_position='last', order
expected = DataFrame(
{"A": [1, 1, 2, 4, 6, 8, nan], "B": [2, 9, nan, 5, 5, 4, 5]},
index=[3, 0, 1, 6, 4, 5, 2],
)
sorted_df = df.sort_values(["A", "B"])
assert_frame_equal(sorted_df, expected)
# na_position='first', order
expected = DataFrame(
{"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, nan, 5, 5, 4]},
index=[2, 3, 0, 1, 6, 4, 5],
)
sorted_df = df.sort_values(["A", "B"], na_position="first")
assert_frame_equal(sorted_df, expected)
# na_position='first', not order
expected = DataFrame(
{"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5],
)
sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first")
assert_frame_equal(sorted_df, expected)
# na_position='last', not order
expected = DataFrame(
{"A": [8, 6, 4, 2, 1, 1, nan], "B": [4, 5, 5, nan, 2, 9, 5]},
index=[5, 4, 6, 1, 3, 0, 2],
)
sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last")
assert_frame_equal(sorted_df, expected)
# Test DataFrame with nan label
df = DataFrame(
{"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]},
index=[1, 2, 3, 4, 5, 6, nan],
)
# NaN label, ascending=True, na_position='last'
sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last")
expected = DataFrame(
{"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]},
index=[1, 2, 3, 4, 5, 6, nan],
)
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=True, na_position='first'
sorted_df = df.sort_index(na_position="first")
expected = DataFrame(
{"A": [4, 1, 2, nan, 1, 6, 8], "B": [5, 9, nan, 5, 2, 5, 4]},
index=[nan, 1, 2, 3, 4, 5, 6],
)
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=False, na_position='last'
sorted_df = df.sort_index(kind="quicksort", ascending=False)
expected = DataFrame(
{"A": [8, 6, 1, nan, 2, 1, 4], "B": [4, 5, 2, 5, nan, 9, 5]},
index=[6, 5, 4, 3, 2, 1, nan],
)
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=False, na_position='first'
sorted_df = df.sort_index(
kind="quicksort", ascending=False, na_position="first"
)
expected = DataFrame(
{"A": [4, 8, 6, 1, nan, 2, 1], "B": [5, 4, 5, 2, 5, nan, 9]},
index=[nan, 6, 5, 4, 3, 2, 1],
)
assert_frame_equal(sorted_df, expected)
def test_stable_descending_sort(self):
# GH #6399
df = DataFrame(
[[2, "first"], [2, "second"], [1, "a"], [1, "b"]],
columns=["sort_col", "order"],
)
sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False)
assert_frame_equal(df, sorted_df)
def test_stable_descending_multicolumn_sort(self):
nan = np.nan
df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]})
# test stable mergesort
expected = DataFrame(
{"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 2, 9]},
index=[2, 5, 4, 6, 1, 3, 0],
)
sorted_df = df.sort_values(
["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort"
)
assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3],
)
sorted_df = df.sort_values(
["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort"
)
assert_frame_equal(sorted_df, expected)
def test_sort_multi_index(self):
# GH 25775, testing that sorting by index works with a multi-index.
df = DataFrame(
{"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")}
)
result = df.set_index(list("abc")).sort_index(level=list("ba"))
expected = DataFrame(
{"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")}
)
expected = expected.set_index(list("abc"))
tm.assert_frame_equal(result, expected)
def test_stable_categorial(self):
# GH 16793
df = DataFrame({"x": pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)})
expected = df.copy()
sorted_df = df.sort_values("x", kind="mergesort")
assert_frame_equal(sorted_df, expected)
def test_sort_datetimes(self):
# GH 3461, argsort / lexsort differences for a datetime column
df = DataFrame(
["a", "a", "a", "b", "c", "d", "e", "f", "g"],
columns=["A"],
index=date_range("20130101", periods=9),
)
dts = [
Timestamp(x)
for x in [
"2004-02-11",
"2004-01-21",
"2004-01-26",
"2005-09-20",
"2010-10-04",
"2009-05-12",
"2008-11-12",
"2010-09-28",
"2010-09-28",
]
]
df["B"] = dts[::2] + dts[1::2]
df["C"] = 2.0
df["A1"] = 3.0
df1 = df.sort_values(by="A")
df2 = df.sort_values(by=["A"])
assert_frame_equal(df1, df2)
df1 = df.sort_values(by="B")
df2 = df.sort_values(by=["B"])
assert_frame_equal(df1, df2)
df1 = df.sort_values(by="B")
df2 = df.sort_values(by=["C", "B"])
assert_frame_equal(df1, df2)
def test_frame_column_inplace_sort_exception(self):
s = self.frame["A"]
with pytest.raises(ValueError, match="This Series is a view"):
s.sort_values(inplace=True)
cp = s.copy()
cp.sort_values() # it works!
def test_sort_nat_values_in_int_column(self):
# GH 14922: "sorting with large float and multiple columns incorrect"
# cause was that the int64 value NaT was considered as "na". Which is
# only correct for datetime64 columns.
int_values = (2, int(NaT))
float_values = (2.0, -1.797693e308)
df = DataFrame(
dict(int=int_values, float=float_values), columns=["int", "float"]
)
df_reversed = DataFrame(
dict(int=int_values[::-1], float=float_values[::-1]),
columns=["int", "float"],
index=[1, 0],
)
# NaT is not a "na" for int64 columns, so na_position must not
# influence the result:
df_sorted = df.sort_values(["int", "float"], na_position="last")
assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["int", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
# reverse sorting order
df_sorted = df.sort_values(["int", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
# and now check if NaT is still considered as "na" for datetime64
# columns:
df = DataFrame(
dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values),
columns=["datetime", "float"],
)
df_reversed = DataFrame(
dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]),
columns=["datetime", "float"],
index=[1, 0],
)
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
assert_frame_equal(df_sorted, df)
# Ascending should not affect the results.
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
def test_sort_nat(self):
# GH 16836
d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]]
d2 = [
Timestamp(x)
for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"]
]
df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3])
d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]]
d4 = [
Timestamp(x)
for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"]
]
expected = pd.DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2])
sorted_df = df.sort_values(by=["a", "b"])
tm.assert_frame_equal(sorted_df, expected)
class TestDataFrameSortIndexKinds(TestData):
def test_sort_index_multicolumn(self):
A = np.arange(5).repeat(20)
B = np.tile(np.arange(5), 20)
random.shuffle(A)
random.shuffle(B)
frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=["A", "B"])
result = frame.sort_values(by=["A", "B"])
indexer = np.lexsort((frame["B"], frame["A"]))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=["A", "B"], ascending=False)
result = frame.sort_values(by=["A", "B"], ascending=False)
indexer = np.lexsort(
(frame["B"].rank(ascending=False), frame["A"].rank(ascending=False))
)
expected = frame.take(indexer)
assert_frame_equal(result, expected)
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=["B", "A"])
result = frame.sort_values(by=["B", "A"])
indexer = np.lexsort((frame["A"], frame["B"]))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
def test_sort_index_inplace(self):
frame = DataFrame(
np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]
)
# axis=0
unordered = frame.loc[[3, 2, 4, 1]]
a_id = id(unordered["A"])
df = unordered.copy()
df.sort_index(inplace=True)
expected = frame
assert_frame_equal(df, expected)
assert a_id != id(df["A"])
df = unordered.copy()
df.sort_index(ascending=False, inplace=True)
expected = frame[::-1]
assert_frame_equal(df, expected)
# axis=1
unordered = frame.loc[:, ["D", "B", "C", "A"]]
df = unordered.copy()
df.sort_index(axis=1, inplace=True)
expected = frame
assert_frame_equal(df, expected)
df = unordered.copy()
df.sort_index(axis=1, ascending=False, inplace=True)
expected = frame.iloc[:, ::-1]
assert_frame_equal(df, expected)
def test_sort_index_different_sortorder(self):
A = np.arange(20).repeat(5)
B = np.tile(np.arange(5), 20)
indexer = np.random.permutation(100)
A = A.take(indexer)
B = B.take(indexer)
df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=["A", "B"], ascending=[1, 0])
result = df.sort_values(by=["A", "B"], ascending=[1, 0])
ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
expected = df.take(ex_indexer)
assert_frame_equal(result, expected)
# test with multiindex, too
idf = df.set_index(["A", "B"])
result = idf.sort_index(ascending=[1, 0])
expected = idf.take(ex_indexer)
assert_frame_equal(result, expected)
# also, Series!
result = idf["C"].sort_index(ascending=[1, 0])
assert_series_equal(result, expected["C"])
def test_sort_index_duplicates(self):
# with 9816, these are all translated to .sort_values
df = DataFrame([range(5, 9), range(4)], columns=["a", "a", "b", "b"])
with pytest.raises(ValueError, match="not unique"):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by="a")
with pytest.raises(ValueError, match="not unique"):
df.sort_values(by="a")
with pytest.raises(ValueError, match="not unique"):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=["a"])
with pytest.raises(ValueError, match="not unique"):
df.sort_values(by=["a"])
with pytest.raises(ValueError, match="not unique"):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
# multi-column 'by' is separate codepath
df.sort_index(by=["a", "b"])
with pytest.raises(ValueError, match="not unique"):
# multi-column 'by' is separate codepath
df.sort_values(by=["a", "b"])
# with multi-index
# GH4370
df = DataFrame(
np.random.randn(4, 2), columns=MultiIndex.from_tuples([("a", 0), ("a", 1)])
)
with pytest.raises(ValueError, match="level"):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by="a")
with pytest.raises(ValueError, match="level"):
df.sort_values(by="a")
# convert tuples to a list of tuples
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=[("a", 1)])
expected = df.sort_values(by=[("a", 1)])
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=("a", 1))
result = df.sort_values(by=("a", 1))
assert_frame_equal(result, expected)
def test_sort_index_level(self):
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
df = DataFrame([[1, 2], [3, 4]], mi)
result = df.sort_index(level="A", sort_remaining=False)
expected = df
assert_frame_equal(result, expected)
result = df.sort_index(level=["A", "B"], sort_remaining=False)
expected = df
assert_frame_equal(result, expected)
# Error thrown by sort_index when
# first index is sorted last (#26053)
result = df.sort_index(level=["C", "B", "A"])
expected = df.iloc[[1, 0]]
assert_frame_equal(result, expected)
result = df.sort_index(level=["B", "C", "A"])
expected = df.iloc[[1, 0]]
assert_frame_equal(result, expected)
result = df.sort_index(level=["C", "A"])
expected = df.iloc[[1, 0]]
assert_frame_equal(result, expected)
def test_sort_index_categorical_index(self):
df = DataFrame(
{
"A": np.arange(6, dtype="int64"),
"B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))),
}
).set_index("B")
result = df.sort_index()
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
assert_frame_equal(result, expected)
result = df.sort_index(ascending=False)
expected = df.iloc[[2, 3, 0, 1, 5, 4]]
assert_frame_equal(result, expected)
def test_sort_index(self):
# GH13496
frame = DataFrame(
np.arange(16).reshape(4, 4),
index=[1, 2, 3, 4],
columns=["A", "B", "C", "D"],
)
# axis=0 : sort rows by index labels
unordered = frame.loc[[3, 2, 4, 1]]
result = unordered.sort_index(axis=0)
expected = frame
assert_frame_equal(result, expected)
result = unordered.sort_index(ascending=False)
expected = frame[::-1]
assert_frame_equal(result, expected)
# axis=1 : sort columns by column names
unordered = frame.iloc[:, [2, 1, 3, 0]]
result = unordered.sort_index(axis=1)
assert_frame_equal(result, frame)
result = unordered.sort_index(axis=1, ascending=False)
expected = frame.iloc[:, ::-1]
assert_frame_equal(result, expected)
@pytest.mark.parametrize("level", ["A", 0]) # GH 21052
def test_sort_index_multiindex(self, level):
# GH13496
# sort rows by specified level of multi-index
mi = MultiIndex.from_tuples(
[[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC")
)
df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)
expected_mi = MultiIndex.from_tuples(
[[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC")
)
expected = pd.DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi)
result = df.sort_index(level=level)
assert_frame_equal(result, expected)
# sort_remaining=False
expected_mi = MultiIndex.from_tuples(
[[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC")
)
expected = pd.DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi)
result = df.sort_index(level=level, sort_remaining=False)
assert_frame_equal(result, expected)
def test_sort_index_intervalindex(self):
# this is a de-facto sort via unstack
# confirming that we sort in the order of the bins
y = Series(np.random.randn(100))
x1 = Series(np.sign(np.random.randn(100)))
x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3])
model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"])
result = model.groupby(["X1", "X2"], observed=True).mean().unstack()
expected = IntervalIndex.from_tuples(
[(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right"
)
result = result.columns.levels[1].categories
tm.assert_index_equal(result, expected)
def test_sort_index_na_position_with_categories(self):
# GH 22556
# Positioning missing value properly when column is Categorical.
categories = ["A", "B", "C"]
category_indices = [0, 2, 4]
list_of_nans = [np.nan, np.nan]
na_indices = [1, 3]
na_position_first = "first"
na_position_last = "last"
column_name = "c"
reversed_categories = sorted(categories, reverse=True)
reversed_category_indices = sorted(category_indices, reverse=True)
reversed_na_indices = sorted(na_indices)
df = pd.DataFrame(
{
column_name: pd.Categorical(
["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True
)
}
)
# sort ascending with na first
result = df.sort_values(
by=column_name, ascending=True, na_position=na_position_first
)
expected = DataFrame(
{
column_name: Categorical(
list_of_nans + categories, categories=categories, ordered=True
)
},
index=na_indices + category_indices,
)
assert_frame_equal(result, expected)
# sort ascending with na last
result = df.sort_values(
by=column_name, ascending=True, na_position=na_position_last
)
expected = DataFrame(
{
column_name: Categorical(
categories + list_of_nans, categories=categories, ordered=True
)
},
index=category_indices + na_indices,
)
assert_frame_equal(result, expected)
# sort descending with na first
result = df.sort_values(
by=column_name, ascending=False, na_position=na_position_first
)
expected = DataFrame(
{
column_name: Categorical(
list_of_nans + reversed_categories,
categories=categories,
ordered=True,
)
},
index=reversed_na_indices + reversed_category_indices,
)
assert_frame_equal(result, expected)
# sort descending with na last
result = df.sort_values(
by=column_name, ascending=False, na_position=na_position_last
)
expected = DataFrame(
{
column_name: Categorical(
reversed_categories + list_of_nans,
categories=categories,
ordered=True,
)
},
index=reversed_category_indices + reversed_na_indices,
)
assert_frame_equal(result, expected)
def test_sort_index_na_position_with_categories_raises(self):
df = pd.DataFrame(
{
"c": pd.Categorical(
["A", np.nan, "B", np.nan, "C"],
categories=["A", "B", "C"],
ordered=True,
)
}
)
with pytest.raises(ValueError):
df.sort_values(by="c", ascending=False, na_position="bad_position")

View File

@@ -0,0 +1,592 @@
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
class TestDataFrameSubclassing(TestData):
def test_frame_subclassing_and_slicing(self):
# Subclass frame and ensure it returns the right class on slicing it
# In reference to PR 9632
class CustomSeries(Series):
@property
def _constructor(self):
return CustomSeries
def custom_series_function(self):
return "OK"
class CustomDataFrame(DataFrame):
"""
Subclasses pandas DF, fills DF with simulation results, adds some
custom plotting functions.
"""
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
@property
def _constructor(self):
return CustomDataFrame
_constructor_sliced = CustomSeries
def custom_frame_function(self):
return "OK"
data = {"col1": range(10), "col2": range(10)}
cdf = CustomDataFrame(data)
# Did we get back our own DF class?
assert isinstance(cdf, CustomDataFrame)
# Do we get back our own Series class after selecting a column?
cdf_series = cdf.col1
assert isinstance(cdf_series, CustomSeries)
assert cdf_series.custom_series_function() == "OK"
# Do we get back our own DF class after slicing row-wise?
cdf_rows = cdf[1:5]
assert isinstance(cdf_rows, CustomDataFrame)
assert cdf_rows.custom_frame_function() == "OK"
# Make sure sliced part of multi-index frame is custom class
mcol = pd.MultiIndex.from_tuples([("A", "A"), ("A", "B")])
cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
assert isinstance(cdf_multi["A"], CustomDataFrame)
mcol = pd.MultiIndex.from_tuples([("A", ""), ("B", "")])
cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
assert isinstance(cdf_multi2["A"], CustomSeries)
def test_dataframe_metadata(self):
df = tm.SubclassedDataFrame(
{"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"]
)
df.testattr = "XXX"
assert df.testattr == "XXX"
assert df[["X"]].testattr == "XXX"
assert df.loc[["a", "b"], :].testattr == "XXX"
assert df.iloc[[0, 1], :].testattr == "XXX"
# see gh-9776
assert df.iloc[0:1, :].testattr == "XXX"
# see gh-10553
unpickled = tm.round_trip_pickle(df)
tm.assert_frame_equal(df, unpickled)
assert df._metadata == unpickled._metadata
assert df.testattr == unpickled.testattr
def test_indexing_sliced(self):
# GH 11559
df = tm.SubclassedDataFrame(
{"X": [1, 2, 3], "Y": [4, 5, 6], "Z": [7, 8, 9]}, index=["a", "b", "c"]
)
res = df.loc[:, "X"]
exp = tm.SubclassedSeries([1, 2, 3], index=list("abc"), name="X")
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.iloc[:, 1]
exp = tm.SubclassedSeries([4, 5, 6], index=list("abc"), name="Y")
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc[:, "Z"]
exp = tm.SubclassedSeries([7, 8, 9], index=list("abc"), name="Z")
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc["a", :]
exp = tm.SubclassedSeries([1, 4, 7], index=list("XYZ"), name="a")
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.iloc[1, :]
exp = tm.SubclassedSeries([2, 5, 8], index=list("XYZ"), name="b")
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
res = df.loc["c", :]
exp = tm.SubclassedSeries([3, 6, 9], index=list("XYZ"), name="c")
tm.assert_series_equal(res, exp)
assert isinstance(res, tm.SubclassedSeries)
def test_subclass_attr_err_propagation(self):
# GH 11808
class A(DataFrame):
@property
def bar(self):
return self.i_dont_exist
with pytest.raises(AttributeError, match=".*i_dont_exist.*"):
A().bar
def test_subclass_align(self):
# GH 12983
df1 = tm.SubclassedDataFrame(
{"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")
)
df2 = tm.SubclassedDataFrame(
{"c": [1, 2, 4], "d": [1, 2, 4]}, index=list("ABD")
)
res1, res2 = df1.align(df2, axis=0)
exp1 = tm.SubclassedDataFrame(
{"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
index=list("ABCDE"),
)
exp2 = tm.SubclassedDataFrame(
{"c": [1, 2, np.nan, 4, np.nan], "d": [1, 2, np.nan, 4, np.nan]},
index=list("ABCDE"),
)
assert isinstance(res1, tm.SubclassedDataFrame)
tm.assert_frame_equal(res1, exp1)
assert isinstance(res2, tm.SubclassedDataFrame)
tm.assert_frame_equal(res2, exp2)
res1, res2 = df1.a.align(df2.c)
assert isinstance(res1, tm.SubclassedSeries)
tm.assert_series_equal(res1, exp1.a)
assert isinstance(res2, tm.SubclassedSeries)
tm.assert_series_equal(res2, exp2.c)
def test_subclass_align_combinations(self):
# GH 12983
df = tm.SubclassedDataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
s = tm.SubclassedSeries([1, 2, 4], index=list("ABD"), name="x")
# frame + series
res1, res2 = df.align(s, axis=0)
exp1 = pd.DataFrame(
{"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
index=list("ABCDE"),
)
# name is lost when
exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")
assert isinstance(res1, tm.SubclassedDataFrame)
tm.assert_frame_equal(res1, exp1)
assert isinstance(res2, tm.SubclassedSeries)
tm.assert_series_equal(res2, exp2)
# series + frame
res1, res2 = s.align(df)
assert isinstance(res1, tm.SubclassedSeries)
tm.assert_series_equal(res1, exp2)
assert isinstance(res2, tm.SubclassedDataFrame)
tm.assert_frame_equal(res2, exp1)
def test_subclass_iterrows(self):
# GH 13977
df = tm.SubclassedDataFrame({"a": [1]})
for i, row in df.iterrows():
assert isinstance(row, tm.SubclassedSeries)
tm.assert_series_equal(row, df.loc[i])
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
def test_subclass_sparse_slice(self):
rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
ssdf = tm.SubclassedSparseDataFrame(rows)
ssdf.testattr = "testattr"
tm.assert_sp_frame_equal(ssdf.loc[:2], tm.SubclassedSparseDataFrame(rows[:3]))
tm.assert_sp_frame_equal(ssdf.iloc[:2], tm.SubclassedSparseDataFrame(rows[:2]))
tm.assert_sp_frame_equal(ssdf[:2], tm.SubclassedSparseDataFrame(rows[:2]))
assert ssdf.loc[:2].testattr == "testattr"
assert ssdf.iloc[:2].testattr == "testattr"
assert ssdf[:2].testattr == "testattr"
tm.assert_sp_series_equal(
ssdf.loc[1],
tm.SubclassedSparseSeries(rows[1]),
check_names=False,
check_kind=False,
)
tm.assert_sp_series_equal(
ssdf.iloc[1],
tm.SubclassedSparseSeries(rows[1]),
check_names=False,
check_kind=False,
)
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
def test_subclass_sparse_transpose(self):
ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], [4, 5, 6]])
essdf = tm.SubclassedSparseDataFrame([[1, 4], [2, 5], [3, 6]])
tm.assert_sp_frame_equal(ossdf.T, essdf)
def test_subclass_stack(self):
# GH 15564
df = tm.SubclassedDataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["a", "b", "c"],
columns=["X", "Y", "Z"],
)
res = df.stack()
exp = tm.SubclassedSeries(
[1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")]
)
tm.assert_series_equal(res, exp)
def test_subclass_stack_multi(self):
# GH 15564
df = tm.SubclassedDataFrame(
[[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]],
index=MultiIndex.from_tuples(
list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
),
columns=MultiIndex.from_tuples(
list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
),
)
exp = tm.SubclassedDataFrame(
[
[10, 12],
[11, 13],
[20, 22],
[21, 23],
[30, 32],
[31, 33],
[40, 42],
[41, 43],
],
index=MultiIndex.from_tuples(
list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))),
names=["aaa", "ccc", "yyy"],
),
columns=Index(["W", "X"], name="www"),
)
res = df.stack()
tm.assert_frame_equal(res, exp)
res = df.stack("yyy")
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame(
[
[10, 11],
[12, 13],
[20, 21],
[22, 23],
[30, 31],
[32, 33],
[40, 41],
[42, 43],
],
index=MultiIndex.from_tuples(
list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))),
names=["aaa", "ccc", "www"],
),
columns=Index(["y", "z"], name="yyy"),
)
res = df.stack("www")
tm.assert_frame_equal(res, exp)
def test_subclass_stack_multi_mixed(self):
# GH 15564
df = tm.SubclassedDataFrame(
[
[10, 11, 12.0, 13.0],
[20, 21, 22.0, 23.0],
[30, 31, 32.0, 33.0],
[40, 41, 42.0, 43.0],
],
index=MultiIndex.from_tuples(
list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
),
columns=MultiIndex.from_tuples(
list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
),
)
exp = tm.SubclassedDataFrame(
[
[10, 12.0],
[11, 13.0],
[20, 22.0],
[21, 23.0],
[30, 32.0],
[31, 33.0],
[40, 42.0],
[41, 43.0],
],
index=MultiIndex.from_tuples(
list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))),
names=["aaa", "ccc", "yyy"],
),
columns=Index(["W", "X"], name="www"),
)
res = df.stack()
tm.assert_frame_equal(res, exp)
res = df.stack("yyy")
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame(
[
[10.0, 11.0],
[12.0, 13.0],
[20.0, 21.0],
[22.0, 23.0],
[30.0, 31.0],
[32.0, 33.0],
[40.0, 41.0],
[42.0, 43.0],
],
index=MultiIndex.from_tuples(
list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))),
names=["aaa", "ccc", "www"],
),
columns=Index(["y", "z"], name="yyy"),
)
res = df.stack("www")
tm.assert_frame_equal(res, exp)
def test_subclass_unstack(self):
# GH 15564
df = tm.SubclassedDataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["a", "b", "c"],
columns=["X", "Y", "Z"],
)
res = df.unstack()
exp = tm.SubclassedSeries(
[1, 4, 7, 2, 5, 8, 3, 6, 9], index=[list("XXXYYYZZZ"), list("abcabcabc")]
)
tm.assert_series_equal(res, exp)
def test_subclass_unstack_multi(self):
# GH 15564
df = tm.SubclassedDataFrame(
[[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]],
index=MultiIndex.from_tuples(
list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
),
columns=MultiIndex.from_tuples(
list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
),
)
exp = tm.SubclassedDataFrame(
[[10, 20, 11, 21, 12, 22, 13, 23], [30, 40, 31, 41, 32, 42, 33, 43]],
index=Index(["A", "B"], name="aaa"),
columns=MultiIndex.from_tuples(
list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))),
names=["www", "yyy", "ccc"],
),
)
res = df.unstack()
tm.assert_frame_equal(res, exp)
res = df.unstack("ccc")
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame(
[[10, 30, 11, 31, 12, 32, 13, 33], [20, 40, 21, 41, 22, 42, 23, 43]],
index=Index(["c", "d"], name="ccc"),
columns=MultiIndex.from_tuples(
list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))),
names=["www", "yyy", "aaa"],
),
)
res = df.unstack("aaa")
tm.assert_frame_equal(res, exp)
def test_subclass_unstack_multi_mixed(self):
# GH 15564
df = tm.SubclassedDataFrame(
[
[10, 11, 12.0, 13.0],
[20, 21, 22.0, 23.0],
[30, 31, 32.0, 33.0],
[40, 41, 42.0, 43.0],
],
index=MultiIndex.from_tuples(
list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
),
columns=MultiIndex.from_tuples(
list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
),
)
exp = tm.SubclassedDataFrame(
[
[10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
[30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0],
],
index=Index(["A", "B"], name="aaa"),
columns=MultiIndex.from_tuples(
list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))),
names=["www", "yyy", "ccc"],
),
)
res = df.unstack()
tm.assert_frame_equal(res, exp)
res = df.unstack("ccc")
tm.assert_frame_equal(res, exp)
exp = tm.SubclassedDataFrame(
[
[10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
[20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0],
],
index=Index(["c", "d"], name="ccc"),
columns=MultiIndex.from_tuples(
list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))),
names=["www", "yyy", "aaa"],
),
)
res = df.unstack("aaa")
tm.assert_frame_equal(res, exp)
def test_subclass_pivot(self):
# GH 15564
df = tm.SubclassedDataFrame(
{
"index": ["A", "B", "C", "C", "B", "A"],
"columns": ["One", "One", "One", "Two", "Two", "Two"],
"values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
}
)
pivoted = df.pivot(index="index", columns="columns", values="values")
expected = tm.SubclassedDataFrame(
{
"One": {"A": 1.0, "B": 2.0, "C": 3.0},
"Two": {"A": 1.0, "B": 2.0, "C": 3.0},
}
)
expected.index.name, expected.columns.name = "index", "columns"
tm.assert_frame_equal(pivoted, expected)
def test_subclassed_melt(self):
# GH 15564
cheese = tm.SubclassedDataFrame(
{
"first": ["John", "Mary"],
"last": ["Doe", "Bo"],
"height": [5.5, 6.0],
"weight": [130, 150],
}
)
melted = pd.melt(cheese, id_vars=["first", "last"])
expected = tm.SubclassedDataFrame(
[
["John", "Doe", "height", 5.5],
["Mary", "Bo", "height", 6.0],
["John", "Doe", "weight", 130],
["Mary", "Bo", "weight", 150],
],
columns=["first", "last", "variable", "value"],
)
tm.assert_frame_equal(melted, expected)
def test_subclassed_wide_to_long(self):
# GH 9762
np.random.seed(123)
x = np.random.randn(3)
df = tm.SubclassedDataFrame(
{
"A1970": {0: "a", 1: "b", 2: "c"},
"A1980": {0: "d", 1: "e", 2: "f"},
"B1970": {0: 2.5, 1: 1.2, 2: 0.7},
"B1980": {0: 3.2, 1: 1.3, 2: 0.1},
"X": dict(zip(range(3), x)),
}
)
df["id"] = df.index
exp_data = {
"X": x.tolist() + x.tolist(),
"A": ["a", "b", "c", "d", "e", "f"],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2],
}
expected = tm.SubclassedDataFrame(exp_data)
expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(long_frame, expected)
def test_subclassed_apply(self):
# GH 19822
def check_row_subclass(row):
assert isinstance(row, tm.SubclassedSeries)
def strech(row):
if row["variable"] == "height":
row["value"] += 0.5
return row
df = tm.SubclassedDataFrame(
[
["John", "Doe", "height", 5.5],
["Mary", "Bo", "height", 6.0],
["John", "Doe", "weight", 130],
["Mary", "Bo", "weight", 150],
],
columns=["first", "last", "variable", "value"],
)
df.apply(lambda x: check_row_subclass(x))
df.apply(lambda x: check_row_subclass(x), axis=1)
expected = tm.SubclassedDataFrame(
[
["John", "Doe", "height", 6.0],
["Mary", "Bo", "height", 6.5],
["John", "Doe", "weight", 130],
["Mary", "Bo", "weight", 150],
],
columns=["first", "last", "variable", "value"],
)
result = df.apply(lambda x: strech(x), axis=1)
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
expected = tm.SubclassedDataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1)
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
assert isinstance(result, tm.SubclassedDataFrame)
tm.assert_frame_equal(result, expected)
expected = tm.SubclassedSeries([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
result = df.apply(lambda x: [1, 2, 3], axis=1)
assert not isinstance(result, tm.SubclassedDataFrame)
tm.assert_series_equal(result, expected)

View File

@@ -0,0 +1,969 @@
from datetime import datetime, time
from itertools import product
import numpy as np
import pytest
import pytz
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
period_range,
to_datetime,
)
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm
from pandas.util.testing import (
assert_frame_equal,
assert_index_equal,
assert_series_equal,
)
import pandas.tseries.offsets as offsets
@pytest.fixture(params=product([True, False], [True, False]))
def close_open_fixture(request):
return request.param
class TestDataFrameTimeSeriesMethods(TestData):
def test_diff(self):
the_diff = self.tsframe.diff(1)
assert_series_equal(
the_diff["A"], self.tsframe["A"] - self.tsframe["A"].shift(1)
)
# int dtype
a = 10000000000000000
b = a + 1
s = Series([a, b])
rs = DataFrame({"s": s}).diff()
assert rs.s[1] == 1
# mixed numeric
tf = self.tsframe.astype("float32")
the_diff = tf.diff(1)
assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1))
# issue 10907
df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])})
df.insert(0, "x", 1)
result = df.diff(axis=1)
expected = pd.DataFrame(
{"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}
).astype("float64")
assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz", [None, "UTC"])
def test_diff_datetime_axis0(self, tz):
# GH 18578
df = DataFrame(
{
0: date_range("2010", freq="D", periods=2, tz=tz),
1: date_range("2010", freq="D", periods=2, tz=tz),
}
)
result = df.diff(axis=0)
expected = DataFrame(
{
0: pd.TimedeltaIndex(["NaT", "1 days"]),
1: pd.TimedeltaIndex(["NaT", "1 days"]),
}
)
assert_frame_equal(result, expected)
@pytest.mark.parametrize("tz", [None, "UTC"])
def test_diff_datetime_axis1(self, tz):
# GH 18578
df = DataFrame(
{
0: date_range("2010", freq="D", periods=2, tz=tz),
1: date_range("2010", freq="D", periods=2, tz=tz),
}
)
if tz is None:
result = df.diff(axis=1)
expected = DataFrame(
{
0: pd.TimedeltaIndex(["NaT", "NaT"]),
1: pd.TimedeltaIndex(["0 days", "0 days"]),
}
)
assert_frame_equal(result, expected)
else:
with pytest.raises(NotImplementedError):
result = df.diff(axis=1)
def test_diff_timedelta(self):
# GH 4533
df = DataFrame(
dict(
time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
value=[1.0, 2.0],
)
)
res = df.diff()
exp = DataFrame(
[[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"]
)
assert_frame_equal(res, exp)
def test_diff_mixed_dtype(self):
df = DataFrame(np.random.randn(5, 3))
df["A"] = np.array([1, 2, 3, 4, 5], dtype=object)
result = df.diff()
assert result[0].dtype == np.float64
def test_diff_neg_n(self):
rs = self.tsframe.diff(-1)
xp = self.tsframe - self.tsframe.shift(-1)
assert_frame_equal(rs, xp)
def test_diff_float_n(self):
rs = self.tsframe.diff(1.0)
xp = self.tsframe.diff(1)
assert_frame_equal(rs, xp)
def test_diff_axis(self):
# GH 9727
df = DataFrame([[1.0, 2.0], [3.0, 4.0]])
assert_frame_equal(df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]]))
assert_frame_equal(df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]))
def test_pct_change(self):
rs = self.tsframe.pct_change(fill_method=None)
assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1)
rs = self.tsframe.pct_change(2)
filled = self.tsframe.fillna(method="pad")
assert_frame_equal(rs, filled / filled.shift(2) - 1)
rs = self.tsframe.pct_change(fill_method="bfill", limit=1)
filled = self.tsframe.fillna(method="bfill", limit=1)
assert_frame_equal(rs, filled / filled.shift(1) - 1)
rs = self.tsframe.pct_change(freq="5D")
filled = self.tsframe.fillna(method="pad")
assert_frame_equal(
rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled)
)
def test_pct_change_shift_over_nas(self):
s = Series([1.0, 1.5, np.nan, 2.5, 3.0])
df = DataFrame({"a": s, "b": s})
chg = df.pct_change()
expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2])
edf = DataFrame({"a": expected, "b": expected})
assert_frame_equal(chg, edf)
@pytest.mark.parametrize(
"freq, periods, fill_method, limit",
[
("5B", 5, None, None),
("3B", 3, None, None),
("3B", 3, "bfill", None),
("7B", 7, "pad", 1),
("7B", 7, "bfill", 3),
("14B", 14, None, None),
],
)
def test_pct_change_periods_freq(self, freq, periods, fill_method, limit):
# GH 7292
rs_freq = self.tsframe.pct_change(
freq=freq, fill_method=fill_method, limit=limit
)
rs_periods = self.tsframe.pct_change(
periods, fill_method=fill_method, limit=limit
)
assert_frame_equal(rs_freq, rs_periods)
empty_ts = DataFrame(index=self.tsframe.index, columns=self.tsframe.columns)
rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit)
rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit)
assert_frame_equal(rs_freq, rs_periods)
def test_frame_ctor_datetime64_column(self):
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
dates = np.asarray(rng)
df = DataFrame({"A": np.random.randn(len(rng)), "B": dates})
assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]"))
def test_frame_append_datetime64_column(self):
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
df = DataFrame(index=np.arange(len(rng)))
df["A"] = rng
assert np.issubdtype(df["A"].dtype, np.dtype("M8[ns]"))
def test_frame_datetime64_pre1900_repr(self):
df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")})
# it works!
repr(df)
def test_frame_append_datetime64_col_other_units(self):
n = 100
units = ["h", "m", "s", "ms", "D", "M", "Y"]
ns_dtype = np.dtype("M8[ns]")
for unit in units:
dtype = np.dtype("M8[{unit}]".format(unit=unit))
vals = np.arange(n, dtype=np.int64).view(dtype)
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
df[unit] = vals
ex_vals = to_datetime(vals.astype("O")).values
assert df[unit].dtype == ns_dtype
assert (df[unit].values == ex_vals).all()
# Test insertion into existing datetime64 column
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype)
for unit in units:
dtype = np.dtype("M8[{unit}]".format(unit=unit))
vals = np.arange(n, dtype=np.int64).view(dtype)
tmp = df.copy()
tmp["dates"] = vals
ex_vals = to_datetime(vals.astype("O")).values
assert (tmp["dates"].values == ex_vals).all()
def test_shift(self):
# naive shift
shiftedFrame = self.tsframe.shift(5)
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
shiftedSeries = self.tsframe["A"].shift(5)
assert_series_equal(shiftedFrame["A"], shiftedSeries)
shiftedFrame = self.tsframe.shift(-5)
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
shiftedSeries = self.tsframe["A"].shift(-5)
assert_series_equal(shiftedFrame["A"], shiftedSeries)
# shift by 0
unshifted = self.tsframe.shift(0)
assert_frame_equal(unshifted, self.tsframe)
# shift by DateOffset
shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
assert len(shiftedFrame) == len(self.tsframe)
shiftedFrame2 = self.tsframe.shift(5, freq="B")
assert_frame_equal(shiftedFrame, shiftedFrame2)
d = self.tsframe.index[0]
shifted_d = d + offsets.BDay(5)
assert_series_equal(
self.tsframe.xs(d), shiftedFrame.xs(shifted_d), check_names=False
)
# shift int frame
int_shifted = self.intframe.shift(1) # noqa
# Shifting with PeriodIndex
ps = tm.makePeriodFrame()
shifted = ps.shift(1)
unshifted = shifted.shift(-1)
tm.assert_index_equal(shifted.index, ps.index)
tm.assert_index_equal(unshifted.index, ps.index)
tm.assert_numpy_array_equal(
unshifted.iloc[:, 0].dropna().values, ps.iloc[:-1, 0].values
)
shifted2 = ps.shift(1, "B")
shifted3 = ps.shift(1, offsets.BDay())
assert_frame_equal(shifted2, shifted3)
assert_frame_equal(ps, shifted2.shift(-1, "B"))
msg = "does not match PeriodIndex freq"
with pytest.raises(ValueError, match=msg):
ps.shift(freq="D")
# shift other axis
# GH 6371
df = DataFrame(np.random.rand(10, 5))
expected = pd.concat(
[DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]],
ignore_index=True,
axis=1,
)
result = df.shift(1, axis=1)
assert_frame_equal(result, expected)
# shift named axis
df = DataFrame(np.random.rand(10, 5))
expected = pd.concat(
[DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]],
ignore_index=True,
axis=1,
)
result = df.shift(1, axis="columns")
assert_frame_equal(result, expected)
def test_shift_bool(self):
df = DataFrame({"high": [True, False], "low": [False, False]})
rs = df.shift(1)
xp = DataFrame(
np.array([[np.nan, np.nan], [True, False]], dtype=object),
columns=["high", "low"],
)
assert_frame_equal(rs, xp)
def test_shift_categorical(self):
# GH 9416
s1 = pd.Series(["a", "b", "c"], dtype="category")
s2 = pd.Series(["A", "B", "C"], dtype="category")
df = DataFrame({"one": s1, "two": s2})
rs = df.shift(1)
xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)})
assert_frame_equal(rs, xp)
def test_shift_fill_value(self):
# GH #24128
df = DataFrame(
[1, 2, 3, 4, 5], index=date_range("1/1/2000", periods=5, freq="H")
)
exp = DataFrame(
[0, 1, 2, 3, 4], index=date_range("1/1/2000", periods=5, freq="H")
)
result = df.shift(1, fill_value=0)
assert_frame_equal(result, exp)
exp = DataFrame(
[0, 0, 1, 2, 3], index=date_range("1/1/2000", periods=5, freq="H")
)
result = df.shift(2, fill_value=0)
assert_frame_equal(result, exp)
def test_shift_empty(self):
# Regression test for #8019
df = DataFrame({"foo": []})
rs = df.shift(-1)
assert_frame_equal(df, rs)
def test_shift_duplicate_columns(self):
# GH 9092; verify that position-based shifting works
# in the presence of duplicate columns
column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
data = np.random.randn(20, 5)
shifted = []
for columns in column_lists:
df = pd.DataFrame(data.copy(), columns=columns)
for s in range(5):
df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
df.columns = range(5)
shifted.append(df)
# sanity check the base case
nulls = shifted[0].isna().sum()
assert_series_equal(nulls, Series(range(1, 6), dtype="int64"))
# check all answers are the same
assert_frame_equal(shifted[0], shifted[1])
assert_frame_equal(shifted[0], shifted[2])
def test_tshift(self):
# PeriodIndex
ps = tm.makePeriodFrame()
shifted = ps.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(unshifted, ps)
shifted2 = ps.tshift(freq="B")
assert_frame_equal(shifted, shifted2)
shifted3 = ps.tshift(freq=offsets.BDay())
assert_frame_equal(shifted, shifted3)
with pytest.raises(ValueError, match="does not match"):
ps.tshift(freq="M")
# DatetimeIndex
shifted = self.tsframe.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(self.tsframe, unshifted)
shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq)
assert_frame_equal(shifted, shifted2)
inferred_ts = DataFrame(
self.tsframe.values,
Index(np.asarray(self.tsframe.index)),
columns=self.tsframe.columns,
)
shifted = inferred_ts.tshift(1)
unshifted = shifted.tshift(-1)
assert_frame_equal(shifted, self.tsframe.tshift(1))
assert_frame_equal(unshifted, inferred_ts)
no_freq = self.tsframe.iloc[[0, 5, 7], :]
msg = "Freq was not given and was not set in the index"
with pytest.raises(ValueError, match=msg):
no_freq.tshift()
def test_truncate(self):
ts = self.tsframe[::3]
start, end = self.tsframe.index[3], self.tsframe.index[6]
start_missing = self.tsframe.index[2]
end_missing = self.tsframe.index[7]
# neither specified
truncated = ts.truncate()
assert_frame_equal(truncated, ts)
# both specified
expected = ts[1:3]
truncated = ts.truncate(start, end)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(start_missing, end_missing)
assert_frame_equal(truncated, expected)
# start specified
expected = ts[1:]
truncated = ts.truncate(before=start)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(before=start_missing)
assert_frame_equal(truncated, expected)
# end specified
expected = ts[:3]
truncated = ts.truncate(after=end)
assert_frame_equal(truncated, expected)
truncated = ts.truncate(after=end_missing)
assert_frame_equal(truncated, expected)
msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00"
with pytest.raises(ValueError, match=msg):
ts.truncate(
before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq
)
def test_truncate_copy(self):
index = self.tsframe.index
truncated = self.tsframe.truncate(index[5], index[10])
truncated.values[:] = 5.0
assert not (self.tsframe.values[5:11] == 5).any()
def test_truncate_nonsortedindex(self):
# GH 17935
df = pd.DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0])
msg = "truncate requires a sorted index"
with pytest.raises(ValueError, match=msg):
df.truncate(before=3, after=9)
rng = pd.date_range("2011-01-01", "2012-01-01", freq="W")
ts = pd.DataFrame(
{"A": np.random.randn(len(rng)), "B": np.random.randn(len(rng))}, index=rng
)
msg = "truncate requires a sorted index"
with pytest.raises(ValueError, match=msg):
ts.sort_values("A", ascending=False).truncate(
before="2011-11", after="2011-12"
)
df = pd.DataFrame(
{
3: np.random.randn(5),
20: np.random.randn(5),
2: np.random.randn(5),
0: np.random.randn(5),
},
columns=[3, 20, 2, 0],
)
msg = "truncate requires a sorted index"
with pytest.raises(ValueError, match=msg):
df.truncate(before=2, after=20, axis=1)
def test_asfreq(self):
offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd())
rule_monthly = self.tsframe.asfreq("BM")
tm.assert_almost_equal(offset_monthly["A"], rule_monthly["A"])
filled = rule_monthly.asfreq("B", method="pad") # noqa
# TODO: actually check that this worked.
# don't forget!
filled_dep = rule_monthly.asfreq("B", method="pad") # noqa
# test does not blow up on length-0 DataFrame
zero_length = self.tsframe.reindex([])
result = zero_length.asfreq("BM")
assert result is not zero_length
def test_asfreq_datetimeindex(self):
df = DataFrame(
{"A": [1, 2, 3]},
index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)],
)
df = df.asfreq("B")
assert isinstance(df.index, DatetimeIndex)
ts = df["A"].asfreq("B")
assert isinstance(ts.index, DatetimeIndex)
def test_asfreq_fillvalue(self):
# test for fill value during upsampling, related to issue 3715
# setup
rng = pd.date_range("1/1/2016", periods=10, freq="2S")
ts = pd.Series(np.arange(len(rng)), index=rng)
df = pd.DataFrame({"one": ts})
# insert pre-existing missing value
df.loc["2016-01-01 00:00:08", "one"] = None
actual_df = df.asfreq(freq="1S", fill_value=9.0)
expected_df = df.asfreq(freq="1S").fillna(9.0)
expected_df.loc["2016-01-01 00:00:08", "one"] = None
assert_frame_equal(expected_df, actual_df)
expected_series = ts.asfreq(freq="1S").fillna(9.0)
actual_series = ts.asfreq(freq="1S", fill_value=9.0)
assert_series_equal(expected_series, actual_series)
@pytest.mark.parametrize(
"data,idx,expected_first,expected_last",
[
({"A": [1, 2, 3]}, [1, 1, 2], 1, 2),
({"A": [1, 2, 3]}, [1, 2, 2], 1, 2),
({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"),
({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2),
({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2),
],
)
def test_first_last_valid(self, data, idx, expected_first, expected_last):
N = len(self.frame.index)
mat = np.random.randn(N)
mat[:5] = np.nan
mat[-5:] = np.nan
frame = DataFrame({"foo": mat}, index=self.frame.index)
index = frame.first_valid_index()
assert index == frame.index[5]
index = frame.last_valid_index()
assert index == frame.index[-6]
# GH12800
empty = DataFrame()
assert empty.last_valid_index() is None
assert empty.first_valid_index() is None
# GH17400: no valid entries
frame[:] = np.nan
assert frame.last_valid_index() is None
assert frame.first_valid_index() is None
# GH20499: its preserves freq with holes
frame.index = date_range("20110101", periods=N, freq="B")
frame.iloc[1] = 1
frame.iloc[-2] = 1
assert frame.first_valid_index() == frame.index[1]
assert frame.last_valid_index() == frame.index[-2]
assert frame.first_valid_index().freq == frame.index.freq
assert frame.last_valid_index().freq == frame.index.freq
# GH 21441
df = DataFrame(data, index=idx)
assert expected_first == df.first_valid_index()
assert expected_last == df.last_valid_index()
def test_first_subset(self):
ts = tm.makeTimeDataFrame(freq="12h")
result = ts.first("10d")
assert len(result) == 20
ts = tm.makeTimeDataFrame(freq="D")
result = ts.first("10d")
assert len(result) == 10
result = ts.first("3M")
expected = ts[:"3/31/2000"]
assert_frame_equal(result, expected)
result = ts.first("21D")
expected = ts[:21]
assert_frame_equal(result, expected)
result = ts[:0].first("3M")
assert_frame_equal(result, ts[:0])
def test_first_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.first("1D")
def test_last_subset(self):
ts = tm.makeTimeDataFrame(freq="12h")
result = ts.last("10d")
assert len(result) == 20
ts = tm.makeTimeDataFrame(nper=30, freq="D")
result = ts.last("10d")
assert len(result) == 10
result = ts.last("21D")
expected = ts["2000-01-10":]
assert_frame_equal(result, expected)
result = ts.last("21D")
expected = ts[-21:]
assert_frame_equal(result, expected)
result = ts[:0].last("3M")
assert_frame_equal(result, ts[:0])
def test_last_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.last("1D")
def test_at_time(self):
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
rs = ts.at_time(rng[1])
assert (rs.index.hour == rng[1].hour).all()
assert (rs.index.minute == rng[1].minute).all()
assert (rs.index.second == rng[1].second).all()
result = ts.at_time("9:30")
expected = ts.at_time(time(9, 30))
assert_frame_equal(result, expected)
result = ts.loc[time(9, 30)]
expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
assert_frame_equal(result, expected)
# midnight, everything
rng = date_range("1/1/2000", "1/31/2000")
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
result = ts.at_time(time(0, 0))
assert_frame_equal(result, ts)
# time doesn't exist
rng = date_range("1/1/2012", freq="23Min", periods=384)
ts = DataFrame(np.random.randn(len(rng), 2), rng)
rs = ts.at_time("16:00")
assert len(rs) == 0
@pytest.mark.parametrize(
"hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)]
)
def test_at_time_errors(self, hour):
# GH 24043
dti = pd.date_range("2018", periods=3, freq="H")
df = pd.DataFrame(list(range(len(dti))), index=dti)
if getattr(hour, "tzinfo", None) is None:
result = df.at_time(hour)
expected = df.iloc[1:2]
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(ValueError, match="Index must be timezone"):
df.at_time(hour)
def test_at_time_tz(self):
# GH 24043
dti = pd.date_range("2018", periods=3, freq="H", tz="US/Pacific")
df = pd.DataFrame(list(range(len(dti))), index=dti)
result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern")))
expected = df.iloc[1:2]
tm.assert_frame_equal(result, expected)
def test_at_time_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.at_time("00:00")
@pytest.mark.parametrize("axis", ["index", "columns", 0, 1])
def test_at_time_axis(self, axis):
# issue 8839
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(np.random.randn(len(rng), len(rng)))
ts.index, ts.columns = rng, rng
indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)]
if axis in ["index", 0]:
expected = ts.loc[indices, :]
elif axis in ["columns", 1]:
expected = ts.loc[:, indices]
result = ts.at_time("9:30", axis=axis)
assert_frame_equal(result, expected)
def test_between_time(self, close_open_fixture):
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
stime = time(0, 0)
etime = time(1, 0)
inc_start, inc_end = close_open_fixture
filtered = ts.between_time(stime, etime, inc_start, inc_end)
exp_len = 13 * 4 + 1
if not inc_start:
exp_len -= 5
if not inc_end:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inc_start:
assert t >= stime
else:
assert t > stime
if inc_end:
assert t <= etime
else:
assert t < etime
result = ts.between_time("00:00", "01:00")
expected = ts.between_time(stime, etime)
assert_frame_equal(result, expected)
# across midnight
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
stime = time(22, 0)
etime = time(9, 0)
filtered = ts.between_time(stime, etime, inc_start, inc_end)
exp_len = (12 * 11 + 1) * 4 + 1
if not inc_start:
exp_len -= 4
if not inc_end:
exp_len -= 4
assert len(filtered) == exp_len
for rs in filtered.index:
t = rs.time()
if inc_start:
assert (t >= stime) or (t <= etime)
else:
assert (t > stime) or (t <= etime)
if inc_end:
assert (t <= etime) or (t >= stime)
else:
assert (t < etime) or (t >= stime)
def test_between_time_raises(self):
# GH20725
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
with pytest.raises(TypeError): # index is not a DatetimeIndex
df.between_time(start_time="00:00", end_time="12:00")
def test_between_time_axis(self, axis):
# issue 8839
rng = date_range("1/1/2000", periods=100, freq="10min")
ts = DataFrame(np.random.randn(len(rng), len(rng)))
stime, etime = ("08:00:00", "09:00:00")
exp_len = 7
if axis in ["index", 0]:
ts.index = rng
assert len(ts.between_time(stime, etime)) == exp_len
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
if axis in ["columns", 1]:
ts.columns = rng
selected = ts.between_time(stime, etime, axis=1).columns
assert len(selected) == exp_len
def test_between_time_axis_raises(self, axis):
# issue 8839
rng = date_range("1/1/2000", periods=100, freq="10min")
mask = np.arange(0, len(rng))
rand_data = np.random.randn(len(rng), len(rng))
ts = DataFrame(rand_data, index=rng, columns=rng)
stime, etime = ("08:00:00", "09:00:00")
msg = "Index must be DatetimeIndex"
if axis in ["columns", 1]:
ts.index = mask
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime)
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime, axis=0)
if axis in ["index", 0]:
ts.columns = mask
with pytest.raises(TypeError, match=msg):
ts.between_time(stime, etime, axis=1)
def test_operation_on_NaT(self):
# Both NaT and Timestamp are in DataFrame.
df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]})
res = df.min()
exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"])
tm.assert_series_equal(res, exp)
res = df.max()
exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"])
tm.assert_series_equal(res, exp)
# GH12941, only NaTs are in DataFrame.
df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]})
res = df.min()
exp = pd.Series([pd.NaT], index=["foo"])
tm.assert_series_equal(res, exp)
res = df.max()
exp = pd.Series([pd.NaT], index=["foo"])
tm.assert_series_equal(res, exp)
def test_datetime_assignment_with_NaT_and_diff_time_units(self):
# GH 7492
data_ns = np.array([1, "nat"], dtype="datetime64[ns]")
result = pd.Series(data_ns).to_frame()
result["new"] = data_ns
expected = pd.DataFrame(
{0: [1, None], "new": [1, None]}, dtype="datetime64[ns]"
)
tm.assert_frame_equal(result, expected)
# OutOfBoundsDatetime error shouldn't occur
data_s = np.array([1, "nat"], dtype="datetime64[s]")
result["new"] = data_s
expected = pd.DataFrame(
{0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]"
)
tm.assert_frame_equal(result, expected)
def test_frame_to_period(self):
K = 5
dr = date_range("1/1/2000", "1/1/2001")
pr = period_range("1/1/2000", "1/1/2001")
df = DataFrame(np.random.randn(len(dr), K), index=dr)
df["mix"] = "a"
pts = df.to_period()
exp = df.copy()
exp.index = pr
assert_frame_equal(pts, exp)
pts = df.to_period("M")
tm.assert_index_equal(pts.index, exp.index.asfreq("M"))
df = df.T
pts = df.to_period(axis=1)
exp = df.copy()
exp.columns = pr
assert_frame_equal(pts, exp)
pts = df.to_period("M", axis=1)
tm.assert_index_equal(pts.columns, exp.columns.asfreq("M"))
msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>"
with pytest.raises(ValueError, match=msg):
df.to_period(axis=2)
@pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"])
def test_tz_convert_and_localize(self, fn):
l0 = date_range("20140701", periods=5, freq="D")
l1 = date_range("20140701", periods=5, freq="D")
int_idx = Index(range(5))
if fn == "tz_convert":
l0 = l0.tz_localize("UTC")
l1 = l1.tz_localize("UTC")
for idx in [l0, l1]:
l0_expected = getattr(idx, fn)("US/Pacific")
l1_expected = getattr(idx, fn)("US/Pacific")
df1 = DataFrame(np.ones(5), index=l0)
df1 = getattr(df1, fn)("US/Pacific")
assert_index_equal(df1.index, l0_expected)
# MultiIndex
# GH7846
df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))
df3 = getattr(df2, fn)("US/Pacific", level=0)
assert not df3.index.levels[0].equals(l0)
assert_index_equal(df3.index.levels[0], l0_expected)
assert_index_equal(df3.index.levels[1], l1)
assert not df3.index.levels[1].equals(l1_expected)
df3 = getattr(df2, fn)("US/Pacific", level=1)
assert_index_equal(df3.index.levels[0], l0)
assert not df3.index.levels[0].equals(l0_expected)
assert_index_equal(df3.index.levels[1], l1_expected)
assert not df3.index.levels[1].equals(l1)
df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
# TODO: untested
df5 = getattr(df4, fn)("US/Pacific", level=1) # noqa
assert_index_equal(df3.index.levels[0], l0)
assert not df3.index.levels[0].equals(l0_expected)
assert_index_equal(df3.index.levels[1], l1_expected)
assert not df3.index.levels[1].equals(l1)
# Bad Inputs
# Not DatetimeIndex / PeriodIndex
with pytest.raises(TypeError, match="DatetimeIndex"):
df = DataFrame(index=int_idx)
df = getattr(df, fn)("US/Pacific")
# Not DatetimeIndex / PeriodIndex
with pytest.raises(TypeError, match="DatetimeIndex"):
df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
df = getattr(df, fn)("US/Pacific", level=0)
# Invalid level
with pytest.raises(ValueError, match="not valid"):
df = DataFrame(index=l0)
df = getattr(df, fn)("US/Pacific", level=1)

View File

@@ -0,0 +1,215 @@
"""
Tests for DataFrame timezone-related methods
"""
from datetime import datetime
import numpy as np
import pytest
import pytz
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas import DataFrame, Series
from pandas.core.indexes.datetimes import date_range
import pandas.util.testing as tm
class TestDataFrameTimezones:
def test_frame_values_with_tz(self):
tz = "US/Central"
df = DataFrame({"A": date_range("2000", periods=4, tz=tz)})
result = df.values
expected = np.array(
[
[pd.Timestamp("2000-01-01", tz=tz)],
[pd.Timestamp("2000-01-02", tz=tz)],
[pd.Timestamp("2000-01-03", tz=tz)],
[pd.Timestamp("2000-01-04", tz=tz)],
]
)
tm.assert_numpy_array_equal(result, expected)
# two columns, homogenous
df = df.assign(B=df.A)
result = df.values
expected = np.concatenate([expected, expected], axis=1)
tm.assert_numpy_array_equal(result, expected)
# three columns, heterogenous
est = "US/Eastern"
df = df.assign(C=df.A.dt.tz_convert(est))
new = np.array(
[
[pd.Timestamp("2000-01-01T01:00:00", tz=est)],
[pd.Timestamp("2000-01-02T01:00:00", tz=est)],
[pd.Timestamp("2000-01-03T01:00:00", tz=est)],
[pd.Timestamp("2000-01-04T01:00:00", tz=est)],
]
)
expected = np.concatenate([expected, new], axis=1)
result = df.values
tm.assert_numpy_array_equal(result, expected)
def test_frame_from_records_utc(self):
rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)}
# it works
DataFrame.from_records([rec], index="begin_time")
def test_frame_tz_localize(self):
rng = date_range("1/1/2011", periods=100, freq="H")
df = DataFrame({"a": 1}, index=rng)
result = df.tz_localize("utc")
expected = DataFrame({"a": 1}, rng.tz_localize("UTC"))
assert result.index.tz.zone == "UTC"
tm.assert_frame_equal(result, expected)
df = df.T
result = df.tz_localize("utc", axis=1)
assert result.columns.tz.zone == "UTC"
tm.assert_frame_equal(result, expected.T)
def test_frame_tz_convert(self):
rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")
df = DataFrame({"a": 1}, index=rng)
result = df.tz_convert("Europe/Berlin")
expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
assert result.index.tz.zone == "Europe/Berlin"
tm.assert_frame_equal(result, expected)
df = df.T
result = df.tz_convert("Europe/Berlin", axis=1)
assert result.columns.tz.zone == "Europe/Berlin"
tm.assert_frame_equal(result, expected.T)
def test_frame_join_tzaware(self):
test1 = DataFrame(
np.zeros((6, 3)),
index=date_range(
"2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central"
),
)
test2 = DataFrame(
np.zeros((3, 3)),
index=date_range(
"2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central"
),
columns=range(3, 6),
)
result = test1.join(test2, how="outer")
ex_index = test1.index.union(test2.index)
tm.assert_index_equal(result.index, ex_index)
assert result.index.tz.zone == "US/Central"
def test_frame_add_tz_mismatch_converts_to_utc(self):
rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern")
df = DataFrame(np.random.randn(len(rng)), index=rng, columns=["a"])
df_moscow = df.tz_convert("Europe/Moscow")
result = df + df_moscow
assert result.index.tz is pytz.utc
result = df_moscow + df
assert result.index.tz is pytz.utc
def test_frame_align_aware(self):
idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern")
idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern")
df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
new1, new2 = df1.align(df2)
assert df1.index.tz == new1.index.tz
assert df2.index.tz == new2.index.tz
# different timezones convert to UTC
# frame with frame
df1_central = df1.tz_convert("US/Central")
new1, new2 = df1.align(df1_central)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
# frame with Series
new1, new2 = df1.align(df1_central[0], axis=0)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
df1[0].align(df1_central, axis=0)
assert new1.index.tz == pytz.UTC
assert new2.index.tz == pytz.UTC
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_frame_no_datetime64_dtype(self, tz):
# after GH#7822
# these retain the timezones on dict construction
dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
dr_tz = dr.tz_localize(tz)
df = DataFrame({"A": "foo", "B": dr_tz}, index=dr)
tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo)
assert df["B"].dtype == tz_expected
# GH#2810 (with timezones)
datetimes_naive = [ts.to_pydatetime() for ts in dr]
datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
df = DataFrame({"dr": dr})
df["dr_tz"] = dr_tz
df["datetimes_naive"] = datetimes_naive
df["datetimes_with_tz"] = datetimes_with_tz
result = df.dtypes
expected = Series(
[
np.dtype("datetime64[ns]"),
DatetimeTZDtype(tz=tz),
np.dtype("datetime64[ns]"),
DatetimeTZDtype(tz=tz),
],
index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"],
)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_frame_reset_index(self, tz):
dr = date_range("2012-06-02", periods=10, tz=tz)
df = DataFrame(np.random.randn(len(dr)), dr)
roundtripped = df.reset_index().set_index("index")
xp = df.index.tz
rs = roundtripped.index.tz
assert xp == rs
@pytest.mark.parametrize("tz", [None, "America/New_York"])
def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
# GH 19970
idx = date_range("20161101", "20161130", freq="4H", tz=tz)
df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx)
result = df.T == df.T
expected = DataFrame(True, index=list("ab"), columns=idx)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("copy", [True, False])
@pytest.mark.parametrize(
"method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]]
)
def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz):
# GH 6326
result = DataFrame(
np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz)
)
getattr(result, method)("UTC", copy=copy)
expected = DataFrame(
np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz)
)
tm.assert_frame_equal(result, expected)
def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture):
# GH 25843
tz = tz_aware_fixture
result = DataFrame({"d": [pd.Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]")
expected = DataFrame({"d": [pd.Timestamp("2019")]})
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,41 @@
import pytest
from pandas.core.frame import DataFrame
@pytest.fixture
def dataframe():
return DataFrame({"a": [1, 2], "b": [3, 4]})
class TestDataFrameValidate:
"""Tests for error handling related to data types of method arguments."""
@pytest.mark.parametrize(
"func",
[
"query",
"eval",
"set_index",
"reset_index",
"dropna",
"drop_duplicates",
"sort_values",
],
)
@pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
def test_validate_bool_args(self, dataframe, func, inplace):
msg = 'For argument "inplace" expected type bool'
kwargs = dict(inplace=inplace)
if func == "query":
kwargs["expr"] = "a > b"
elif func == "eval":
kwargs["expr"] = "a + b"
elif func == "set_index":
kwargs["keys"] = ["a"]
elif func == "sort_values":
kwargs["by"] = ["a"]
with pytest.raises(ValueError, match=msg):
getattr(dataframe, func)(**kwargs)