8th day of python challenges 111-117
This commit is contained in:
155
venv/lib/python3.6/site-packages/pandas/tests/frame/common.py
Normal file
155
venv/lib/python3.6/site-packages/pandas/tests/frame/common.py
Normal file
@@ -0,0 +1,155 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas.util._decorators import cache_readonly
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
|
||||
_seriesd = tm.getSeriesData()
|
||||
_tsd = tm.getTimeSeriesData()
|
||||
|
||||
_frame = pd.DataFrame(_seriesd)
|
||||
_frame2 = pd.DataFrame(_seriesd, columns=["D", "C", "B", "A"])
|
||||
_intframe = pd.DataFrame({k: v.astype(int) for k, v in _seriesd.items()})
|
||||
|
||||
_tsframe = pd.DataFrame(_tsd)
|
||||
|
||||
_mixed_frame = _frame.copy()
|
||||
_mixed_frame["foo"] = "bar"
|
||||
|
||||
|
||||
class TestData:
|
||||
@cache_readonly
|
||||
def frame(self):
|
||||
return _frame.copy()
|
||||
|
||||
@cache_readonly
|
||||
def frame2(self):
|
||||
return _frame2.copy()
|
||||
|
||||
@cache_readonly
|
||||
def intframe(self):
|
||||
# force these all to int64 to avoid platform testing issues
|
||||
return pd.DataFrame({c: s for c, s in _intframe.items()}, dtype=np.int64)
|
||||
|
||||
@cache_readonly
|
||||
def tsframe(self):
|
||||
return _tsframe.copy()
|
||||
|
||||
@cache_readonly
|
||||
def mixed_frame(self):
|
||||
return _mixed_frame.copy()
|
||||
|
||||
@cache_readonly
|
||||
def mixed_float(self):
|
||||
return pd.DataFrame(
|
||||
{
|
||||
"A": _frame["A"].copy().astype("float32"),
|
||||
"B": _frame["B"].copy().astype("float32"),
|
||||
"C": _frame["C"].copy().astype("float16"),
|
||||
"D": _frame["D"].copy().astype("float64"),
|
||||
}
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def mixed_float2(self):
|
||||
return pd.DataFrame(
|
||||
{
|
||||
"A": _frame2["A"].copy().astype("float32"),
|
||||
"B": _frame2["B"].copy().astype("float32"),
|
||||
"C": _frame2["C"].copy().astype("float16"),
|
||||
"D": _frame2["D"].copy().astype("float64"),
|
||||
}
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def mixed_int(self):
|
||||
return pd.DataFrame(
|
||||
{
|
||||
"A": _intframe["A"].copy().astype("int32"),
|
||||
"B": np.ones(len(_intframe["B"]), dtype="uint64"),
|
||||
"C": _intframe["C"].copy().astype("uint8"),
|
||||
"D": _intframe["D"].copy().astype("int64"),
|
||||
}
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def all_mixed(self):
|
||||
return pd.DataFrame(
|
||||
{
|
||||
"a": 1.0,
|
||||
"b": 2,
|
||||
"c": "foo",
|
||||
"float32": np.array([1.0] * 10, dtype="float32"),
|
||||
"int32": np.array([1] * 10, dtype="int32"),
|
||||
},
|
||||
index=np.arange(10),
|
||||
)
|
||||
|
||||
@cache_readonly
|
||||
def tzframe(self):
|
||||
result = pd.DataFrame(
|
||||
{
|
||||
"A": pd.date_range("20130101", periods=3),
|
||||
"B": pd.date_range("20130101", periods=3, tz="US/Eastern"),
|
||||
"C": pd.date_range("20130101", periods=3, tz="CET"),
|
||||
}
|
||||
)
|
||||
result.iloc[1, 1] = pd.NaT
|
||||
result.iloc[1, 2] = pd.NaT
|
||||
return result
|
||||
|
||||
@cache_readonly
|
||||
def empty(self):
|
||||
return pd.DataFrame()
|
||||
|
||||
@cache_readonly
|
||||
def ts1(self):
|
||||
return tm.makeTimeSeries(nper=30)
|
||||
|
||||
@cache_readonly
|
||||
def ts2(self):
|
||||
return tm.makeTimeSeries(nper=30)[5:]
|
||||
|
||||
@cache_readonly
|
||||
def simple(self):
|
||||
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
|
||||
|
||||
return pd.DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])
|
||||
|
||||
|
||||
# self.ts3 = tm.makeTimeSeries()[-5:]
|
||||
# self.ts4 = tm.makeTimeSeries()[1:-1]
|
||||
|
||||
|
||||
def _check_mixed_float(df, dtype=None):
|
||||
# float16 are most likely to be upcasted to float32
|
||||
dtypes = dict(A="float32", B="float32", C="float16", D="float64")
|
||||
if isinstance(dtype, str):
|
||||
dtypes = {k: dtype for k, v in dtypes.items()}
|
||||
elif isinstance(dtype, dict):
|
||||
dtypes.update(dtype)
|
||||
if dtypes.get("A"):
|
||||
assert df.dtypes["A"] == dtypes["A"]
|
||||
if dtypes.get("B"):
|
||||
assert df.dtypes["B"] == dtypes["B"]
|
||||
if dtypes.get("C"):
|
||||
assert df.dtypes["C"] == dtypes["C"]
|
||||
if dtypes.get("D"):
|
||||
assert df.dtypes["D"] == dtypes["D"]
|
||||
|
||||
|
||||
def _check_mixed_int(df, dtype=None):
|
||||
dtypes = dict(A="int32", B="uint64", C="uint8", D="int64")
|
||||
if isinstance(dtype, str):
|
||||
dtypes = {k: dtype for k, v in dtypes.items()}
|
||||
elif isinstance(dtype, dict):
|
||||
dtypes.update(dtype)
|
||||
if dtypes.get("A"):
|
||||
assert df.dtypes["A"] == dtypes["A"]
|
||||
if dtypes.get("B"):
|
||||
assert df.dtypes["B"] == dtypes["B"]
|
||||
if dtypes.get("C"):
|
||||
assert df.dtypes["C"] == dtypes["C"]
|
||||
if dtypes.get("D"):
|
||||
assert df.dtypes["D"] == dtypes["D"]
|
||||
330
venv/lib/python3.6/site-packages/pandas/tests/frame/conftest.py
Normal file
330
venv/lib/python3.6/site-packages/pandas/tests/frame/conftest.py
Normal file
@@ -0,0 +1,330 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, NaT, date_range
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def float_frame_with_na():
|
||||
"""
|
||||
Fixture for DataFrame of floats with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D']; some entries are missing
|
||||
|
||||
A B C D
|
||||
ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997
|
||||
DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872
|
||||
neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522
|
||||
0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018
|
||||
3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826
|
||||
soujjZ0A08 NaN NaN NaN NaN
|
||||
7W6NLGsjB9 NaN NaN NaN NaN
|
||||
... ... ... ... ...
|
||||
uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590
|
||||
n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717
|
||||
ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189
|
||||
uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503
|
||||
3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947
|
||||
2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083
|
||||
sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517
|
||||
|
||||
[30 rows x 4 columns]
|
||||
"""
|
||||
df = DataFrame(tm.getSeriesData())
|
||||
# set some NAs
|
||||
df.loc[5:10] = np.nan
|
||||
df.loc[15:20, -2:] = np.nan
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def bool_frame_with_na():
|
||||
"""
|
||||
Fixture for DataFrame of booleans with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D']; some entries are missing
|
||||
|
||||
A B C D
|
||||
zBZxY2IDGd False False False False
|
||||
IhBWBMWllt False True True True
|
||||
ctjdvZSR6R True False True True
|
||||
AVTujptmxb False True False True
|
||||
G9lrImrSWq False False False True
|
||||
sFFwdIUfz2 NaN NaN NaN NaN
|
||||
s15ptEJnRb NaN NaN NaN NaN
|
||||
... ... ... ... ...
|
||||
UW41KkDyZ4 True True False False
|
||||
l9l6XkOdqV True False False False
|
||||
X2MeZfzDYA False True False False
|
||||
xWkIKU7vfX False True False True
|
||||
QOhL6VmpGU False False False True
|
||||
22PwkRJdat False True False False
|
||||
kfboQ3VeIK True False True False
|
||||
|
||||
[30 rows x 4 columns]
|
||||
"""
|
||||
df = DataFrame(tm.getSeriesData()) > 0
|
||||
df = df.astype(object)
|
||||
# set some NAs
|
||||
df.loc[5:10] = np.nan
|
||||
df.loc[15:20, -2:] = np.nan
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def int_frame():
|
||||
"""
|
||||
Fixture for DataFrame of ints with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D']
|
||||
|
||||
A B C D
|
||||
vpBeWjM651 1 0 1 0
|
||||
5JyxmrP1En -1 0 0 0
|
||||
qEDaoD49U2 -1 1 0 0
|
||||
m66TkTfsFe 0 0 0 0
|
||||
EHPaNzEUFm -1 0 -1 0
|
||||
fpRJCevQhi 2 0 0 0
|
||||
OlQvnmfi3Q 0 0 -2 0
|
||||
... .. .. .. ..
|
||||
uB1FPlz4uP 0 0 0 1
|
||||
EcSe6yNzCU 0 0 -1 0
|
||||
L50VudaiI8 -1 1 -2 0
|
||||
y3bpw4nwIp 0 -1 0 0
|
||||
H0RdLLwrCT 1 1 0 0
|
||||
rY82K0vMwm 0 0 0 0
|
||||
1OPIUjnkjk 2 0 0 0
|
||||
|
||||
[30 rows x 4 columns]
|
||||
"""
|
||||
df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()})
|
||||
# force these all to int64 to avoid platform testing issues
|
||||
return DataFrame({c: s for c, s in df.items()}, dtype=np.int64)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def datetime_frame():
|
||||
"""
|
||||
Fixture for DataFrame of floats with DatetimeIndex
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D']
|
||||
|
||||
A B C D
|
||||
2000-01-03 -1.122153 0.468535 0.122226 1.693711
|
||||
2000-01-04 0.189378 0.486100 0.007864 -1.216052
|
||||
2000-01-05 0.041401 -0.835752 -0.035279 -0.414357
|
||||
2000-01-06 0.430050 0.894352 0.090719 0.036939
|
||||
2000-01-07 -0.620982 -0.668211 -0.706153 1.466335
|
||||
2000-01-10 -0.752633 0.328434 -0.815325 0.699674
|
||||
2000-01-11 -2.236969 0.615737 -0.829076 -1.196106
|
||||
... ... ... ... ...
|
||||
2000-02-03 1.642618 -0.579288 0.046005 1.385249
|
||||
2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351
|
||||
2000-02-07 -2.656149 -0.601387 1.410148 0.444150
|
||||
2000-02-08 -1.201881 -1.289040 0.772992 -1.445300
|
||||
2000-02-09 1.377373 0.398619 1.008453 -0.928207
|
||||
2000-02-10 0.473194 -0.636677 0.984058 0.511519
|
||||
2000-02-11 -0.965556 0.408313 -1.312844 -0.381948
|
||||
|
||||
[30 rows x 4 columns]
|
||||
"""
|
||||
return DataFrame(tm.getTimeSeriesData())
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def float_string_frame():
|
||||
"""
|
||||
Fixture for DataFrame of floats and strings with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D', 'foo'].
|
||||
|
||||
A B C D foo
|
||||
w3orJvq07g -1.594062 -1.084273 -1.252457 0.356460 bar
|
||||
PeukuVdmz2 0.109855 -0.955086 -0.809485 0.409747 bar
|
||||
ahp2KvwiM8 -1.533729 -0.142519 -0.154666 1.302623 bar
|
||||
3WSJ7BUCGd 2.484964 0.213829 0.034778 -2.327831 bar
|
||||
khdAmufk0U -0.193480 -0.743518 -0.077987 0.153646 bar
|
||||
LE2DZiFlrE -0.193566 -1.343194 -0.107321 0.959978 bar
|
||||
HJXSJhVn7b 0.142590 1.257603 -0.659409 -0.223844 bar
|
||||
... ... ... ... ... ...
|
||||
9a1Vypttgw -1.316394 1.601354 0.173596 1.213196 bar
|
||||
h5d1gVFbEy 0.609475 1.106738 -0.155271 0.294630 bar
|
||||
mK9LsTQG92 1.303613 0.857040 -1.019153 0.369468 bar
|
||||
oOLksd9gKH 0.558219 -0.134491 -0.289869 -0.951033 bar
|
||||
9jgoOjKyHg 0.058270 -0.496110 -0.413212 -0.852659 bar
|
||||
jZLDHclHAO 0.096298 1.267510 0.549206 -0.005235 bar
|
||||
lR0nxDp1C2 -2.119350 -0.794384 0.544118 0.145849 bar
|
||||
|
||||
[30 rows x 5 columns]
|
||||
"""
|
||||
df = DataFrame(tm.getSeriesData())
|
||||
df["foo"] = "bar"
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mixed_float_frame():
|
||||
"""
|
||||
Fixture for DataFrame of different float types with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D'].
|
||||
|
||||
A B C D
|
||||
GI7bbDaEZe -0.237908 -0.246225 -0.468506 0.752993
|
||||
KGp9mFepzA -1.140809 -0.644046 -1.225586 0.801588
|
||||
VeVYLAb1l2 -1.154013 -1.677615 0.690430 -0.003731
|
||||
kmPME4WKhO 0.979578 0.998274 -0.776367 0.897607
|
||||
CPyopdXTiz 0.048119 -0.257174 0.836426 0.111266
|
||||
0kJZQndAj0 0.274357 -0.281135 -0.344238 0.834541
|
||||
tqdwQsaHG8 -0.979716 -0.519897 0.582031 0.144710
|
||||
... ... ... ... ...
|
||||
7FhZTWILQj -2.906357 1.261039 -0.780273 -0.537237
|
||||
4pUDPM4eGq -2.042512 -0.464382 -0.382080 1.132612
|
||||
B8dUgUzwTi -1.506637 -0.364435 1.087891 0.297653
|
||||
hErlVYjVv9 1.477453 -0.495515 -0.713867 1.438427
|
||||
1BKN3o7YLs 0.127535 -0.349812 -0.881836 0.489827
|
||||
9S4Ekn7zga 1.445518 -2.095149 0.031982 0.373204
|
||||
xN1dNn6OV6 1.425017 -0.983995 -0.363281 -0.224502
|
||||
|
||||
[30 rows x 4 columns]
|
||||
"""
|
||||
df = DataFrame(tm.getSeriesData())
|
||||
df.A = df.A.astype("float32")
|
||||
df.B = df.B.astype("float32")
|
||||
df.C = df.C.astype("float16")
|
||||
df.D = df.D.astype("float64")
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mixed_int_frame():
|
||||
"""
|
||||
Fixture for DataFrame of different int types with index of unique strings
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D'].
|
||||
|
||||
A B C D
|
||||
mUrCZ67juP 0 1 2 2
|
||||
rw99ACYaKS 0 1 0 0
|
||||
7QsEcpaaVU 0 1 1 1
|
||||
xkrimI2pcE 0 1 0 0
|
||||
dz01SuzoS8 0 1 255 255
|
||||
ccQkqOHX75 -1 1 0 0
|
||||
DN0iXaoDLd 0 1 0 0
|
||||
... .. .. ... ...
|
||||
Dfb141wAaQ 1 1 254 254
|
||||
IPD8eQOVu5 0 1 0 0
|
||||
CcaKulsCmv 0 1 0 0
|
||||
rIBa8gu7E5 0 1 0 0
|
||||
RP6peZmh5o 0 1 1 1
|
||||
NMb9pipQWQ 0 1 0 0
|
||||
PqgbJEzjib 0 1 3 3
|
||||
|
||||
[30 rows x 4 columns]
|
||||
"""
|
||||
df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()})
|
||||
df.A = df.A.astype("int32")
|
||||
df.B = np.ones(len(df.B), dtype="uint64")
|
||||
df.C = df.C.astype("uint8")
|
||||
df.D = df.C.astype("int64")
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mixed_type_frame():
|
||||
"""
|
||||
Fixture for DataFrame of float/int/string columns with RangeIndex
|
||||
Columns are ['a', 'b', 'c', 'float32', 'int32'].
|
||||
"""
|
||||
return DataFrame(
|
||||
{
|
||||
"a": 1.0,
|
||||
"b": 2,
|
||||
"c": "foo",
|
||||
"float32": np.array([1.0] * 10, dtype="float32"),
|
||||
"int32": np.array([1] * 10, dtype="int32"),
|
||||
},
|
||||
index=np.arange(10),
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def timezone_frame():
|
||||
"""
|
||||
Fixture for DataFrame of date_range Series with different time zones
|
||||
|
||||
Columns are ['A', 'B', 'C']; some entries are missing
|
||||
|
||||
A B C
|
||||
0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00
|
||||
1 2013-01-02 NaT NaT
|
||||
2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00
|
||||
"""
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": date_range("20130101", periods=3),
|
||||
"B": date_range("20130101", periods=3, tz="US/Eastern"),
|
||||
"C": date_range("20130101", periods=3, tz="CET"),
|
||||
}
|
||||
)
|
||||
df.iloc[1, 1] = NaT
|
||||
df.iloc[1, 2] = NaT
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def uint64_frame():
|
||||
"""
|
||||
Fixture for DataFrame with uint64 values
|
||||
|
||||
Columns are ['A', 'B']
|
||||
"""
|
||||
return DataFrame(
|
||||
{"A": np.arange(3), "B": [2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10]}, dtype=np.uint64
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def simple_frame():
|
||||
"""
|
||||
Fixture for simple 3x3 DataFrame
|
||||
|
||||
Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].
|
||||
|
||||
one two three
|
||||
a 1.0 2.0 3.0
|
||||
b 4.0 5.0 6.0
|
||||
c 7.0 8.0 9.0
|
||||
"""
|
||||
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
|
||||
|
||||
return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def frame_of_index_cols():
|
||||
"""
|
||||
Fixture for DataFrame of columns that can be used for indexing
|
||||
|
||||
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
|
||||
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
|
||||
|
||||
A B C D E (tuple, as, label)
|
||||
0 foo one a 0.608477 -0.012500 -1.664297
|
||||
1 foo two b -0.633460 0.249614 -0.364411
|
||||
2 foo three c 0.615256 2.154968 -0.834666
|
||||
3 bar one d 0.234246 1.085675 0.718445
|
||||
4 bar two e 0.533841 -0.005702 -3.533912
|
||||
"""
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": ["foo", "foo", "foo", "bar", "bar"],
|
||||
"B": ["one", "two", "three", "one", "two"],
|
||||
"C": ["a", "b", "c", "d", "e"],
|
||||
"D": np.random.randn(5),
|
||||
"E": np.random.randn(5),
|
||||
("tuple", "as", "label"): np.random.randn(5),
|
||||
}
|
||||
)
|
||||
return df
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
589
venv/lib/python3.6/site-packages/pandas/tests/frame/test_api.py
Normal file
589
venv/lib/python3.6/site-packages/pandas/tests/frame/test_api.py
Normal file
@@ -0,0 +1,589 @@
|
||||
from copy import deepcopy
|
||||
import datetime
|
||||
import pydoc
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
SparseDataFrame,
|
||||
SparseDtype,
|
||||
compat,
|
||||
date_range,
|
||||
timedelta_range,
|
||||
)
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_almost_equal,
|
||||
assert_frame_equal,
|
||||
assert_series_equal,
|
||||
)
|
||||
|
||||
|
||||
class SharedWithSparse:
|
||||
"""
|
||||
A collection of tests DataFrame and SparseDataFrame can share.
|
||||
|
||||
In generic tests on this class, use ``self._assert_frame_equal()`` and
|
||||
``self._assert_series_equal()`` which are implemented in sub-classes
|
||||
and dispatch correctly.
|
||||
"""
|
||||
|
||||
def _assert_frame_equal(self, left, right):
|
||||
"""Dispatch to frame class dependent assertion"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _assert_series_equal(self, left, right):
|
||||
"""Dispatch to series class dependent assertion"""
|
||||
raise NotImplementedError
|
||||
|
||||
def test_copy_index_name_checking(self, float_frame):
|
||||
# don't want to be able to modify the index stored elsewhere after
|
||||
# making a copy
|
||||
for attr in ("index", "columns"):
|
||||
ind = getattr(float_frame, attr)
|
||||
ind.name = None
|
||||
cp = float_frame.copy()
|
||||
getattr(cp, attr).name = "foo"
|
||||
assert getattr(float_frame, attr).name is None
|
||||
|
||||
def test_getitem_pop_assign_name(self, float_frame):
|
||||
s = float_frame["A"]
|
||||
assert s.name == "A"
|
||||
|
||||
s = float_frame.pop("A")
|
||||
assert s.name == "A"
|
||||
|
||||
s = float_frame.loc[:, "B"]
|
||||
assert s.name == "B"
|
||||
|
||||
s2 = s.loc[:]
|
||||
assert s2.name == "B"
|
||||
|
||||
def test_get_value(self, float_frame):
|
||||
for idx in float_frame.index:
|
||||
for col in float_frame.columns:
|
||||
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
|
||||
result = float_frame.get_value(idx, col)
|
||||
expected = float_frame[col][idx]
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_add_prefix_suffix(self, float_frame):
|
||||
with_prefix = float_frame.add_prefix("foo#")
|
||||
expected = pd.Index(["foo#{c}".format(c=c) for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_prefix.columns, expected)
|
||||
|
||||
with_suffix = float_frame.add_suffix("#foo")
|
||||
expected = pd.Index(["{c}#foo".format(c=c) for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_suffix.columns, expected)
|
||||
|
||||
with_pct_prefix = float_frame.add_prefix("%")
|
||||
expected = pd.Index(["%{c}".format(c=c) for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_prefix.columns, expected)
|
||||
|
||||
with_pct_suffix = float_frame.add_suffix("%")
|
||||
expected = pd.Index(["{c}%".format(c=c) for c in float_frame.columns])
|
||||
tm.assert_index_equal(with_pct_suffix.columns, expected)
|
||||
|
||||
def test_get_axis(self, float_frame):
|
||||
f = float_frame
|
||||
assert f._get_axis_number(0) == 0
|
||||
assert f._get_axis_number(1) == 1
|
||||
assert f._get_axis_number("index") == 0
|
||||
assert f._get_axis_number("rows") == 0
|
||||
assert f._get_axis_number("columns") == 1
|
||||
|
||||
assert f._get_axis_name(0) == "index"
|
||||
assert f._get_axis_name(1) == "columns"
|
||||
assert f._get_axis_name("index") == "index"
|
||||
assert f._get_axis_name("rows") == "index"
|
||||
assert f._get_axis_name("columns") == "columns"
|
||||
|
||||
assert f._get_axis(0) is f.index
|
||||
assert f._get_axis(1) is f.columns
|
||||
|
||||
with pytest.raises(ValueError, match="No axis named"):
|
||||
f._get_axis_number(2)
|
||||
|
||||
with pytest.raises(ValueError, match="No axis.*foo"):
|
||||
f._get_axis_name("foo")
|
||||
|
||||
with pytest.raises(ValueError, match="No axis.*None"):
|
||||
f._get_axis_name(None)
|
||||
|
||||
with pytest.raises(ValueError, match="No axis named"):
|
||||
f._get_axis_number(None)
|
||||
|
||||
def test_keys(self, float_frame):
|
||||
getkeys = float_frame.keys
|
||||
assert getkeys() is float_frame.columns
|
||||
|
||||
def test_column_contains_raises(self, float_frame):
|
||||
with pytest.raises(TypeError, match="unhashable type: 'Index'"):
|
||||
float_frame.columns in float_frame
|
||||
|
||||
def test_tab_completion(self):
|
||||
# DataFrame whose columns are identifiers shall have them in __dir__.
|
||||
df = pd.DataFrame([list("abcd"), list("efgh")], columns=list("ABCD"))
|
||||
for key in list("ABCD"):
|
||||
assert key in dir(df)
|
||||
assert isinstance(df.__getitem__("A"), pd.Series)
|
||||
|
||||
# DataFrame whose first-level columns are identifiers shall have
|
||||
# them in __dir__.
|
||||
df = pd.DataFrame(
|
||||
[list("abcd"), list("efgh")],
|
||||
columns=pd.MultiIndex.from_tuples(list(zip("ABCD", "EFGH"))),
|
||||
)
|
||||
for key in list("ABCD"):
|
||||
assert key in dir(df)
|
||||
for key in list("EFGH"):
|
||||
assert key not in dir(df)
|
||||
assert isinstance(df.__getitem__("A"), pd.DataFrame)
|
||||
|
||||
def test_not_hashable(self):
|
||||
empty_frame = DataFrame()
|
||||
|
||||
df = self.klass([1])
|
||||
msg = "'(Sparse)?DataFrame' objects are mutable, thus they cannot be hashed"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
hash(df)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
hash(empty_frame)
|
||||
|
||||
def test_new_empty_index(self):
|
||||
df1 = self.klass(np.random.randn(0, 3))
|
||||
df2 = self.klass(np.random.randn(0, 3))
|
||||
df1.index.name = "foo"
|
||||
assert df2.index.name is None
|
||||
|
||||
def test_array_interface(self, float_frame):
|
||||
with np.errstate(all="ignore"):
|
||||
result = np.sqrt(float_frame)
|
||||
assert isinstance(result, type(float_frame))
|
||||
assert result.index is float_frame.index
|
||||
assert result.columns is float_frame.columns
|
||||
|
||||
self._assert_frame_equal(result, float_frame.apply(np.sqrt))
|
||||
|
||||
def test_get_agg_axis(self, float_frame):
|
||||
cols = float_frame._get_agg_axis(0)
|
||||
assert cols is float_frame.columns
|
||||
|
||||
idx = float_frame._get_agg_axis(1)
|
||||
assert idx is float_frame.index
|
||||
|
||||
msg = r"Axis must be 0 or 1 \(got 2\)"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
float_frame._get_agg_axis(2)
|
||||
|
||||
def test_nonzero(self, float_frame, float_string_frame):
|
||||
empty_frame = DataFrame()
|
||||
assert empty_frame.empty
|
||||
|
||||
assert not float_frame.empty
|
||||
assert not float_string_frame.empty
|
||||
|
||||
# corner case
|
||||
df = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}, index=np.arange(3))
|
||||
del df["A"]
|
||||
assert not df.empty
|
||||
|
||||
def test_iteritems(self):
|
||||
df = self.klass([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
|
||||
for k, v in df.items():
|
||||
assert isinstance(v, self.klass._constructor_sliced)
|
||||
|
||||
def test_items(self):
|
||||
# GH 17213, GH 13918
|
||||
cols = ["a", "b", "c"]
|
||||
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
|
||||
for c, (k, v) in zip(cols, df.items()):
|
||||
assert c == k
|
||||
assert isinstance(v, Series)
|
||||
assert (df[k] == v).all()
|
||||
|
||||
def test_iter(self, float_frame):
|
||||
assert tm.equalContents(list(float_frame), float_frame.columns)
|
||||
|
||||
def test_iterrows(self, float_frame, float_string_frame):
|
||||
for k, v in float_frame.iterrows():
|
||||
exp = float_frame.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
for k, v in float_string_frame.iterrows():
|
||||
exp = float_string_frame.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
def test_iterrows_iso8601(self):
|
||||
# GH 19671
|
||||
if self.klass == SparseDataFrame:
|
||||
pytest.xfail(reason="SparseBlock datetime type not implemented.")
|
||||
|
||||
s = self.klass(
|
||||
{
|
||||
"non_iso8601": ["M1701", "M1802", "M1903", "M2004"],
|
||||
"iso8601": date_range("2000-01-01", periods=4, freq="M"),
|
||||
}
|
||||
)
|
||||
for k, v in s.iterrows():
|
||||
exp = s.loc[k]
|
||||
self._assert_series_equal(v, exp)
|
||||
|
||||
def test_iterrows_corner(self):
|
||||
# gh-12222
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [datetime.datetime(2015, 1, 1)],
|
||||
"b": [None],
|
||||
"c": [None],
|
||||
"d": [""],
|
||||
"e": [[]],
|
||||
"f": [set()],
|
||||
"g": [{}],
|
||||
}
|
||||
)
|
||||
expected = Series(
|
||||
[datetime.datetime(2015, 1, 1), None, None, "", [], set(), {}],
|
||||
index=list("abcdefg"),
|
||||
name=0,
|
||||
dtype="object",
|
||||
)
|
||||
_, result = next(df.iterrows())
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_itertuples(self, float_frame):
|
||||
for i, tup in enumerate(float_frame.itertuples()):
|
||||
s = self.klass._constructor_sliced(tup[1:])
|
||||
s.name = tup[0]
|
||||
expected = float_frame.iloc[i, :].reset_index(drop=True)
|
||||
self._assert_series_equal(s, expected)
|
||||
|
||||
df = self.klass(
|
||||
{"floats": np.random.randn(5), "ints": range(5)}, columns=["floats", "ints"]
|
||||
)
|
||||
|
||||
for tup in df.itertuples(index=False):
|
||||
assert isinstance(tup[1], int)
|
||||
|
||||
df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
|
||||
dfaa = df[["a", "a"]]
|
||||
|
||||
assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]
|
||||
|
||||
# repr with int on 32-bit/windows
|
||||
if not (compat.is_platform_windows() or compat.is_platform_32bit()):
|
||||
assert (
|
||||
repr(list(df.itertuples(name=None)))
|
||||
== "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]"
|
||||
)
|
||||
|
||||
tup = next(df.itertuples(name="TestName"))
|
||||
assert tup._fields == ("Index", "a", "b")
|
||||
assert (tup.Index, tup.a, tup.b) == tup
|
||||
assert type(tup).__name__ == "TestName"
|
||||
|
||||
df.columns = ["def", "return"]
|
||||
tup2 = next(df.itertuples(name="TestName"))
|
||||
assert tup2 == (0, 1, 4)
|
||||
assert tup2._fields == ("Index", "_1", "_2")
|
||||
|
||||
df3 = DataFrame({"f" + str(i): [i] for i in range(1024)})
|
||||
# will raise SyntaxError if trying to create namedtuple
|
||||
tup3 = next(df3.itertuples())
|
||||
assert not hasattr(tup3, "_fields")
|
||||
assert isinstance(tup3, tuple)
|
||||
|
||||
def test_sequence_like_with_categorical(self):
|
||||
|
||||
# GH 7839
|
||||
# make sure can iterate
|
||||
df = DataFrame(
|
||||
{"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
|
||||
)
|
||||
df["grade"] = Categorical(df["raw_grade"])
|
||||
|
||||
# basic sequencing testing
|
||||
result = list(df.grade.values)
|
||||
expected = np.array(df.grade.values).tolist()
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
# iteration
|
||||
for t in df.itertuples(index=False):
|
||||
str(t)
|
||||
|
||||
for row, s in df.iterrows():
|
||||
str(s)
|
||||
|
||||
for c, col in df.items():
|
||||
str(s)
|
||||
|
||||
def test_len(self, float_frame):
|
||||
assert len(float_frame) == len(float_frame.index)
|
||||
|
||||
def test_values(self, float_frame, float_string_frame):
|
||||
frame = float_frame
|
||||
arr = frame.values
|
||||
|
||||
frame_cols = frame.columns
|
||||
for i, row in enumerate(arr):
|
||||
for j, value in enumerate(row):
|
||||
col = frame_cols[j]
|
||||
if np.isnan(value):
|
||||
assert np.isnan(frame[col][i])
|
||||
else:
|
||||
assert value == frame[col][i]
|
||||
|
||||
# mixed type
|
||||
arr = float_string_frame[["foo", "A"]].values
|
||||
assert arr[0, 0] == "bar"
|
||||
|
||||
df = self.klass({"complex": [1j, 2j, 3j], "real": [1, 2, 3]})
|
||||
arr = df.values
|
||||
assert arr[0, 0] == 1j
|
||||
|
||||
# single block corner case
|
||||
arr = float_frame[["A", "B"]].values
|
||||
expected = float_frame.reindex(columns=["A", "B"]).values
|
||||
assert_almost_equal(arr, expected)
|
||||
|
||||
def test_to_numpy(self):
|
||||
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
|
||||
expected = np.array([[1, 3], [2, 4.5]])
|
||||
result = df.to_numpy()
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_to_numpy_dtype(self):
|
||||
df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
|
||||
expected = np.array([[1, 3], [2, 4]], dtype="int64")
|
||||
result = df.to_numpy(dtype="int64")
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_to_numpy_copy(self):
|
||||
arr = np.random.randn(4, 3)
|
||||
df = pd.DataFrame(arr)
|
||||
assert df.values.base is arr
|
||||
assert df.to_numpy(copy=False).base is arr
|
||||
assert df.to_numpy(copy=True).base is None
|
||||
|
||||
def test_transpose(self, float_frame):
|
||||
frame = float_frame
|
||||
dft = frame.T
|
||||
for idx, series in dft.items():
|
||||
for col, value in series.items():
|
||||
if np.isnan(value):
|
||||
assert np.isnan(frame[col][idx])
|
||||
else:
|
||||
assert value == frame[col][idx]
|
||||
|
||||
# mixed type
|
||||
index, data = tm.getMixedTypeDict()
|
||||
mixed = self.klass(data, index=index)
|
||||
|
||||
mixed_T = mixed.T
|
||||
for col, s in mixed_T.items():
|
||||
assert s.dtype == np.object_
|
||||
|
||||
def test_swapaxes(self):
|
||||
df = self.klass(np.random.randn(10, 5))
|
||||
self._assert_frame_equal(df.T, df.swapaxes(0, 1))
|
||||
self._assert_frame_equal(df.T, df.swapaxes(1, 0))
|
||||
self._assert_frame_equal(df, df.swapaxes(0, 0))
|
||||
msg = (
|
||||
"No axis named 2 for object type"
|
||||
r" <class 'pandas.core(.sparse)?.frame.(Sparse)?DataFrame'>"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.swapaxes(2, 5)
|
||||
|
||||
def test_axis_aliases(self, float_frame):
|
||||
f = float_frame
|
||||
|
||||
# reg name
|
||||
expected = f.sum(axis=0)
|
||||
result = f.sum(axis="index")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
expected = f.sum(axis=1)
|
||||
result = f.sum(axis="columns")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_class_axis(self):
|
||||
# GH 18147
|
||||
# no exception and no empty docstring
|
||||
assert pydoc.getdoc(DataFrame.index)
|
||||
assert pydoc.getdoc(DataFrame.columns)
|
||||
|
||||
def test_more_values(self, float_string_frame):
|
||||
values = float_string_frame.values
|
||||
assert values.shape[1] == len(float_string_frame.columns)
|
||||
|
||||
def test_repr_with_mi_nat(self, float_string_frame):
|
||||
df = self.klass(
|
||||
{"X": [1, 2]}, index=[[pd.NaT, pd.Timestamp("20130101")], ["a", "b"]]
|
||||
)
|
||||
result = repr(df)
|
||||
expected = " X\nNaT a 1\n2013-01-01 b 2"
|
||||
assert result == expected
|
||||
|
||||
def test_items_names(self, float_string_frame):
|
||||
for k, v in float_string_frame.items():
|
||||
assert v.name == k
|
||||
|
||||
def test_series_put_names(self, float_string_frame):
|
||||
series = float_string_frame._series
|
||||
for k, v in series.items():
|
||||
assert v.name == k
|
||||
|
||||
def test_empty_nonzero(self):
|
||||
df = self.klass([1, 2, 3])
|
||||
assert not df.empty
|
||||
df = self.klass(index=[1], columns=[1])
|
||||
assert not df.empty
|
||||
df = self.klass(index=["a", "b"], columns=["c", "d"]).dropna()
|
||||
assert df.empty
|
||||
assert df.T.empty
|
||||
empty_frames = [
|
||||
self.klass(),
|
||||
self.klass(index=[1]),
|
||||
self.klass(columns=[1]),
|
||||
self.klass({1: []}),
|
||||
]
|
||||
for df in empty_frames:
|
||||
assert df.empty
|
||||
assert df.T.empty
|
||||
|
||||
def test_with_datetimelikes(self):
|
||||
|
||||
df = self.klass(
|
||||
{
|
||||
"A": date_range("20130101", periods=10),
|
||||
"B": timedelta_range("1 day", periods=10),
|
||||
}
|
||||
)
|
||||
t = df.T
|
||||
|
||||
result = t.dtypes.value_counts()
|
||||
if self.klass is DataFrame:
|
||||
expected = Series({np.dtype("object"): 10})
|
||||
else:
|
||||
expected = Series({SparseDtype(dtype=object): 10})
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameMisc(SharedWithSparse):
    """Dense-DataFrame run of the klass-agnostic SharedWithSparse suite.

    ``klass`` and the two assertion hooks bind the shared tests to the
    plain ``DataFrame`` implementation.
    """

    klass = DataFrame
    # SharedWithSparse tests use generic, klass-agnostic assertion
    _assert_frame_equal = staticmethod(assert_frame_equal)
    _assert_series_equal = staticmethod(assert_series_equal)

    def test_values(self, float_frame):
        """Writing into ``.values`` shows through in the frame (view semantics
        for this homogeneous float frame)."""
        float_frame.values[:, 0] = 5.0
        assert (float_frame.values[:, 0] == 5).all()

    def test_as_matrix_deprecated(self, float_frame):
        """``as_matrix`` is deprecated: must warn and still match ``.values``."""
        # GH 18458
        with tm.assert_produces_warning(FutureWarning):
            cols = float_frame.columns.tolist()
            result = float_frame.as_matrix(columns=cols)
        expected = float_frame.values
        tm.assert_numpy_array_equal(result, expected)

    def test_deepcopy(self, float_frame):
        """``deepcopy`` detaches data: mutating the copy leaves the original."""
        cp = deepcopy(float_frame)
        series = cp["A"]
        series[:] = 10
        for idx, value in series.items():
            assert float_frame["A"][idx] != value

    def test_transpose_get_view(self, float_frame):
        """The transpose shares data: writes through ``df.T.values`` are
        visible in the original frame."""
        dft = float_frame.T
        dft.values[:, 5:10] = 5

        assert (float_frame.values[5:10] == 5).all()

    def test_inplace_return_self(self):
        """GH 1893: every ``inplace=True`` mutator must return ``None``."""
        # GH 1893

        data = DataFrame(
            {"a": ["foo", "bar", "baz", "qux"], "b": [0, 0, 1, 1], "c": [1, 2, 3, 4]}
        )

        def _check_f(base, f):
            # apply the in-place mutator and verify the None return contract
            result = f(base)
            assert result is None

        # -----DataFrame-----

        # set_index
        f = lambda x: x.set_index("a", inplace=True)
        _check_f(data.copy(), f)

        # reset_index
        f = lambda x: x.reset_index(inplace=True)
        _check_f(data.set_index("a"), f)

        # drop_duplicates
        f = lambda x: x.drop_duplicates(inplace=True)
        _check_f(data.copy(), f)

        # sort
        f = lambda x: x.sort_values("b", inplace=True)
        _check_f(data.copy(), f)

        # sort_index
        f = lambda x: x.sort_index(inplace=True)
        _check_f(data.copy(), f)

        # fillna
        f = lambda x: x.fillna(0, inplace=True)
        _check_f(data.copy(), f)

        # replace
        f = lambda x: x.replace(1, 0, inplace=True)
        _check_f(data.copy(), f)

        # rename
        f = lambda x: x.rename({1: "foo"}, inplace=True)
        _check_f(data.copy(), f)

        # -----Series-----
        d = data.copy()["c"]

        # reset_index
        f = lambda x: x.reset_index(inplace=True, drop=True)
        _check_f(data.set_index("a")["c"], f)

        # fillna
        f = lambda x: x.fillna(0, inplace=True)
        _check_f(d.copy(), f)

        # replace
        f = lambda x: x.replace(1, 0, inplace=True)
        _check_f(d.copy(), f)

        # rename
        f = lambda x: x.rename({1: "foo"}, inplace=True)
        _check_f(d.copy(), f)

    def test_tab_complete_warning(self, ip):
        """Completing ``df.`` in IPython must not emit warnings (GH 16409)."""
        # GH 16409
        pytest.importorskip("IPython", minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        code = "import pandas as pd; df = pd.DataFrame()"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter("ignore"):
                list(ip.Completer.completions("df.", 1))

    def test_get_values_deprecated(self):
        """``get_values`` is deprecated: must warn and still match ``.values``."""
        df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        with tm.assert_produces_warning(FutureWarning):
            res = df.get_values()
        tm.assert_numpy_array_equal(res, df.values)
||||
1338
venv/lib/python3.6/site-packages/pandas/tests/frame/test_apply.py
Normal file
1338
venv/lib/python3.6/site-packages/pandas/tests/frame/test_apply.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,644 @@
|
||||
from collections import deque
|
||||
from datetime import datetime
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int
|
||||
import pandas.util.testing as tm
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Comparisons
|
||||
|
||||
|
||||
class TestFrameComparisons:
    """Tests for DataFrame rich-comparison dunders (``==``, ``<``, ...)."""

    # Specifically _not_ flex-comparisons

    def test_comparison_invalid(self):
        """Mixed datetime/int frames: ``==``/``!=`` work element-wise,
        ordered comparisons raise TypeError (GH4968)."""

        def check(df, df2):

            for (x, y) in [(df, df2), (df2, df)]:
                # we expect the result to match Series comparisons for
                # == and !=, inequalities should raise
                result = x == y
                expected = pd.DataFrame(
                    {col: x[col] == y[col] for col in x.columns},
                    index=x.index,
                    columns=x.columns,
                )
                tm.assert_frame_equal(result, expected)

                result = x != y
                expected = pd.DataFrame(
                    {col: x[col] != y[col] for col in x.columns},
                    index=x.index,
                    columns=x.columns,
                )
                tm.assert_frame_equal(result, expected)

                with pytest.raises(TypeError):
                    x >= y
                with pytest.raises(TypeError):
                    x > y
                with pytest.raises(TypeError):
                    x < y
                with pytest.raises(TypeError):
                    x <= y

        # GH4968
        # invalid date/int comparisons
        df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=["a"])
        df["dates"] = pd.date_range("20010101", periods=len(df))

        df2 = df.copy()
        df2["dates"] = df["a"]
        check(df, df2)

        df = pd.DataFrame(np.random.randint(10, size=(10, 2)), columns=["a", "b"])
        df2 = pd.DataFrame(
            {
                "a": pd.date_range("20010101", periods=len(df)),
                "b": pd.date_range("20100101", periods=len(df)),
            }
        )
        check(df, df2)

    def test_timestamp_compare(self):
        """A Timestamp on the right-hand side mirrors the left-hand result."""
        # make sure we can compare Timestamps on the right AND left hand side
        # GH#4982
        df = pd.DataFrame(
            {
                "dates1": pd.date_range("20010101", periods=10),
                "dates2": pd.date_range("20010102", periods=10),
                "intcol": np.random.randint(1000000000, size=10),
                "floatcol": np.random.randn(10),
                "stringcol": list(tm.rands(10)),
            }
        )
        df.loc[np.random.rand(len(df)) > 0.5, "dates2"] = pd.NaT
        # each op paired with its mirror-image op for swapped operands
        ops = {"gt": "lt", "lt": "gt", "ge": "le", "le": "ge", "eq": "eq", "ne": "ne"}

        for left, right in ops.items():
            left_f = getattr(operator, left)
            right_f = getattr(operator, right)

            # no nats
            if left in ["eq", "ne"]:
                expected = left_f(df, pd.Timestamp("20010109"))
                result = right_f(pd.Timestamp("20010109"), df)
                tm.assert_frame_equal(result, expected)
            else:
                with pytest.raises(TypeError):
                    left_f(df, pd.Timestamp("20010109"))
                with pytest.raises(TypeError):
                    right_f(pd.Timestamp("20010109"), df)
            # nats
            expected = left_f(df, pd.Timestamp("nat"))
            result = right_f(pd.Timestamp("nat"), df)
            tm.assert_frame_equal(result, expected)

    def test_mixed_comparison(self):
        """dt64-like vs non-dt64 frames: ``==`` is all-False, ``!=`` all-True."""
        # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
        # not raise TypeError
        # (this appears to be fixed before GH#22163, not sure when)
        df = pd.DataFrame([["1989-08-01", 1], ["1989-08-01", 2]])
        other = pd.DataFrame([["a", "b"], ["c", "d"]])

        result = df == other
        assert not result.any().any()

        result = df != other
        assert result.all().all()

    def test_df_boolean_comparison_error(self):
        """Comparing against a len-matching list/tuple broadcasts by column."""
        # GH#4576, GH#22880
        # comparing DataFrame against list/tuple with len(obj) matching
        # len(df.columns) is supported as of GH#22800
        df = pd.DataFrame(np.arange(6).reshape((3, 2)))

        expected = pd.DataFrame([[False, False], [True, False], [False, False]])

        result = df == (2, 2)
        tm.assert_frame_equal(result, expected)

        result = df == [2, 2]
        tm.assert_frame_equal(result, expected)

    def test_df_float_none_comparison(self):
        """``df == None`` is all-False for a float frame, never an error."""
        df = pd.DataFrame(
            np.random.randn(8, 3), index=range(8), columns=["A", "B", "C"]
        )

        result = df.__eq__(None)
        assert not result.any().any()

    def test_df_string_comparison(self):
        """Boolean masks built from numeric and string comparisons both
        select the expected rows."""
        df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
        mask_a = df.a > 1
        tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
        tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])

        mask_b = df.b == "foo"
        tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
        tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])
|
||||
|
||||
class TestFrameFlexComparisons:
    """Tests for the flex comparison methods (eq/ne/gt/lt/ge/le)."""

    # TODO: test_bool_flex_frame needs a better name
    def test_bool_flex_frame(self):
        """Flex comparisons agree with the operator module for DataFrame,
        ndarray, scalar, Series and list operands, aligned or not."""
        data = np.random.randn(5, 3)
        other_data = np.random.randn(5, 3)
        df = pd.DataFrame(data)
        other = pd.DataFrame(other_data)
        ndim_5 = np.ones(df.shape + (1, 3))

        # Unaligned
        def _check_unaligned_frame(meth, op, df, other):
            # a partially-overlapping operand must align before comparing
            part_o = other.loc[3:, 1:].copy()
            rs = meth(part_o)
            xp = op(df, part_o.reindex(index=df.index, columns=df.columns))
            tm.assert_frame_equal(rs, xp)

        # DataFrame
        assert df.eq(df).values.all()
        assert not df.ne(df).values.any()
        for op in ["eq", "ne", "gt", "lt", "ge", "le"]:
            f = getattr(df, op)
            o = getattr(operator, op)
            # No NAs
            tm.assert_frame_equal(f(other), o(df, other))
            _check_unaligned_frame(f, o, df, other)
            # ndarray
            tm.assert_frame_equal(f(other.values), o(df, other.values))
            # scalar
            tm.assert_frame_equal(f(0), o(df, 0))
            # NAs
            msg = "Unable to coerce to Series/DataFrame"
            tm.assert_frame_equal(f(np.nan), o(df, np.nan))
            with pytest.raises(ValueError, match=msg):
                f(ndim_5)

        # Series
        def _test_seq(df, idx_ser, col_ser):
            # axis=0 broadcasts along the index, default along the columns
            idx_eq = df.eq(idx_ser, axis=0)
            col_eq = df.eq(col_ser)
            idx_ne = df.ne(idx_ser, axis=0)
            col_ne = df.ne(col_ser)
            tm.assert_frame_equal(col_eq, df == pd.Series(col_ser))
            tm.assert_frame_equal(col_eq, -col_ne)
            tm.assert_frame_equal(idx_eq, -idx_ne)
            tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T)
            tm.assert_frame_equal(col_eq, df.eq(list(col_ser)))
            tm.assert_frame_equal(idx_eq, df.eq(pd.Series(idx_ser), axis=0))
            tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0))

            idx_gt = df.gt(idx_ser, axis=0)
            col_gt = df.gt(col_ser)
            idx_le = df.le(idx_ser, axis=0)
            col_le = df.le(col_ser)

            tm.assert_frame_equal(col_gt, df > pd.Series(col_ser))
            tm.assert_frame_equal(col_gt, -col_le)
            tm.assert_frame_equal(idx_gt, -idx_le)
            tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T)

            idx_ge = df.ge(idx_ser, axis=0)
            col_ge = df.ge(col_ser)
            idx_lt = df.lt(idx_ser, axis=0)
            col_lt = df.lt(col_ser)
            tm.assert_frame_equal(col_ge, df >= pd.Series(col_ser))
            tm.assert_frame_equal(col_ge, -col_lt)
            tm.assert_frame_equal(idx_ge, -idx_lt)
            tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T)

        idx_ser = pd.Series(np.random.randn(5))
        col_ser = pd.Series(np.random.randn(3))
        _test_seq(df, idx_ser, col_ser)

        # list/tuple
        _test_seq(df, idx_ser.values, col_ser.values)

        # NA
        df.loc[0, 0] = np.nan
        rs = df.eq(df)
        assert not rs.loc[0, 0]
        rs = df.ne(df)
        assert rs.loc[0, 0]
        rs = df.gt(df)
        assert not rs.loc[0, 0]
        rs = df.lt(df)
        assert not rs.loc[0, 0]
        rs = df.ge(df)
        assert not rs.loc[0, 0]
        rs = df.le(df)
        assert not rs.loc[0, 0]

        # complex
        arr = np.array([np.nan, 1, 6, np.nan])
        arr2 = np.array([2j, np.nan, 7, None])
        df = pd.DataFrame({"a": arr})
        df2 = pd.DataFrame({"a": arr2})
        rs = df.gt(df2)
        assert not rs.values.any()
        rs = df.ne(df2)
        assert rs.values.all()

        arr3 = np.array([2j, np.nan, None])
        df3 = pd.DataFrame({"a": arr3})
        rs = df3.gt(2j)
        assert not rs.values.any()

        # corner, dtype=object
        df1 = pd.DataFrame({"col": ["foo", np.nan, "bar"]})
        df2 = pd.DataFrame({"col": ["foo", datetime.now(), "bar"]})
        result = df1.ne(df2)
        exp = pd.DataFrame({"col": [False, True, False]})
        tm.assert_frame_equal(result, exp)

    def test_flex_comparison_nat(self):
        """``eq``/``ne`` against NaT match ``==``/``!=``: False and True."""
        # GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT,
        # and _definitely_ not be NaN
        df = pd.DataFrame([pd.NaT])

        result = df == pd.NaT
        # result.iloc[0, 0] is a np.bool_ object
        assert result.iloc[0, 0].item() is False

        result = df.eq(pd.NaT)
        assert result.iloc[0, 0].item() is False

        result = df != pd.NaT
        assert result.iloc[0, 0].item() is True

        result = df.ne(pd.NaT)
        assert result.iloc[0, 0].item() is True

    @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
    def test_df_flex_cmp_constant_return_types(self, opname):
        """Flex comparison against a scalar yields all-bool columns."""
        # GH 15077, non-empty DataFrame
        df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
        const = 2

        result = getattr(df, opname)(const).dtypes.value_counts()
        tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)]))

    @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
    def test_df_flex_cmp_constant_return_types_empty(self, opname):
        """Same bool-dtype guarantee holds for a zero-row frame."""
        # GH 15077 empty DataFrame
        df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
        const = 2

        empty = df.iloc[:0]
        result = getattr(empty, opname)(const).dtypes.value_counts()
        tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)]))
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Arithmetic
|
||||
|
||||
|
||||
class TestFrameFlexArithmetic:
    """Tests for flex arithmetic methods (add/sub/... with axis/fill_value)."""

    def test_df_add_td64_columnwise(self):
        """``add(..., axis=0)`` broadcasts a timedelta Series down columns."""
        # GH 22534 Check that column-wise addition broadcasts correctly
        dti = pd.date_range("2016-01-01", periods=10)
        tdi = pd.timedelta_range("1", periods=10)
        tser = pd.Series(tdi)
        df = pd.DataFrame({0: dti, 1: tdi})

        result = df.add(tser, axis=0)
        expected = pd.DataFrame({0: dti + tdi, 1: tdi + tdi})
        tm.assert_frame_equal(result, expected)

    def test_df_add_flex_filled_mixed_dtypes(self):
        """``fill_value`` with a timedelta64 works across dt64/td64 columns."""
        # GH 19611
        dti = pd.date_range("2016-01-01", periods=3)
        ser = pd.Series(["1 Day", "NaT", "2 Days"], dtype="timedelta64[ns]")
        df = pd.DataFrame({"A": dti, "B": ser})
        other = pd.DataFrame({"A": ser, "B": ser})
        fill = pd.Timedelta(days=1).to_timedelta64()
        result = df.add(other, fill_value=fill)

        expected = pd.DataFrame(
            {
                "A": pd.Series(
                    ["2016-01-02", "2016-01-03", "2016-01-05"], dtype="datetime64[ns]"
                ),
                "B": ser * 2,
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_arith_flex_frame(
        self, all_arithmetic_operators, float_frame, mixed_float_frame
    ):
        """Each flex method matches the corresponding dunder/operator."""
        # one instance of parametrized fixture
        op = all_arithmetic_operators

        def f(x, y):
            # r-versions not in operator-stdlib; get op without "r" and invert
            if op.startswith("__r"):
                return getattr(operator, op.replace("__r", "__"))(y, x)
            return getattr(operator, op)(x, y)

        result = getattr(float_frame, op)(2 * float_frame)
        expected = f(float_frame, 2 * float_frame)
        tm.assert_frame_equal(result, expected)

        # vs mix float
        result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
        expected = f(mixed_float_frame, 2 * mixed_float_frame)
        tm.assert_frame_equal(result, expected)
        _check_mixed_float(result, dtype=dict(C=None))

    @pytest.mark.parametrize("op", ["__add__", "__sub__", "__mul__"])
    def test_arith_flex_frame_mixed(
        self, op, int_frame, mixed_int_frame, mixed_float_frame
    ):
        """Arithmetic on mixed int/uint/float frames preserves dtypes."""
        f = getattr(operator, op)

        # vs mix int
        result = getattr(mixed_int_frame, op)(2 + mixed_int_frame)
        expected = f(mixed_int_frame, 2 + mixed_int_frame)

        # no overflow in the uint
        dtype = None
        if op in ["__sub__"]:
            dtype = dict(B="uint64", C=None)
        elif op in ["__add__", "__mul__"]:
            dtype = dict(C=None)
        tm.assert_frame_equal(result, expected)
        _check_mixed_int(result, dtype=dtype)

        # vs mix float
        result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
        expected = f(mixed_float_frame, 2 * mixed_float_frame)
        tm.assert_frame_equal(result, expected)
        _check_mixed_float(result, dtype=dict(C=None))

        # vs plain int
        result = getattr(int_frame, op)(2 * int_frame)
        expected = f(int_frame, 2 * int_frame)
        tm.assert_frame_equal(result, expected)

    def test_arith_flex_frame_raise(self, all_arithmetic_operators, float_frame):
        """Operands of dimension >= 3 are rejected with ValueError."""
        # one instance of parametrized fixture
        op = all_arithmetic_operators

        # Check that arrays with dim >= 3 raise
        for dim in range(3, 6):
            arr = np.ones((1,) * dim)
            msg = "Unable to coerce to Series/DataFrame"
            with pytest.raises(ValueError, match=msg):
                getattr(float_frame, op)(arr)

    def test_arith_flex_frame_corner(self, float_frame):
        """Corner cases: scalar add, empty operands, fill_value + Series."""

        const_add = float_frame.add(1)
        tm.assert_frame_equal(const_add, float_frame + 1)

        # corner cases
        result = float_frame.add(float_frame[:0])
        tm.assert_frame_equal(result, float_frame * np.nan)

        result = float_frame[:0].add(float_frame)
        tm.assert_frame_equal(result, float_frame * np.nan)

        with pytest.raises(NotImplementedError, match="fill_value"):
            float_frame.add(float_frame.iloc[0], fill_value=3)

        with pytest.raises(NotImplementedError, match="fill_value"):
            float_frame.add(float_frame.iloc[0], axis="index", fill_value=3)

    def test_arith_flex_series(self, simple_frame):
        """Flex methods with a Series operand match operator semantics on
        both axes, including index-wise broadcasting (GH 7325)."""
        df = simple_frame

        row = df.xs("a")
        col = df["two"]
        # after arithmetic refactor, add truediv here
        ops = ["add", "sub", "mul", "mod"]
        for op in ops:
            f = getattr(df, op)
            op = getattr(operator, op)
            tm.assert_frame_equal(f(row), op(df, row))
            tm.assert_frame_equal(f(col, axis=0), op(df.T, col).T)

        # special case for some reason
        tm.assert_frame_equal(df.add(row, axis=None), df + row)

        # cases which will be refactored after big arithmetic refactor
        tm.assert_frame_equal(df.div(row), df / row)
        tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T)

        # broadcasting issue in GH 7325
        df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="int64")
        expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis="index")
        tm.assert_frame_equal(result, expected)

        df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="float64")
        expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis="index")
        tm.assert_frame_equal(result, expected)

    def test_arith_flex_zero_len_raises(self):
        """``fill_value`` must raise even in the zero-length special cases."""
        # GH 19522 passing fill_value to frame flex arith methods should
        # raise even in the zero-length special cases
        ser_len0 = pd.Series([])
        df_len0 = pd.DataFrame(columns=["A", "B"])
        df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])

        with pytest.raises(NotImplementedError, match="fill_value"):
            df.add(ser_len0, fill_value="E")

        with pytest.raises(NotImplementedError, match="fill_value"):
            df_len0.sub(df["A"], axis=None, fill_value=3)
||||
|
||||
class TestFrameArithmetic:
    """Tests for dunder arithmetic with ndarray/scalar/sequence operands."""

    def test_df_add_2d_array_rowlike_broadcasts(self):
        """A (1, ncols) ndarray broadcasts row-wise on both operand sides."""
        # GH#23000
        arr = np.arange(6).reshape(3, 2)
        df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"])

        rowlike = arr[[1], :]  # shape --> (1, ncols)
        assert rowlike.shape == (1, df.shape[1])

        expected = pd.DataFrame(
            [[2, 4], [4, 6], [6, 8]],
            columns=df.columns,
            index=df.index,
            # specify dtype explicitly to avoid failing
            # on 32bit builds
            dtype=arr.dtype,
        )
        result = df + rowlike
        tm.assert_frame_equal(result, expected)
        result = rowlike + df
        tm.assert_frame_equal(result, expected)

    def test_df_add_2d_array_collike_broadcasts(self):
        """A (nrows, 1) ndarray broadcasts column-wise on both operand sides."""
        # GH#23000
        arr = np.arange(6).reshape(3, 2)
        df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"])

        collike = arr[:, [1]]  # shape --> (nrows, 1)
        assert collike.shape == (df.shape[0], 1)

        expected = pd.DataFrame(
            [[1, 2], [5, 6], [9, 10]],
            columns=df.columns,
            index=df.index,
            # specify dtype explicitly to avoid failing
            # on 32bit builds
            dtype=arr.dtype,
        )
        result = df + collike
        tm.assert_frame_equal(result, expected)
        result = collike + df
        tm.assert_frame_equal(result, expected)

    def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators):
        """Every arithmetic dunder broadcasts a (1, ncols) operand row-wise,
        matching the equivalent row-by-row Series operations."""
        # GH#23000
        opname = all_arithmetic_operators

        arr = np.arange(6).reshape(3, 2)
        df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"])

        rowlike = arr[[1], :]  # shape --> (1, ncols)
        assert rowlike.shape == (1, df.shape[1])

        exvals = [
            getattr(df.loc["A"], opname)(rowlike.squeeze()),
            getattr(df.loc["B"], opname)(rowlike.squeeze()),
            getattr(df.loc["C"], opname)(rowlike.squeeze()),
        ]

        expected = pd.DataFrame(exvals, columns=df.columns, index=df.index)

        if opname in ["__rmod__", "__rfloordiv__"]:
            # exvals will have dtypes [f8, i8, i8] so expected will be
            # all-f8, but the DataFrame operation will return mixed dtypes
            # use exvals[-1].dtype instead of "i8" for compat with 32-bit
            # systems/pythons
            expected[False] = expected[False].astype(exvals[-1].dtype)

        result = getattr(df, opname)(rowlike)
        tm.assert_frame_equal(result, expected)

    def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators):
        """Every arithmetic dunder broadcasts a (nrows, 1) operand
        column-wise, matching the equivalent per-column Series operations."""
        # GH#23000
        opname = all_arithmetic_operators

        arr = np.arange(6).reshape(3, 2)
        df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"])

        collike = arr[:, [1]]  # shape --> (nrows, 1)
        assert collike.shape == (df.shape[0], 1)

        exvals = {
            True: getattr(df[True], opname)(collike.squeeze()),
            False: getattr(df[False], opname)(collike.squeeze()),
        }

        dtype = None
        if opname in ["__rmod__", "__rfloordiv__"]:
            # Series ops may return mixed int/float dtypes in cases where
            # DataFrame op will return all-float. So we upcast `expected`
            dtype = np.common_type(*[x.values for x in exvals.values()])

        expected = pd.DataFrame(exvals, columns=df.columns, index=df.index, dtype=dtype)

        result = getattr(df, opname)(collike)
        tm.assert_frame_equal(result, expected)

    def test_df_bool_mul_int(self):
        """Multiplying a bool frame by 1 yields integer, not object, dtype."""
        # GH 22047, GH 22163 multiplication by 1 should result in int dtype,
        # not object dtype
        df = pd.DataFrame([[False, True], [False, False]])
        result = df * 1

        # On appveyor this comes back as np.int32 instead of np.int64,
        # so we check dtype.kind instead of just dtype
        kinds = result.dtypes.apply(lambda x: x.kind)
        assert (kinds == "i").all()

        result = 1 * df
        kinds = result.dtypes.apply(lambda x: x.kind)
        assert (kinds == "i").all()

    def test_arith_mixed(self):
        """Adding mixed str/int frames concatenates strings, adds ints."""

        left = pd.DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]})

        result = left + left
        expected = pd.DataFrame({"A": ["aa", "bb", "cc"], "B": [2, 4, 6]})
        tm.assert_frame_equal(result, expected)

    def test_arith_getitem_commute(self):
        """Frame-level arithmetic commutes with column selection."""
        df = pd.DataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]})

        def _test_op(df, op):
            result = op(df, 1)

            if not df.columns.is_unique:
                raise ValueError("Only unique columns supported by this test")

            for col in result.columns:
                tm.assert_series_equal(result[col], op(df[col], 1))

        _test_op(df, operator.add)
        _test_op(df, operator.sub)
        _test_op(df, operator.mul)
        _test_op(df, operator.truediv)
        _test_op(df, operator.floordiv)
        _test_op(df, operator.pow)

        _test_op(df, lambda x, y: y + x)
        _test_op(df, lambda x, y: y - x)
        _test_op(df, lambda x, y: y * x)
        _test_op(df, lambda x, y: y / x)
        _test_op(df, lambda x, y: y ** x)

        _test_op(df, lambda x, y: x + y)
        _test_op(df, lambda x, y: x - y)
        _test_op(df, lambda x, y: x * y)
        _test_op(df, lambda x, y: x / y)
        _test_op(df, lambda x, y: x ** y)

    @pytest.mark.parametrize(
        "values", [[1, 2], (1, 2), np.array([1, 2]), range(1, 3), deque([1, 2])]
    )
    def test_arith_alignment_non_pandas_object(self, values):
        """Any length-matching sequence broadcasts across columns (GH#17901)."""
        # GH#17901
        df = pd.DataFrame({"A": [1, 1], "B": [1, 1]})
        expected = pd.DataFrame({"A": [2, 2], "B": [3, 3]})
        result = df + values
        tm.assert_frame_equal(result, expected)

    def test_arith_non_pandas_object(self):
        """ndarray/list operands broadcast along the requested axis."""
        df = pd.DataFrame(
            np.arange(1, 10, dtype="f8").reshape(3, 3),
            columns=["one", "two", "three"],
            index=["a", "b", "c"],
        )

        val1 = df.xs("a").values
        added = pd.DataFrame(df.values + val1, index=df.index, columns=df.columns)
        tm.assert_frame_equal(df + val1, added)

        added = pd.DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns)
        tm.assert_frame_equal(df.add(val1, axis=0), added)

        val2 = list(df["two"])

        added = pd.DataFrame(df.values + val2, index=df.index, columns=df.columns)
        tm.assert_frame_equal(df + val2, added)

        added = pd.DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns)
        tm.assert_frame_equal(df.add(val2, axis="index"), added)

        val3 = np.random.rand(*df.shape)
        added = pd.DataFrame(df.values + val3, index=df.index, columns=df.columns)
        tm.assert_frame_equal(df.add(val3), added)
||||
134
venv/lib/python3.6/site-packages/pandas/tests/frame/test_asof.py
Normal file
134
venv/lib/python3.6/site-packages/pandas/tests/frame/test_asof.py
Normal file
@@ -0,0 +1,134 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Series, Timestamp, date_range, to_datetime
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def date_range_frame():
    """
    Fixture for DataFrame of ints with date_range index

    Columns are ['A', 'B'].
    """
    size = 50
    index = date_range("1/1/1990", periods=size, freq="53s")
    data = np.arange(size)
    return DataFrame({"A": data, "B": data}, index=index)
||||
|
||||
|
||||
class TestFrameAsof:
    """Tests for ``DataFrame.asof``."""

    def test_basic(self, date_range_frame):
        """``asof`` forward-fills over NaN stretches; inside the blanked-out
        window every row resolves to the last valid value (14)."""
        df = date_range_frame
        N = 50
        df.loc[15:30, "A"] = np.nan
        dates = date_range("1/1/1990", periods=N * 3, freq="25s")

        result = df.asof(dates)
        assert result.notna().all(1).all()
        lb = df.index[14]
        ub = df.index[30]

        # a plain list of timestamps is accepted as well
        dates = list(dates)
        result = df.asof(dates)
        assert result.notna().all(1).all()

        mask = (result.index >= lb) & (result.index < ub)
        rs = result[mask]
        assert (rs == 14).all(1).all()

    def test_subset(self, date_range_frame):
        """``subset`` restricts which columns must be non-NaN for a match."""
        N = 10
        df = date_range_frame.iloc[:N].copy()
        df.loc[4:8, "A"] = np.nan
        dates = date_range("1/1/1990", periods=N * 3, freq="25s")

        # with a subset of A should be the same
        result = df.asof(dates, subset="A")
        expected = df.asof(dates)
        tm.assert_frame_equal(result, expected)

        # same with A/B
        result = df.asof(dates, subset=["A", "B"])
        expected = df.asof(dates)
        tm.assert_frame_equal(result, expected)

        # B gives df.asof
        result = df.asof(dates, subset="B")
        expected = df.resample("25s", closed="right").ffill().reindex(dates)
        expected.iloc[20:] = 9

        tm.assert_frame_equal(result, expected)

    def test_missing(self, date_range_frame):
        """A ``where`` earlier than the first index value yields all-NaN."""
        # GH 15118
        # no match found - `where` value before earliest date in index
        N = 10
        df = date_range_frame.iloc[:N].copy()
        result = df.asof("1989-12-31")

        expected = Series(index=["A", "B"], name=Timestamp("1989-12-31"))
        tm.assert_series_equal(result, expected)

        result = df.asof(to_datetime(["1989-12-31"]))
        expected = DataFrame(
            index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64"
        )
        tm.assert_frame_equal(result, expected)

    def test_all_nans(self, date_range_frame):
        """An all-NaN frame returns all-NaN results for every input shape."""
        # GH 15713
        # DataFrame is all nans
        result = DataFrame([np.nan]).asof([0])
        expected = DataFrame([np.nan])
        tm.assert_frame_equal(result, expected)

        # testing non-default indexes, multiple inputs
        N = 150
        rng = date_range_frame.index
        dates = date_range("1/1/1990", periods=N, freq="25s")
        result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates)
        expected = DataFrame(np.nan, index=dates, columns=["A"])
        tm.assert_frame_equal(result, expected)

        # testing multiple columns
        dates = date_range("1/1/1990", periods=N, freq="25s")
        result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates)
        expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"])
        tm.assert_frame_equal(result, expected)

        # testing scalar input
        result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3])
        expected = DataFrame(np.nan, index=[3], columns=["A", "B"])
        tm.assert_frame_equal(result, expected)

        result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3)
        expected = Series(np.nan, index=["A", "B"], name=3)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "stamp,expected",
        [
            (
                Timestamp("2018-01-01 23:22:43.325+00:00"),
                Series(2.0, name=Timestamp("2018-01-01 23:22:43.325+00:00")),
            ),
            (
                Timestamp("2018-01-01 22:33:20.682+01:00"),
                Series(1.0, name=Timestamp("2018-01-01 22:33:20.682+01:00")),
            ),
        ],
    )
    def test_time_zone_aware_index(self, stamp, expected):
        """tz-aware lookups compare in absolute time, not wall-clock time."""
        # GH21194
        # Testing awareness of DataFrame index considering different
        # UTC and timezone
        df = DataFrame(
            data=[1, 2],
            index=[
                Timestamp("2018-01-01 21:00:05.001+00:00"),
                Timestamp("2018-01-01 22:35:10.550+00:00"),
            ],
        )
        result = df.asof(stamp)
        tm.assert_series_equal(result, expected)
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,639 @@
|
||||
from datetime import datetime, timedelta
|
||||
from io import StringIO
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
compat,
|
||||
date_range,
|
||||
option_context,
|
||||
)
|
||||
from pandas.core.arrays import IntervalArray, integer_array
|
||||
from pandas.core.internals import ObjectBlock
|
||||
from pandas.core.internals.blocks import IntBlock
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_almost_equal,
|
||||
assert_frame_equal,
|
||||
assert_series_equal,
|
||||
)
|
||||
|
||||
# Segregated collection of methods that require the BlockManager internal data
|
||||
# structure
|
||||
|
||||
|
||||
class TestDataFrameBlockInternals:
|
||||
def test_setitem_invalidates_datetime_index_freq(self):
|
||||
# GH#24096 altering a datetime64tz column inplace invalidates the
|
||||
# `freq` attribute on the underlying DatetimeIndex
|
||||
|
||||
dti = date_range("20130101", periods=3, tz="US/Eastern")
|
||||
ts = dti[1]
|
||||
|
||||
df = DataFrame({"B": dti})
|
||||
assert df["B"]._values.freq == "D"
|
||||
|
||||
df.iloc[1, 0] = pd.NaT
|
||||
assert df["B"]._values.freq is None
|
||||
|
||||
# check that the DatetimeIndex was not altered in place
|
||||
assert dti.freq == "D"
|
||||
assert dti[1] == ts
|
||||
|
||||
def test_cast_internals(self, float_frame):
    """Constructing from a BlockManager with a dtype matches per-series casting."""
    for target in (int, np.int32):
        from_manager = DataFrame(float_frame._data, dtype=target)
        from_series = DataFrame(float_frame._series, dtype=target)
        assert_frame_equal(from_manager, from_series)
|
||||
|
||||
def test_consolidate(self, float_frame):
    # adding a column then consolidating collapses everything to one block
    float_frame["E"] = 7.0
    consolidated = float_frame._consolidate()
    assert len(consolidated._data.blocks) == 1

    # consolidating an already-consolidated frame must still return a copy
    again = consolidated._consolidate()
    assert again is not consolidated
    tm.assert_frame_equal(again, consolidated)

    float_frame["F"] = 8.0
    assert len(float_frame._data.blocks) == 3

    float_frame._consolidate(inplace=True)
    assert len(float_frame._data.blocks) == 1
|
||||
|
||||
def test_consolidate_inplace(self, float_frame):
|
||||
frame = float_frame.copy() # noqa
|
||||
|
||||
# triggers in-place consolidation
|
||||
for letter in range(ord("A"), ord("Z")):
|
||||
float_frame[chr(letter)] = chr(letter)
|
||||
|
||||
def test_values_consolidate(self, float_frame):
    """Accessing .values must consolidate a fragmented frame as a side effect."""
    float_frame["E"] = 7.0
    assert not float_frame._data.is_consolidated()
    float_frame.values  # touched only for the consolidation side effect
    assert float_frame._data.is_consolidated()
|
||||
|
||||
def test_modify_values(self, float_frame):
    """Writes through .values must round-trip back into the frame."""
    float_frame.values[5] = 5
    assert (float_frame.values[5] == 5).all()

    # force the frame into an unconsolidated state, then write again
    float_frame["E"] = 7.0
    float_frame.values[6] = 6
    assert (float_frame.values[6] == 6).all()
|
||||
|
||||
def test_boolean_set_uncons(self, float_frame):
    """Boolean-mask assignment works on an unconsolidated frame."""
    float_frame["E"] = 7.0

    target = float_frame.values.copy()
    target[target > 1] = 2

    float_frame[float_frame > 1] = 2
    assert_almost_equal(target, float_frame.values)
|
||||
|
||||
def test_values_numeric_cols(self, float_frame):
|
||||
float_frame["foo"] = "bar"
|
||||
|
||||
values = float_frame[["A", "B", "C", "D"]].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
|
||||
|
||||
# mixed lcd
|
||||
values = mixed_float_frame[["A", "B", "C", "D"]].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
values = mixed_float_frame[["A", "B", "C"]].values
|
||||
assert values.dtype == np.float32
|
||||
|
||||
values = mixed_float_frame[["C"]].values
|
||||
assert values.dtype == np.float16
|
||||
|
||||
# GH 10364
|
||||
# B uint64 forces float because there are other signed int types
|
||||
values = mixed_int_frame[["A", "B", "C", "D"]].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
values = mixed_int_frame[["A", "D"]].values
|
||||
assert values.dtype == np.int64
|
||||
|
||||
# B uint64 forces float because there are other signed int types
|
||||
values = mixed_int_frame[["A", "B", "C"]].values
|
||||
assert values.dtype == np.float64
|
||||
|
||||
# as B and C are both unsigned, no forcing to float is needed
|
||||
values = mixed_int_frame[["B", "C"]].values
|
||||
assert values.dtype == np.uint64
|
||||
|
||||
values = mixed_int_frame[["A", "C"]].values
|
||||
assert values.dtype == np.int32
|
||||
|
||||
values = mixed_int_frame[["C", "D"]].values
|
||||
assert values.dtype == np.int64
|
||||
|
||||
values = mixed_int_frame[["A"]].values
|
||||
assert values.dtype == np.int32
|
||||
|
||||
values = mixed_int_frame[["C"]].values
|
||||
assert values.dtype == np.uint8
|
||||
|
||||
def test_constructor_with_convert(self):
|
||||
# this is actually mostly a test of lib.maybe_convert_objects
|
||||
# #2845
|
||||
df = DataFrame({"A": [2 ** 63 - 1]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([2 ** 63 - 1], np.int64), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [2 ** 63]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([2 ** 63], np.uint64), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [datetime(2005, 1, 1), True]})
|
||||
result = df["A"]
|
||||
expected = Series(
|
||||
np.asarray([datetime(2005, 1, 1), True], np.object_), name="A"
|
||||
)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [None, 1]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([np.nan, 1], np.float_), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [1.0, 2]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([1.0, 2], np.float_), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [1.0 + 2.0j, 3]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [1.0 + 2.0j, 3.0]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [1.0 + 2.0j, True]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [1.0, None]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([1.0, np.nan], np.float_), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [1.0 + 2.0j, None]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex_), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [2.0, 1, True, None]})
|
||||
result = df["A"]
|
||||
expected = Series(np.asarray([2.0, 1, True, None], np.object_), name="A")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame({"A": [2.0, 1, datetime(2006, 1, 1), None]})
|
||||
result = df["A"]
|
||||
expected = Series(
|
||||
np.asarray([2.0, 1, datetime(2006, 1, 1), None], np.object_), name="A"
|
||||
)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_construction_with_mixed(self, float_string_frame):
|
||||
# test construction edge cases with mixed types
|
||||
|
||||
# f7u12, this does not work without extensive workaround
|
||||
data = [
|
||||
[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
|
||||
[datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
|
||||
]
|
||||
df = DataFrame(data)
|
||||
|
||||
# check dtypes
|
||||
result = df.dtypes
|
||||
expected = Series({"datetime64[ns]": 3})
|
||||
|
||||
# mixed-type frames
|
||||
float_string_frame["datetime"] = datetime.now()
|
||||
float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
|
||||
assert float_string_frame["datetime"].dtype == "M8[ns]"
|
||||
assert float_string_frame["timedelta"].dtype == "m8[ns]"
|
||||
result = float_string_frame.dtypes
|
||||
expected = Series(
|
||||
[np.dtype("float64")] * 4
|
||||
+ [
|
||||
np.dtype("object"),
|
||||
np.dtype("datetime64[ns]"),
|
||||
np.dtype("timedelta64[ns]"),
|
||||
],
|
||||
index=list("ABCD") + ["foo", "datetime", "timedelta"],
|
||||
)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_construction_with_conversions(self):
|
||||
|
||||
# convert from a numpy array of non-ns timedelta64
|
||||
arr = np.array([1, 2, 3], dtype="timedelta64[s]")
|
||||
df = DataFrame(index=range(3))
|
||||
df["A"] = arr
|
||||
expected = DataFrame(
|
||||
{"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3)
|
||||
)
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"dt1": Timestamp("20130101"),
|
||||
"dt2": date_range("20130101", periods=3),
|
||||
# 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
|
||||
},
|
||||
index=range(3),
|
||||
)
|
||||
|
||||
df = DataFrame(index=range(3))
|
||||
df["dt1"] = np.datetime64("2013-01-01")
|
||||
df["dt2"] = np.array(
|
||||
["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]"
|
||||
)
|
||||
|
||||
# df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
|
||||
# 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
|
||||
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_constructor_compound_dtypes(self):
|
||||
# GH 5191
|
||||
# compound dtypes should raise not-implementederror
|
||||
|
||||
def f(dtype):
|
||||
data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9))
|
||||
return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype)
|
||||
|
||||
msg = "compound dtypes are not implemented in the DataFrame constructor"
|
||||
with pytest.raises(NotImplementedError, match=msg):
|
||||
f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])
|
||||
|
||||
# these work (though results may be unexpected)
|
||||
f("int64")
|
||||
f("float64")
|
||||
|
||||
# 10822
|
||||
# invalid error message on dt inference
|
||||
if not compat.is_platform_windows():
|
||||
f("M8[ns]")
|
||||
|
||||
def test_equals_different_blocks(self):
    # GH 9330: equality must not depend on the internal block layout
    left = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
    right = left.reset_index()[["A", "B", "C"]]
    # the reset_index round-trip above rearranges the blocks
    assert left._data.blocks[0].dtype != right._data.blocks[0].dtype

    # the real tests: value equality in both directions
    assert_frame_equal(left, right)
    assert left.equals(right)
    assert right.equals(left)
|
||||
|
||||
def test_copy_blocks(self, float_frame):
    # API/ENH 9607: as_blocks() with the default copy=True detaches data
    df = DataFrame(float_frame, copy=True)
    column = df.columns[0]

    # as_blocks has been deprecated since 0.21.0
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        blocks = df.as_blocks()
    for _dtype, block_df in blocks.items():
        if column in block_df:
            block_df.loc[:, column] = block_df[column] + 1

    # mutating the copy must not change the original DataFrame
    assert not block_df[column].equals(df[column])
|
||||
|
||||
def test_no_copy_blocks(self, float_frame):
    # API/ENH 9607: as_blocks(copy=False) shares data with the frame
    df = DataFrame(float_frame, copy=True)
    column = df.columns[0]

    # as_blocks has been deprecated since 0.21.0
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        blocks = df.as_blocks(copy=False)
    for _dtype, block_df in blocks.items():
        if column in block_df:
            block_df.loc[:, column] = block_df[column] + 1

    # with copy=False the mutation must be visible through the original
    assert block_df[column].equals(df[column])
|
||||
|
||||
def test_copy(self, float_frame, float_string_frame):
    """copy() detaches both the data and the internal block manager."""
    duplicate = float_frame.copy()
    duplicate["E"] = duplicate["A"]
    # the new column must not leak back into the source frame
    assert "E" not in float_frame

    # mixed-dtype frames get a fresh manager too
    clone = float_string_frame.copy()
    assert clone._data is not float_string_frame._data
|
||||
|
||||
def test_pickle(self, float_string_frame, timezone_frame):
|
||||
empty_frame = DataFrame()
|
||||
|
||||
unpickled = tm.round_trip_pickle(float_string_frame)
|
||||
assert_frame_equal(float_string_frame, unpickled)
|
||||
|
||||
# buglet
|
||||
float_string_frame._data.ndim
|
||||
|
||||
# empty
|
||||
unpickled = tm.round_trip_pickle(empty_frame)
|
||||
repr(unpickled)
|
||||
|
||||
# tz frame
|
||||
unpickled = tm.round_trip_pickle(timezone_frame)
|
||||
assert_frame_equal(timezone_frame, unpickled)
|
||||
|
||||
def test_consolidate_datetime64(self):
|
||||
# numpy vstack bug
|
||||
|
||||
data = """\
|
||||
starting,ending,measure
|
||||
2012-06-21 00:00,2012-06-23 07:00,77
|
||||
2012-06-23 07:00,2012-06-23 16:30,65
|
||||
2012-06-23 16:30,2012-06-25 08:00,77
|
||||
2012-06-25 08:00,2012-06-26 12:00,0
|
||||
2012-06-26 12:00,2012-06-27 08:00,77
|
||||
"""
|
||||
df = pd.read_csv(StringIO(data), parse_dates=[0, 1])
|
||||
|
||||
ser_starting = df.starting
|
||||
ser_starting.index = ser_starting.values
|
||||
ser_starting = ser_starting.tz_localize("US/Eastern")
|
||||
ser_starting = ser_starting.tz_convert("UTC")
|
||||
ser_starting.index.name = "starting"
|
||||
|
||||
ser_ending = df.ending
|
||||
ser_ending.index = ser_ending.values
|
||||
ser_ending = ser_ending.tz_localize("US/Eastern")
|
||||
ser_ending = ser_ending.tz_convert("UTC")
|
||||
ser_ending.index.name = "ending"
|
||||
|
||||
df.starting = ser_starting.index
|
||||
df.ending = ser_ending.index
|
||||
|
||||
tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index)
|
||||
tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
|
||||
|
||||
def test_is_mixed_type(self, float_frame, float_string_frame):
    """_is_mixed_type is True only when several dtypes are present."""
    assert float_string_frame._is_mixed_type
    assert not float_frame._is_mixed_type
|
||||
|
||||
def test_get_numeric_data(self):
|
||||
# TODO(wesm): unused?
|
||||
intname = np.dtype(np.int_).name # noqa
|
||||
floatname = np.dtype(np.float_).name # noqa
|
||||
|
||||
datetime64name = np.dtype("M8[ns]").name
|
||||
objectname = np.dtype(np.object_).name
|
||||
|
||||
df = DataFrame(
|
||||
{"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")},
|
||||
index=np.arange(10),
|
||||
)
|
||||
result = df.dtypes
|
||||
expected = Series(
|
||||
[
|
||||
np.dtype("float64"),
|
||||
np.dtype("int64"),
|
||||
np.dtype(objectname),
|
||||
np.dtype(datetime64name),
|
||||
],
|
||||
index=["a", "b", "c", "f"],
|
||||
)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": 1.0,
|
||||
"b": 2,
|
||||
"c": "foo",
|
||||
"d": np.array([1.0] * 10, dtype="float32"),
|
||||
"e": np.array([1] * 10, dtype="int32"),
|
||||
"f": np.array([1] * 10, dtype="int16"),
|
||||
"g": Timestamp("20010102"),
|
||||
},
|
||||
index=np.arange(10),
|
||||
)
|
||||
|
||||
result = df._get_numeric_data()
|
||||
expected = df.loc[:, ["a", "b", "d", "e", "f"]]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
only_obj = df.loc[:, ["c", "g"]]
|
||||
result = only_obj._get_numeric_data()
|
||||
expected = df.loc[:, []]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]})
|
||||
result = df._get_numeric_data()
|
||||
expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df = result.copy()
|
||||
result = df._get_numeric_data()
|
||||
expected = df
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_get_numeric_data_extension_dtype(self):
|
||||
# GH 22290
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": integer_array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"),
|
||||
"B": Categorical(list("abcabc")),
|
||||
"C": integer_array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"),
|
||||
"D": IntervalArray.from_breaks(range(7)),
|
||||
}
|
||||
)
|
||||
result = df._get_numeric_data()
|
||||
expected = df.loc[:, ["A", "C"]]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_convert_objects(self, float_string_frame):
|
||||
|
||||
oops = float_string_frame.T.T
|
||||
converted = oops._convert(datetime=True)
|
||||
assert_frame_equal(converted, float_string_frame)
|
||||
assert converted["A"].dtype == np.float64
|
||||
|
||||
# force numeric conversion
|
||||
float_string_frame["H"] = "1."
|
||||
float_string_frame["I"] = "1"
|
||||
|
||||
# add in some items that will be nan
|
||||
length = len(float_string_frame)
|
||||
float_string_frame["J"] = "1."
|
||||
float_string_frame["K"] = "1"
|
||||
float_string_frame.loc[0:5, ["J", "K"]] = "garbled"
|
||||
converted = float_string_frame._convert(datetime=True, numeric=True)
|
||||
assert converted["H"].dtype == "float64"
|
||||
assert converted["I"].dtype == "int64"
|
||||
assert converted["J"].dtype == "float64"
|
||||
assert converted["K"].dtype == "float64"
|
||||
assert len(converted["J"].dropna()) == length - 5
|
||||
assert len(converted["K"].dropna()) == length - 5
|
||||
|
||||
# via astype
|
||||
converted = float_string_frame.copy()
|
||||
converted["H"] = converted["H"].astype("float64")
|
||||
converted["I"] = converted["I"].astype("int64")
|
||||
assert converted["H"].dtype == "float64"
|
||||
assert converted["I"].dtype == "int64"
|
||||
|
||||
# via astype, but errors
|
||||
converted = float_string_frame.copy()
|
||||
with pytest.raises(ValueError, match="invalid literal"):
|
||||
converted["H"].astype("int32")
|
||||
|
||||
# mixed in a single column
|
||||
df = DataFrame(dict(s=Series([1, "na", 3, 4])))
|
||||
result = df._convert(datetime=True, numeric=True)
|
||||
expected = DataFrame(dict(s=Series([1, np.nan, 3, 4])))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_convert_objects_no_conversion(self):
    """_convert leaves an already well-typed frame unchanged."""
    original = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]})
    converted = original._convert(datetime=True)
    assert_frame_equal(original, converted)
|
||||
|
||||
def test_infer_objects(self):
|
||||
# GH 11221
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": ["a", 1, 2, 3],
|
||||
"b": ["b", 2.0, 3.0, 4.1],
|
||||
"c": [
|
||||
"c",
|
||||
datetime(2016, 1, 1),
|
||||
datetime(2016, 1, 2),
|
||||
datetime(2016, 1, 3),
|
||||
],
|
||||
"d": [1, 2, 3, "d"],
|
||||
},
|
||||
columns=["a", "b", "c", "d"],
|
||||
)
|
||||
df = df.iloc[1:].infer_objects()
|
||||
|
||||
assert df["a"].dtype == "int64"
|
||||
assert df["b"].dtype == "float64"
|
||||
assert df["c"].dtype == "M8[ns]"
|
||||
assert df["d"].dtype == "object"
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [1, 2, 3],
|
||||
"b": [2.0, 3.0, 4.1],
|
||||
"c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)],
|
||||
"d": [2, 3, "d"],
|
||||
},
|
||||
columns=["a", "b", "c", "d"],
|
||||
)
|
||||
# reconstruct frame to verify inference is same
|
||||
tm.assert_frame_equal(df.reset_index(drop=True), expected)
|
||||
|
||||
def test_stale_cached_series_bug_473(self):
    """Cached column views must not go stale after astype/assignment (GH 473).

    Fixes: ``np.NaN`` alias (removed in NumPy 2.0) replaced with
    ``np.nan``; dead ``result``/``exp`` bindings dropped (the calls are
    kept for their cache-exercising side effects).
    """
    # this is chained assignment, but deliberately so for this test
    with option_context("chained_assignment", None):
        Y = DataFrame(
            np.random.random((4, 4)),
            index=("a", "b", "c", "d"),
            columns=("e", "f", "g", "h"),
        )
        repr(Y)
        Y["e"] = Y["e"].astype("object")
        Y["g"]["c"] = np.nan
        repr(Y)
        # exercise the cached-series paths before the final check
        Y.sum()
        Y["g"].sum()
        assert pd.isna(Y["g"]["c"])
|
||||
|
||||
def test_get_X_columns(self):
|
||||
# numeric and object columns
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [1, 2, 3],
|
||||
"b": [True, False, True],
|
||||
"c": ["foo", "bar", "baz"],
|
||||
"d": [None, None, None],
|
||||
"e": [3.14, 0.577, 2.773],
|
||||
}
|
||||
)
|
||||
|
||||
tm.assert_index_equal(df._get_numeric_data().columns, pd.Index(["a", "b", "e"]))
|
||||
|
||||
def test_strange_column_corruption_issue(self):
    # (wesm) Unclear how exactly this is related to internal matters
    df = DataFrame(index=[0, 1])
    df[0] = np.nan
    seen = set()
    # note: pre-populating columns 100..199 with NaN up front makes the
    # results match; interleaving creation and assignment is the point

    for i, dt in enumerate(df.index):
        for col in range(100, 200):
            if col not in seen:
                seen.add(col)
                df[col] = np.nan
            df[col][dt] = i

    myid = 100

    first = len(df.loc[pd.isna(df[myid]), [myid]])
    second = len(df.loc[pd.isna(df[myid]), [myid]])
    assert first == second == 0
|
||||
|
||||
def test_constructor_no_pandas_array(self):
    # Ensure that PandasArray isn't allowed inside Series
    # See https://github.com/pandas-dev/pandas/issues/23995 for more.
    arr = pd.Series([1, 2, 3]).array
    result = pd.DataFrame({"A": arr})
    tm.assert_frame_equal(result, pd.DataFrame({"A": [1, 2, 3]}))
    # stored as a plain integer block, not object
    assert isinstance(result._data.blocks[0], IntBlock)
|
||||
|
||||
def test_add_column_with_pandas_array(self):
    # GH 26390: assigning pd.array(...) must match constructing with it
    via_setitem = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
    via_setitem["c"] = pd.array([1, 2, None, 3])
    via_constructor = pd.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": ["a", "b", "c", "d"],
            "c": pd.array([1, 2, None, 3]),
        }
    )
    # both paths store the nullable-int column in an object block
    assert type(via_setitem["c"]._data.blocks[0]) == ObjectBlock
    assert type(via_constructor["c"]._data.blocks[0]) == ObjectBlock
    assert_frame_equal(via_setitem, via_constructor)
|
||||
@@ -0,0 +1,952 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, Series, Timestamp, date_range
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameConcatCommon:
|
||||
def test_concat_multiple_frames_dtypes(self):
|
||||
|
||||
# GH 2759
|
||||
A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64)
|
||||
B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
|
||||
results = pd.concat((A, B), axis=1).dtypes
|
||||
expected = Series(
|
||||
[np.dtype("float64")] * 2 + [np.dtype("float32")] * 2,
|
||||
index=["foo", "bar", 0, 1],
|
||||
)
|
||||
assert_series_equal(results, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
pd.date_range("2000", periods=4),
|
||||
pd.date_range("2000", periods=4, tz="US/Central"),
|
||||
pd.period_range("2000", periods=4),
|
||||
pd.timedelta_range(0, periods=4),
|
||||
],
|
||||
)
|
||||
def test_combine_datetlike_udf(self, data):
|
||||
# https://github.com/pandas-dev/pandas/issues/23079
|
||||
df = pd.DataFrame({"A": data})
|
||||
other = df.copy()
|
||||
df.iloc[1, 0] = None
|
||||
|
||||
def combiner(a, b):
|
||||
return b
|
||||
|
||||
result = df.combine(other, combiner)
|
||||
tm.assert_frame_equal(result, other)
|
||||
|
||||
def test_concat_multiple_tzs(self):
    # GH 12467
    # combining naive with tz-aware frames falls back to object dtype
    naive = Timestamp("2015-01-01", tz=None)
    utc = Timestamp("2015-01-01", tz="UTC")
    est = Timestamp("2015-01-01", tz="EST")

    cases = [
        (naive, utc, object),  # naive + aware -> object
        (naive, est, object),  # naive + aware -> object
        (utc, est, None),      # expected dtype inferred by the constructor
    ]
    for first, second, dtype in cases:
        frames = [DataFrame(dict(time=[first])), DataFrame(dict(time=[second]))]
        result = pd.concat(frames).reset_index(drop=True)
        kwargs = {} if dtype is None else {"dtype": dtype}
        expected = DataFrame(dict(time=[first, second]), **kwargs)
        assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "t1",
    [
        "2015-01-01",
        pytest.param(
            pd.NaT,
            marks=pytest.mark.xfail(
                reason="GH23037 incorrect dtype when concatenating"
            ),
        ),
    ],
)
def test_concat_tz_NaT(self, t1):
    # GH 22796
    # concat of tz-aware multi-column frames pads missing cells with NaT
    ts1 = Timestamp(t1, tz="UTC")
    ts2 = Timestamp("2015-01-01", tz="UTC")
    ts3 = Timestamp("2015-01-01", tz="UTC")

    wide = DataFrame([[ts1, ts2]])
    narrow = DataFrame([[ts3]])

    result = pd.concat([wide, narrow])
    expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0])
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_tz_not_aligned(self):
|
||||
# GH 22796
|
||||
ts = pd.to_datetime([1, 2]).tz_localize("UTC")
|
||||
a = pd.DataFrame({"A": ts})
|
||||
b = pd.DataFrame({"A": ts, "B": ts})
|
||||
result = pd.concat([a, b], sort=True, ignore_index=True)
|
||||
expected = pd.DataFrame(
|
||||
{"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)}
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_tuple_keys(self):
|
||||
# GH 14438
|
||||
df1 = pd.DataFrame(np.ones((2, 2)), columns=list("AB"))
|
||||
df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list("AB"))
|
||||
results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")])
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"A": {
|
||||
("bee", "bah", 0): 1.0,
|
||||
("bee", "bah", 1): 1.0,
|
||||
("bee", "boo", 0): 2.0,
|
||||
("bee", "boo", 1): 2.0,
|
||||
("bee", "boo", 2): 2.0,
|
||||
},
|
||||
"B": {
|
||||
("bee", "bah", 0): 1.0,
|
||||
("bee", "bah", 1): 1.0,
|
||||
("bee", "boo", 0): 2.0,
|
||||
("bee", "boo", 1): 2.0,
|
||||
("bee", "boo", 2): 2.0,
|
||||
},
|
||||
}
|
||||
)
|
||||
assert_frame_equal(results, expected)
|
||||
|
||||
def test_append_series_dict(self):
|
||||
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])
|
||||
|
||||
series = df.loc[4]
|
||||
msg = "Indexes have overlapping values"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.append(series, verify_integrity=True)
|
||||
|
||||
series.name = None
|
||||
msg = "Can only append a Series if ignore_index=True"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.append(series, verify_integrity=True)
|
||||
|
||||
result = df.append(series[::-1], ignore_index=True)
|
||||
expected = df.append(
|
||||
DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# dict
|
||||
result = df.append(series.to_dict(), ignore_index=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = df.append(series[::-1][:3], ignore_index=True)
|
||||
expected = df.append(
|
||||
DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True
|
||||
)
|
||||
assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
# can append when name set
|
||||
row = df.loc[4]
|
||||
row.name = 5
|
||||
result = df.append(row)
|
||||
expected = df.append(df[-1:], ignore_index=True)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_append_list_of_series_dicts(self):
    """Appending a list of row dicts behaves like appending a frame."""
    df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])

    as_dicts = [row.to_dict() for _, row in df.iterrows()]
    assert_frame_equal(
        df.append(as_dicts, ignore_index=True),
        df.append(df, ignore_index=True),
    )

    # dicts carrying a column the frame does not have
    extra_col_rows = [
        {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4},
        {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8},
    ]
    assert_frame_equal(
        df.append(extra_col_rows, ignore_index=True, sort=True),
        df.append(DataFrame(extra_col_rows), ignore_index=True, sort=True),
    )
|
||||
|
||||
def test_append_missing_cols(self):
    # GH22252
    # appending a list of dicts that lack some of the target frame's
    # columns must take the list branch without emitting any warning
    df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])

    partial_rows = [{"foo": 9}, {"bar": 10}]
    with tm.assert_produces_warning(None):
        result = df.append(partial_rows, ignore_index=True, sort=True)

    expected = df.append(DataFrame(partial_rows), ignore_index=True, sort=True)
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_append_empty_dataframe(self):
    """Appending an empty frame is a no-op that returns a copy."""
    starters = [
        DataFrame(),                                 # empty + empty
        DataFrame(np.random.randn(5, 2)),            # data + empty
        DataFrame(columns=["bar", "foo"]),           # columns only + empty
        DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]),
    ]
    for df1 in starters:
        result = df1.append(DataFrame())
        assert_frame_equal(result, df1.copy())
|
||||
|
||||
def test_append_dtypes(self):
|
||||
|
||||
# GH 5754
|
||||
# row appends of different dtypes (so need to do by-item)
|
||||
# can sometimes infer the correct type
|
||||
|
||||
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5))
|
||||
df2 = DataFrame()
|
||||
result = df1.append(df2)
|
||||
expected = df1.copy()
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
|
||||
df2 = DataFrame({"bar": "foo"}, index=range(1, 2))
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
|
||||
df2 = DataFrame({"bar": np.nan}, index=range(1, 2))
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame(
|
||||
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
|
||||
df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object)
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame(
|
||||
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({"bar": np.nan}, index=range(1))
|
||||
df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2))
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame(
|
||||
{"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
|
||||
df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object)
|
||||
result = df1.append(df2)
|
||||
expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_update(self):
|
||||
df = DataFrame(
|
||||
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
|
||||
)
|
||||
|
||||
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
|
||||
)
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_dtypes(self):
|
||||
|
||||
# gh 3016
|
||||
df = DataFrame(
|
||||
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
|
||||
columns=["A", "B", "bool1", "bool2"],
|
||||
)
|
||||
|
||||
other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
|
||||
df.update(other)
|
||||
|
||||
expected = DataFrame(
|
||||
[[45.0, 45.0, False, True], [4.0, 5.0, True, False]],
|
||||
columns=["A", "B", "bool1", "bool2"],
|
||||
)
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_nooverwrite(self):
|
||||
df = DataFrame(
|
||||
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
|
||||
)
|
||||
|
||||
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other, overwrite=False)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]]
|
||||
)
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_filtered(self):
|
||||
df = DataFrame(
|
||||
[[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
|
||||
)
|
||||
|
||||
other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3])
|
||||
|
||||
df.update(other, filter_func=lambda x: x > 2)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]]
|
||||
)
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "bad_kwarg, exception, msg",
    [
        # errors must be 'ignore' or 'raise'
        ({"errors": "something"}, ValueError, "The parameter errors must.*"),
        ({"join": "inner"}, NotImplementedError, "Only left join is supported"),
    ],
)
def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
    """Invalid `errors` values and unsupported `join` modes must raise."""
    frame = DataFrame([[1.5, 1, 3.0]])
    with pytest.raises(exception, match=msg):
        frame.update(frame, **bad_kwarg)
|
||||
|
||||
def test_update_raise_on_overlap(self):
    """errors='raise' must reject a patch whose non-NaN cells overlap the caller's."""
    frame = DataFrame(
        [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]]
    )
    # cell (3, 2) holds a value in both frames -> overlap
    patch = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2])
    with pytest.raises(ValueError, match="Data overlaps"):
        frame.update(patch, errors="raise")
|
||||
|
||||
@pytest.mark.parametrize("raise_conflict", [True, False])
|
||||
def test_update_deprecation(self, raise_conflict):
# The legacy `raise_conflict` keyword was deprecated in favor of `errors`;
# passing it (with either value) must emit a FutureWarning.
|
||||
df = DataFrame([[1.5, 1, 3.0]])
|
||||
other = DataFrame()
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
df.update(other, raise_conflict=raise_conflict)
|
||||
|
||||
def test_update_from_non_df(self):
|
||||
d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])}
|
||||
df = DataFrame(d)
|
||||
|
||||
d["a"] = Series([5, 6, 7, 8])
|
||||
df.update(d)
|
||||
|
||||
expected = DataFrame(d)
|
||||
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}
|
||||
df = DataFrame(d)
|
||||
|
||||
d["a"] = [5, 6, 7, 8]
|
||||
df.update(d)
|
||||
|
||||
expected = DataFrame(d)
|
||||
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_update_datetime_tz(self):
|
||||
# GH 25807
|
||||
result = DataFrame([pd.Timestamp("2019", tz="UTC")])
|
||||
result.update(result)
|
||||
expected = DataFrame([pd.Timestamp("2019", tz="UTC")])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_join_str_datetime(self):
|
||||
str_dates = ["20120209", "20120222"]
|
||||
dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
|
||||
|
||||
A = DataFrame(str_dates, index=range(2), columns=["aa"])
|
||||
C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
|
||||
|
||||
tst = A.join(C, on="aa")
|
||||
|
||||
assert len(tst.columns) == 3
|
||||
|
||||
def test_join_multiindex_leftright(self):
|
||||
# GH 10741
|
||||
df1 = pd.DataFrame(
|
||||
[
|
||||
["a", "x", 0.471780],
|
||||
["a", "y", 0.774908],
|
||||
["a", "z", 0.563634],
|
||||
["b", "x", -0.353756],
|
||||
["b", "y", 0.368062],
|
||||
["b", "z", -1.721840],
|
||||
["c", "x", 1],
|
||||
["c", "y", 2],
|
||||
["c", "z", 3],
|
||||
],
|
||||
columns=["first", "second", "value1"],
|
||||
).set_index(["first", "second"])
|
||||
|
||||
df2 = pd.DataFrame(
|
||||
[["a", 10], ["b", 20]], columns=["first", "value2"]
|
||||
).set_index(["first"])
|
||||
|
||||
exp = pd.DataFrame(
|
||||
[
|
||||
[0.471780, 10],
|
||||
[0.774908, 10],
|
||||
[0.563634, 10],
|
||||
[-0.353756, 20],
|
||||
[0.368062, 20],
|
||||
[-1.721840, 20],
|
||||
[1.000000, np.nan],
|
||||
[2.000000, np.nan],
|
||||
[3.000000, np.nan],
|
||||
],
|
||||
index=df1.index,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
# these must be the same results (but columns are flipped)
|
||||
assert_frame_equal(df1.join(df2, how="left"), exp)
|
||||
assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]])
|
||||
|
||||
exp_idx = pd.MultiIndex.from_product(
|
||||
[["a", "b"], ["x", "y", "z"]], names=["first", "second"]
|
||||
)
|
||||
exp = pd.DataFrame(
|
||||
[
|
||||
[0.471780, 10],
|
||||
[0.774908, 10],
|
||||
[0.563634, 10],
|
||||
[-0.353756, 20],
|
||||
[0.368062, 20],
|
||||
[-1.721840, 20],
|
||||
],
|
||||
index=exp_idx,
|
||||
columns=["value1", "value2"],
|
||||
)
|
||||
|
||||
assert_frame_equal(df1.join(df2, how="right"), exp)
|
||||
assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]])
|
||||
|
||||
def test_concat_named_keys(self):
|
||||
# GH 14252
|
||||
df = pd.DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]})
|
||||
index = Index(["a", "b"], name="baz")
|
||||
concatted_named_from_keys = pd.concat([df, df], keys=index)
|
||||
expected_named = pd.DataFrame(
|
||||
{"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
|
||||
index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]),
|
||||
)
|
||||
assert_frame_equal(concatted_named_from_keys, expected_named)
|
||||
|
||||
index_no_name = Index(["a", "b"], name=None)
|
||||
concatted_named_from_names = pd.concat(
|
||||
[df, df], keys=index_no_name, names=["baz"]
|
||||
)
|
||||
assert_frame_equal(concatted_named_from_names, expected_named)
|
||||
|
||||
concatted_unnamed = pd.concat([df, df], keys=index_no_name)
|
||||
expected_unnamed = pd.DataFrame(
|
||||
{"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]},
|
||||
index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]),
|
||||
)
|
||||
assert_frame_equal(concatted_unnamed, expected_unnamed)
|
||||
|
||||
def test_concat_axis_parameter(self):
    """GH 14369: string axis aliases behave exactly like their numeric forms."""
    left = pd.DataFrame({"A": [0.1, 0.2]}, index=range(2))
    right = pd.DataFrame({"A": [0.3, 0.4]}, index=range(2))

    # frames, row-wise: "index" == "rows" == 0
    exp_rows = pd.DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
    assert_frame_equal(pd.concat([left, right], axis="index"), exp_rows)
    assert_frame_equal(pd.concat([left, right], axis="rows"), exp_rows)
    assert_frame_equal(pd.concat([left, right], axis=0), exp_rows)

    # frames, column-wise: "columns" == 1
    exp_cols = pd.DataFrame(
        [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"]
    )
    assert_frame_equal(pd.concat([left, right], axis="columns"), exp_cols)
    assert_frame_equal(pd.concat([left, right], axis=1), exp_cols)

    ser_a = pd.Series([0.1, 0.2])
    ser_b = pd.Series([0.3, 0.4])

    # series, row-wise
    exp_ser_rows = pd.Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
    assert_series_equal(pd.concat([ser_a, ser_b], axis="index"), exp_ser_rows)
    assert_series_equal(pd.concat([ser_a, ser_b], axis="rows"), exp_ser_rows)
    assert_series_equal(pd.concat([ser_a, ser_b], axis=0), exp_ser_rows)

    # series, column-wise produces a frame
    exp_ser_cols = pd.DataFrame(
        [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]
    )
    assert_frame_equal(pd.concat([ser_a, ser_b], axis="columns"), exp_ser_cols)
    assert_frame_equal(pd.concat([ser_a, ser_b], axis=1), exp_ser_cols)

    # unknown aliases are rejected
    with pytest.raises(ValueError, match="No axis named"):
        pd.concat([ser_a, ser_b], axis="something")
|
||||
|
||||
def test_concat_numerical_names(self):
|
||||
# #15262 # #12223
|
||||
df = pd.DataFrame(
|
||||
{"col": range(9)},
|
||||
dtype="int32",
|
||||
index=(
|
||||
pd.MultiIndex.from_product(
|
||||
[["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2]
|
||||
)
|
||||
),
|
||||
)
|
||||
result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
|
||||
expected = pd.DataFrame(
|
||||
{"col": [0, 1, 7, 8]},
|
||||
dtype="int32",
|
||||
index=pd.MultiIndex.from_tuples(
|
||||
[("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_concat_astype_dup_col(self):
|
||||
# gh 23049
|
||||
df = pd.DataFrame([{"a": "b"}])
|
||||
df = pd.concat([df, df], axis=1)
|
||||
|
||||
result = df.astype("category")
|
||||
expected = pd.DataFrame(
|
||||
np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"]
|
||||
).astype("category")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestDataFrameCombineFirst:
|
||||
def test_combine_first_mixed(self):
|
||||
a = Series(["a", "b"], index=range(2))
|
||||
b = Series(range(2), index=range(2))
|
||||
f = DataFrame({"A": a, "B": b})
|
||||
|
||||
a = Series(["a", "b"], index=range(5, 7))
|
||||
b = Series(range(2), index=range(5, 7))
|
||||
g = DataFrame({"A": a, "B": b})
|
||||
|
||||
exp = pd.DataFrame(
|
||||
{"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6]
|
||||
)
|
||||
combined = f.combine_first(g)
|
||||
tm.assert_frame_equal(combined, exp)
|
||||
|
||||
def test_combine_first(self, float_frame):
    """combine_first over disjoint, identical, overlapping and empty frames."""
    # disjoint halves reassemble the original (reindexed) frame
    top, bottom = float_frame[:5], float_frame[5:]

    combined = top.combine_first(bottom)
    reordered = float_frame.reindex(combined.index)
    assert_frame_equal(combined, reordered)
    assert tm.equalContents(combined.columns, float_frame.columns)
    assert_series_equal(combined["A"], reordered["A"])

    # same index: caller's non-NaN values win, missing columns come from other
    left = float_frame.copy()
    left["A"] = 1
    del left["C"]

    right = float_frame.copy()
    right["B"] = 0
    del right["D"]

    combined = left.combine_first(right)

    assert (combined["A"] == 1).all()
    assert_series_equal(combined["B"], left["B"])
    assert_series_equal(combined["C"], right["C"])
    assert_series_equal(combined["D"], left["D"])

    # overlap: the caller's values take precedence
    top, bottom = reordered[:10].copy(), reordered
    top["A"] = 1

    combined = top.combine_first(bottom)
    assert (combined["A"][:10] == 1).all()

    # reverse overlap
    bottom["A"][:10] = 0
    combined = bottom.combine_first(top)
    assert (combined["A"][:10] == 0).all()

    # no overlap at all
    f = float_frame[:10]
    g = float_frame[10:]
    combined = f.combine_first(g)
    assert_series_equal(combined["A"].reindex(f.index), f["A"])
    assert_series_equal(combined["A"].reindex(g.index), g["A"])

    # corner cases: empty frames on either side are no-ops
    comb = float_frame.combine_first(DataFrame())
    assert_frame_equal(comb, float_frame)

    comb = DataFrame().combine_first(float_frame)
    assert_frame_equal(comb, float_frame)

    comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
    assert "faz" in comb.index

    # #2525: an all-empty column still contributes its label
    df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
    df2 = DataFrame(columns=["b"])
    result = df.combine_first(df2)
    assert "b" in result
|
||||
|
||||
def test_combine_first_mixed_bug(self):
    """combine_first across mixed dtypes: strings, bools and datetime64."""
    idx = Index(["a", "b", "c", "e"])
    frame1 = DataFrame(
        {
            "col0": Series([5.0, -9.0, 4.0, 100.0], index=idx),
            "col2": Series(["a", "b", "c", "e"], index=idx),
            "col3": Series([12, 4, 5, 97], index=idx),
        }
    )

    idx = Index(["a", "b", "c", "f"])
    frame2 = DataFrame(
        {
            "col1": Series([5.0, -9.0, 4.0, 100.0], index=idx),
            "col2": Series(["a", "b", "c", "f"], index=idx),
            "col5": Series([12, 4, 5, 97], index=idx),
        }
    )

    # union of the column sets: col0, col1, col2, col3, col5
    combined = frame1.combine_first(frame2)
    assert len(combined.columns) == 5

    # gh 3016 (same as in update)
    df = DataFrame(
        [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
        columns=["A", "B", "bool1", "bool2"],
    )

    other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])
    result = df.combine_first(other)
    assert_frame_equal(result, df)

    df.loc[0, "A"] = np.nan
    result = df.combine_first(other)
    df.loc[0, "A"] = 45
    assert_frame_equal(result, df)

    # doc example
    df1 = DataFrame(
        {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
    )
    df2 = DataFrame(
        {
            "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
            "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
        }
    )

    result = df1.combine_first(df2)
    expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
    assert_frame_equal(result, expected)

    # GH3552, return object dtype with bools
    df1 = DataFrame(
        [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
    )
    df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])

    result = df1.combine_first(df2)[2]
    expected = Series([True, True, False], name=2)
    assert_series_equal(result, expected)

    # GH 3593, converting datetime64[ns] incorrectly
    df0 = DataFrame(
        {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
    )
    df1 = DataFrame({"a": [None, None, None]})
    df2 = df1.combine_first(df0)
    assert_frame_equal(df2, df0)

    df2 = df0.combine_first(df1)
    assert_frame_equal(df2, df0)

    df0 = DataFrame(
        {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
    )
    df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
    df2 = df1.combine_first(df0)
    result = df0.copy()
    result.iloc[0, :] = df1.iloc[0, :]
    assert_frame_equal(df2, result)

    df2 = df0.combine_first(df1)
    assert_frame_equal(df2, df0)
|
||||
|
||||
def test_combine_first_align_nan(self):
    """GH 7509 (not fixed): dtypes degrade when alignment introduces NaN."""
    base = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
    patch = pd.DataFrame([[4], [5]], columns=["b"])
    assert base["a"].dtype == "datetime64[ns]"
    assert base["b"].dtype == "int64"

    out = base.combine_first(patch)
    exp = pd.DataFrame(
        {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]},
        columns=["a", "b"],
    )
    tm.assert_frame_equal(out, exp)
    assert out["a"].dtype == "datetime64[ns]"
    # ToDo: this must be int64
    assert out["b"].dtype == "float64"

    # empty caller: everything comes from the patch side
    out = base.iloc[:0].combine_first(patch)
    exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
    tm.assert_frame_equal(out, exp)
    # ToDo: this must be datetime64
    assert out["a"].dtype == "float64"
    # ToDo: this must be int64
    assert out["b"].dtype == "int64"
|
||||
|
||||
def test_combine_first_timezone(self):
    """tz-aware datetime handling in combine_first (gh-7630, gh-10567)."""
    stamp1 = pd.to_datetime("20100101 01:01").tz_localize("UTC")
    left = pd.DataFrame(
        columns=["UTCdatetime", "abc"],
        data=stamp1,
        index=pd.date_range("20140627", periods=1),
    )
    stamp2 = pd.to_datetime("20121212 12:12").tz_localize("UTC")
    right = pd.DataFrame(
        columns=["UTCdatetime", "xyz"],
        data=stamp2,
        index=pd.date_range("20140628", periods=1),
    )

    out = right[["UTCdatetime"]].combine_first(left)
    exp = pd.DataFrame(
        {
            "UTCdatetime": [
                pd.Timestamp("2010-01-01 01:01", tz="UTC"),
                pd.Timestamp("2012-12-12 12:12", tz="UTC"),
            ],
            "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
        },
        columns=["UTCdatetime", "abc"],
        index=pd.date_range("20140627", periods=2, freq="D"),
    )
    tm.assert_frame_equal(out, exp)
    assert out["UTCdatetime"].dtype == "datetime64[ns, UTC]"
    assert out["abc"].dtype == "datetime64[ns, UTC]"

    # see gh-10567: caller without NaN keeps its tz dtype
    left = pd.DataFrame({"DATE": pd.date_range("2015-01-01", "2015-01-05", tz="UTC")})
    right = pd.DataFrame({"DATE": pd.date_range("2015-01-03", "2015-01-05", tz="UTC")})

    out = left.combine_first(right)
    tm.assert_frame_equal(out, left)
    assert out["DATE"].dtype == "datetime64[ns, UTC]"

    # same tz, interleaved indexes with NaT
    left = pd.DataFrame(
        {
            "DATE": pd.DatetimeIndex(
                ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
            )
        },
        index=[1, 3, 5, 7],
    )
    right = pd.DataFrame(
        {
            "DATE": pd.DatetimeIndex(
                ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
            )
        },
        index=[2, 4, 5],
    )

    out = left.combine_first(right)
    exp_dts = pd.DatetimeIndex(
        [
            "2011-01-01",
            "2012-01-01",
            "NaT",
            "2012-01-02",
            "2011-01-03",
            "2011-01-04",
        ],
        tz="US/Eastern",
    )
    exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
    tm.assert_frame_equal(out, exp)

    # different tz, no fill needed: caller dtype preserved
    left = pd.DataFrame(
        {"DATE": pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")}
    )
    right = pd.DataFrame({"DATE": pd.date_range("2015-01-03", "2015-01-05")})

    # if df1 doesn't have NaN, keep its dtype
    out = left.combine_first(right)
    tm.assert_frame_equal(out, left)
    assert out["DATE"].dtype == "datetime64[ns, US/Eastern]"

    # different tz with fill needed: falls back to object dtype
    left = pd.DataFrame(
        {"DATE": pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern")}
    )
    right = pd.DataFrame({"DATE": pd.date_range("2015-01-01", "2015-01-03")})

    out = left.combine_first(right)
    exp = pd.DataFrame(
        {
            "DATE": [
                pd.Timestamp("2015-01-01", tz="US/Eastern"),
                pd.Timestamp("2015-01-02", tz="US/Eastern"),
                pd.Timestamp("2015-01-03"),
            ]
        }
    )
    tm.assert_frame_equal(out, exp)
    assert out["DATE"].dtype == "object"
|
||||
|
||||
def test_combine_first_timedelta(self):
|
||||
data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
|
||||
df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
|
||||
df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.TimedeltaIndex(
|
||||
["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
|
||||
)
|
||||
exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["TD"].dtype == "timedelta64[ns]"
|
||||
|
||||
def test_combine_first_period(self):
|
||||
data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
|
||||
df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7])
|
||||
data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
|
||||
df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = pd.PeriodIndex(
|
||||
["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
|
||||
)
|
||||
exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["P"].dtype == data1.dtype
|
||||
|
||||
# different freq
|
||||
dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
|
||||
df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5])
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp_dts = [
|
||||
pd.Period("2011-01", freq="M"),
|
||||
pd.Period("2012-01-01", freq="D"),
|
||||
pd.NaT,
|
||||
pd.Period("2012-01-02", freq="D"),
|
||||
pd.Period("2011-03", freq="M"),
|
||||
pd.Period("2011-04", freq="M"),
|
||||
]
|
||||
exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
assert res["P"].dtype == "object"
|
||||
|
||||
def test_combine_first_int(self):
|
||||
# GH14687 - integer series that do no align exactly
|
||||
|
||||
df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
|
||||
df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64")
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(res, df1)
|
||||
assert res["a"].dtype == "int64"
|
||||
|
||||
@pytest.mark.parametrize("val", [1, 1.0])
|
||||
def test_combine_first_with_asymmetric_other(self, val):
|
||||
# see gh-20699
|
||||
df1 = pd.DataFrame({"isNum": [val]})
|
||||
df2 = pd.DataFrame({"isBool": [True]})
|
||||
|
||||
res = df1.combine_first(df2)
|
||||
exp = pd.DataFrame({"isBool": [True], "isNum": [val]})
|
||||
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_concat_datetime_datetime64_frame(self):
|
||||
# #2624
|
||||
rows = []
|
||||
rows.append([datetime(2010, 1, 1), 1])
|
||||
rows.append([datetime(2010, 1, 2), "hi"])
|
||||
|
||||
df2_obj = DataFrame.from_records(rows, columns=["date", "test"])
|
||||
|
||||
ind = date_range(start="2000/1/1", freq="D", periods=10)
|
||||
df1 = DataFrame({"date": ind, "test": range(10)})
|
||||
|
||||
# it works!
|
||||
pd.concat([df1, df2_obj])
|
||||
|
||||
|
||||
class TestDataFrameUpdate:
|
||||
def test_update_nan(self):
|
||||
# #15593 #15617
|
||||
# test 1
|
||||
df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
|
||||
df2 = DataFrame({"A": [None, 2, 3]})
|
||||
expected = df1.copy()
|
||||
df1.update(df2, overwrite=False)
|
||||
|
||||
tm.assert_frame_equal(df1, expected)
|
||||
|
||||
# test 2
|
||||
df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)})
|
||||
df2 = DataFrame({"A": [None, 2, 3]})
|
||||
expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)})
|
||||
df1.update(df2, overwrite=False)
|
||||
|
||||
tm.assert_frame_equal(df1, expected)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,615 @@
|
||||
from collections import OrderedDict, abc, defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
from pandas import (
|
||||
CategoricalDtype,
|
||||
DataFrame,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestDataFrameConvertTo(TestData):
|
||||
def test_to_dict_timestamp(self):
|
||||
|
||||
# GH11247
|
||||
# split/records producing np.datetime64 rather than Timestamps
|
||||
# on datetime64[ns] dtypes only
|
||||
|
||||
tsmp = Timestamp("20130101")
|
||||
test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
|
||||
test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})
|
||||
|
||||
expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
|
||||
expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]
|
||||
|
||||
assert test_data.to_dict(orient="records") == expected_records
|
||||
assert test_data_mixed.to_dict(orient="records") == expected_records_mixed
|
||||
|
||||
expected_series = {
|
||||
"A": Series([tsmp, tsmp], name="A"),
|
||||
"B": Series([tsmp, tsmp], name="B"),
|
||||
}
|
||||
expected_series_mixed = {
|
||||
"A": Series([tsmp, tsmp], name="A"),
|
||||
"B": Series([1, 2], name="B"),
|
||||
}
|
||||
|
||||
tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series)
|
||||
tm.assert_dict_equal(
|
||||
test_data_mixed.to_dict(orient="series"), expected_series_mixed
|
||||
)
|
||||
|
||||
expected_split = {
|
||||
"index": [0, 1],
|
||||
"data": [[tsmp, tsmp], [tsmp, tsmp]],
|
||||
"columns": ["A", "B"],
|
||||
}
|
||||
expected_split_mixed = {
|
||||
"index": [0, 1],
|
||||
"data": [[tsmp, 1], [tsmp, 2]],
|
||||
"columns": ["A", "B"],
|
||||
}
|
||||
|
||||
tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
|
||||
tm.assert_dict_equal(
|
||||
test_data_mixed.to_dict(orient="split"), expected_split_mixed
|
||||
)
|
||||
|
||||
def test_to_dict_index_not_unique_with_index_orient(self):
    """GH22801: orient='index' on a duplicated index loses data, so it must raise."""
    frame = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
    expected_msg = "DataFrame index must be unique for orient='index'"
    with pytest.raises(ValueError, match=expected_msg):
        frame.to_dict(orient="index")
|
||||
|
||||
def test_to_dict_invalid_orient(self):
    """An unrecognized orient value must raise ValueError."""
    frame = DataFrame({"A": [0, 1]})
    with pytest.raises(ValueError, match="orient 'xinvalid' not understood"):
        frame.to_dict(orient="xinvalid")
|
||||
|
||||
def test_to_records_dt64(self):
    """to_records on a datetime64 index: default output and the deprecated
    convert_datetime64 flag (passing it either way must warn)."""
    frame = DataFrame(
        [["one", "two", "three"], ["four", "five", "six"]],
        index=date_range("2012-01-01", "2012-01-02"),
    )

    # convert_datetime64 defaults to None: raw np.datetime64 values come back
    assert frame.to_records()["index"][0] == frame.index.values[0]

    # explicitly passing convert_datetime64=False warns but behaves the same
    with tm.assert_produces_warning(FutureWarning):
        got = frame.to_records(convert_datetime64=False)["index"][0]
        assert frame.index.values[0] == got

    # convert_datetime64=True warns and yields Timestamps instead
    with tm.assert_produces_warning(FutureWarning):
        got = frame.to_records(convert_datetime64=True)["index"][0]
        assert frame.index[0] == got
|
||||
|
||||
def test_to_records_with_multindex(self):
|
||||
# GH3189
|
||||
index = [
|
||||
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
|
||||
["one", "two", "one", "two", "one", "two", "one", "two"],
|
||||
]
|
||||
data = np.zeros((8, 4))
|
||||
df = DataFrame(data, index=index)
|
||||
r = df.to_records(index=True)["level_0"]
|
||||
assert "bar" in r
|
||||
assert "one" not in r
|
||||
|
||||
def test_to_records_with_Mapping_type(self):
|
||||
import email
|
||||
from email.parser import Parser
|
||||
|
||||
abc.Mapping.register(email.message.Message)
|
||||
|
||||
headers = Parser().parsestr(
|
||||
"From: <user@example.com>\n"
|
||||
"To: <someone_else@example.com>\n"
|
||||
"Subject: Test message\n"
|
||||
"\n"
|
||||
"Body would go here\n"
|
||||
)
|
||||
|
||||
frame = DataFrame.from_records([headers])
|
||||
all(x in frame for x in ["Type", "Subject", "From"])
|
||||
|
||||
def test_to_records_floats(self):
|
||||
df = DataFrame(np.random.rand(10, 10))
|
||||
df.to_records()
|
||||
|
||||
def test_to_records_index_name(self):
|
||||
df = DataFrame(np.random.randn(3, 3))
|
||||
df.index.name = "X"
|
||||
rs = df.to_records()
|
||||
assert "X" in rs.dtype.fields
|
||||
|
||||
df = DataFrame(np.random.randn(3, 3))
|
||||
rs = df.to_records()
|
||||
assert "index" in rs.dtype.fields
|
||||
|
||||
df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
|
||||
df.index.names = ["A", None]
|
||||
rs = df.to_records()
|
||||
assert "level_0" in rs.dtype.fields
|
||||
|
||||
def test_to_records_with_unicode_index(self):
|
||||
# GH13172
|
||||
# unicode_literals conflict with to_records
|
||||
result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
|
||||
expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_with_unicode_column_names(self):
|
||||
# xref issue: https://github.com/numpy/numpy/issues/2407
|
||||
# Issue #11879. to_records used to raise an exception when used
|
||||
# with column names containing non-ascii characters in Python 2
|
||||
result = DataFrame(data={"accented_name_é": [1.0]}).to_records()
|
||||
|
||||
# Note that numpy allows for unicode field names but dtypes need
|
||||
# to be specified using dictionary instead of list of tuples.
|
||||
expected = np.rec.array(
|
||||
[(0, 1.0)],
|
||||
dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]},
|
||||
)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_with_categorical(self):
|
||||
|
||||
# GH8626
|
||||
|
||||
# dict creation
|
||||
df = DataFrame({"A": list("abc")}, dtype="category")
|
||||
expected = Series(list("abc"), dtype="category", name="A")
|
||||
tm.assert_series_equal(df["A"], expected)
|
||||
|
||||
# list-like creation
|
||||
df = DataFrame(list("abc"), dtype="category")
|
||||
expected = Series(list("abc"), dtype="category", name=0)
|
||||
tm.assert_series_equal(df[0], expected)
|
||||
|
||||
# to record array
|
||||
# this coerces
|
||||
result = df.to_records()
|
||||
expected = np.rec.array(
|
||||
[(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")]
|
||||
)
|
||||
tm.assert_almost_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        # No dtypes --> default to array dtypes.
        (
            dict(),
            np.rec.array(
                [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
            ),
        ),
        # Should have no effect in this case.
        (
            dict(index=True),
            np.rec.array(
                [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
            ),
        ),
        # Column dtype applied across the board. Index unaffected.
        (
            dict(column_dtypes="<U4"),
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"), ("C", "<U4")],
            ),
        ),
        # Index dtype applied across the board. Columns unaffected.
        (
            dict(index_dtypes="<U1"),
            np.rec.array(
                [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
            ),
        ),
        # Pass in a type instance.
        # NOTE(review): np.unicode is a deprecated alias for the builtin str
        # (removed in NumPy >= 1.24); modern code would pass str or np.str_.
        (
            dict(column_dtypes=np.unicode),
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"), ("C", "<U")],
            ),
        ),
        # Pass in a dtype instance.
        (
            dict(column_dtypes=np.dtype("unicode")),
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"), ("C", "<U")],
            ),
        ),
        # Pass in a dictionary (name-only).
        (
            dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"), ("C", "<U2")],
            ),
        ),
        # Pass in a dictionary (indices-only).
        (
            dict(index_dtypes={0: "int16"}),
            np.rec.array(
                [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
            ),
        ),
        # Ignore index mappings if index is not True.
        (
            dict(index=False, index_dtypes="<U2"),
            np.rec.array(
                [(1, 0.2, "a"), (2, 1.5, "bc")],
                dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
            ),
        ),
        # Non-existent names / indices in mapping should not error.
        (
            dict(index_dtypes={0: "int16", "not-there": "float32"}),
            np.rec.array(
                [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"), ("C", "O")],
            ),
        ),
        # Names / indices not in mapping default to array dtype.
        (
            dict(column_dtypes={"A": np.int8, "B": np.float32}),
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
            ),
        ),
        # Names / indices not in dtype mapping default to array dtype.
        (
            dict(column_dtypes={"A": np.dtype("int8"), "B": np.dtype("float32")}),
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
            ),
        ),
        # Mixture of everything.
        (
            dict(column_dtypes={"A": np.int8, "B": np.float32}, index_dtypes="<U2"),
            np.rec.array(
                [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
            ),
        ),
        # Invalid dtype values.
        (
            dict(index=False, column_dtypes=list()),
            (ValueError, "Invalid dtype \\[\\] specified for column A"),
        ),
        (
            dict(index=False, column_dtypes={"A": "int32", "B": 5}),
            (ValueError, "Invalid dtype 5 specified for column B"),
        ),
        # Numpy can't handle EA types, so check error is raised
        (
            dict(
                index=False,
                column_dtypes={"A": "int32", "B": CategoricalDtype(["a", "b"])},
            ),
            (ValueError, "Invalid dtype category specified for column B"),
        ),
        # Check that bad types raise
        (
            dict(index=False, column_dtypes={"A": "int32", "B": "foo"}),
            (TypeError, 'data type "foo" not understood'),
        ),
    ],
)
def test_to_records_dtype(self, kwargs, expected):
    """Exercise the column_dtypes/index_dtypes arguments of to_records.

    ``expected`` is either the expected recarray, or a
    ``(exception_type, match_pattern)`` pair for invalid dtype inputs.
    """
    # see gh-18146
    df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

    if not isinstance(expected, np.recarray):
        # error case: expected is (exception class, regex for the message)
        with pytest.raises(expected[0], match=expected[1]):
            df.to_records(**kwargs)
    else:
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "df,kwargs,expected",
    [
        # MultiIndex in the index.
        (
            DataFrame(
                [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("abc")
            ).set_index(["a", "b"]),
            dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
            np.rec.array(
                [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
            ),
        ),
        # MultiIndex in the columns.
        (
            DataFrame(
                [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                columns=MultiIndex.from_tuples(
                    [("a", "d"), ("b", "e"), ("c", "f")]
                ),
            ),
            dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
            np.rec.array(
                [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0), (2.0, "7", 8, 9.0)],
                dtype=[
                    ("index", "<f4"),
                    ("('a', 'd')", "<U1"),
                    ("('b', 'e')", "<i8"),
                    ("('c', 'f')", "<f4"),
                ],
            ),
        ),
        # MultiIndex in both the columns and index.
        (
            DataFrame(
                [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                columns=MultiIndex.from_tuples(
                    [("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")
                ),
                index=MultiIndex.from_tuples(
                    [("d", -4), ("d", -5), ("f", -6)], names=list("cd")
                ),
            ),
            dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
            np.rec.array(
                [
                    ("d", -4, 1.0, 2.0, 3.0),
                    ("d", -5, 4.0, 5.0, 6.0),
                    ("f", -6, 7, 8, 9.0),
                ],
                dtype=[
                    ("c", "<U2"),
                    ("d", "i1"),
                    ("('a', 'd')", "<f8"),
                    ("('b', 'e')", "<f8"),
                    ("('c', 'f')", "<f8"),
                ],
            ),
        ),
    ],
)
def test_to_records_dtype_mi(self, df, kwargs, expected):
    """dtype mappings must apply per-level when a MultiIndex is involved."""
    # see gh-18146
    result = df.to_records(**kwargs)
    tm.assert_almost_equal(result, expected)
|
||||
|
||||
def test_to_records_dict_like(self):
    """column_dtypes may be any dict-like mapping, not just dict (gh-18146)."""

    class DictLike:
        """Minimal mapping: supports __getitem__, __contains__ and keys()."""

        def __init__(self, **kwargs):
            self.d = kwargs.copy()

        def __getitem__(self, key):
            return self.d.__getitem__(key)

        def __contains__(self, key):
            return key in self.d

        def keys(self):
            return self.d.keys()

    df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

    # pass the custom mapping as column_dtypes; index_dtypes stays a string
    dtype_mappings = dict(
        column_dtypes=DictLike(**{"A": np.int8, "B": np.float32}),
        index_dtypes="<U2",
    )

    result = df.to_records(**dtype_mappings)
    expected = np.rec.array(
        [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
        dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
    )
    tm.assert_almost_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
def test_to_dict(self, mapping):
    """Round-trip every to_dict orient against the source data (GH16122)."""
    test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}

    # GH16122 -- default orient ("dict"): {column -> {index -> value}}
    recons_data = DataFrame(test_data).to_dict(into=mapping)

    for k, v in test_data.items():
        for k2, v2 in v.items():
            assert v2 == recons_data[k][k2]

    # orient="l" (list): {column -> [values]} -- positional, hence int(k2) - 1
    recons_data = DataFrame(test_data).to_dict("l", mapping)

    for k, v in test_data.items():
        for k2, v2 in v.items():
            assert v2 == recons_data[k][int(k2) - 1]

    # orient="s" (series): {column -> Series}
    recons_data = DataFrame(test_data).to_dict("s", mapping)

    for k, v in test_data.items():
        for k2, v2 in v.items():
            assert v2 == recons_data[k][k2]

    # orient="sp" (split): {"columns": [...], "index": [...], "data": [...]}
    recons_data = DataFrame(test_data).to_dict("sp", mapping)
    expected_split = {
        "columns": ["A", "B"],
        "index": ["1", "2", "3"],
        "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
    }
    tm.assert_dict_equal(recons_data, expected_split)

    # orient="r" (records): [{column -> value}, ...]
    recons_data = DataFrame(test_data).to_dict("r", mapping)
    expected_records = [
        {"A": 1.0, "B": "1"},
        {"A": 2.0, "B": "2"},
        {"A": np.nan, "B": "3"},
    ]
    assert isinstance(recons_data, list)
    assert len(recons_data) == 3
    for l, r in zip(recons_data, expected_records):
        tm.assert_dict_equal(l, r)

    # GH10844 -- orient="i" (index): {index -> {column -> value}}
    recons_data = DataFrame(test_data).to_dict("i")

    for k, v in test_data.items():
        for k2, v2 in v.items():
            assert v2 == recons_data[k2][k]

    # orient="i" with a duplicated column's data under a new name
    df = DataFrame(test_data)
    df["duped"] = df[df.columns[0]]
    recons_data = df.to_dict("i")
    comp_data = test_data.copy()
    comp_data["duped"] = comp_data[df.columns[0]]
    for k, v in comp_data.items():
        for k2, v2 in v.items():
            assert v2 == recons_data[k2][k]
|
||||
|
||||
@pytest.mark.parametrize("mapping", [list, defaultdict, []])
def test_to_dict_errors(self, mapping):
    """Unsupported ``into`` targets must raise TypeError (GH16122)."""
    frame = DataFrame(np.random.randn(3, 3))
    with pytest.raises(TypeError):
        frame.to_dict(into=mapping)
|
||||
|
||||
def test_to_dict_not_unique_warning(self):
    """Duplicate column names are dropped by to_dict with a UserWarning
    (GH16927)."""
    frame = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
    with tm.assert_produces_warning(UserWarning):
        frame.to_dict()
|
||||
|
||||
@pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
def test_to_records_datetimeindex_with_tz(self, tz):
    """to_records of a tz-aware frame matches its UTC-converted twin (GH13937)."""
    dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)

    df = DataFrame({"datetime": dr}, index=dr)

    expected = df.to_records()
    result = df.tz_convert("UTC").to_records()

    # both converted to UTC, so they are equal
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# orient - orient argument to to_dict function
# item_getter - function for extracting value from
# the resulting dict using column name and index
@pytest.mark.parametrize(
    "orient,item_getter",
    [
        ("dict", lambda d, col, idx: d[col][idx]),
        ("records", lambda d, col, idx: d[idx][col]),
        ("list", lambda d, col, idx: d[col][idx]),
        ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
        ("index", lambda d, col, idx: d[idx][col]),
    ],
)
def test_to_dict_box_scalars(self, orient, item_getter):
    """to_dict must return Python int/float, not numpy scalars (GH14216, GH23753)."""
    # make sure that we are boxing properly
    df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
    result = df.to_dict(orient=orient)
    assert isinstance(item_getter(result, "a", 0), int)
    assert isinstance(item_getter(result, "b", 0), float)
|
||||
|
||||
def test_frame_to_dict_tz(self):
    """tz-aware datetime columns survive to_dict(orient='records') (GH18372)."""
    # GH18372 When converting to dict with orient='records' columns of
    # datetime that are tz-aware were not converted to required arrays
    data = [
        (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
        (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),),
    ]
    df = DataFrame(list(data), columns=["d"])

    result = df.to_dict(orient="records")
    expected = [
        {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
        {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
    ]
    tm.assert_dict_equal(result[0], expected[0])
    tm.assert_dict_equal(result[1], expected[1])
|
||||
|
||||
@pytest.mark.parametrize(
    "into, expected",
    [
        (
            dict,
            {
                0: {"int_col": 1, "float_col": 1.0},
                1: {"int_col": 2, "float_col": 2.0},
                2: {"int_col": 3, "float_col": 3.0},
            },
        ),
        (
            OrderedDict,
            OrderedDict(
                [
                    (0, {"int_col": 1, "float_col": 1.0}),
                    (1, {"int_col": 2, "float_col": 2.0}),
                    (2, {"int_col": 3, "float_col": 3.0}),
                ]
            ),
        ),
        (
            defaultdict(list),
            defaultdict(
                list,
                {
                    0: {"int_col": 1, "float_col": 1.0},
                    1: {"int_col": 2, "float_col": 2.0},
                    2: {"int_col": 3, "float_col": 3.0},
                },
            ),
        ),
    ],
)
def test_to_dict_index_dtypes(self, into, expected):
    """to_dict(orient='index') must not cast int columns to float (GH 18580)."""
    # GH 18580
    # When using to_dict(orient='index') on a dataframe with int
    # and float columns only the int columns were cast to float

    df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

    result = df.to_dict(orient="index", into=into)
    cols = ["int_col", "float_col"]
    # round-trip both sides through from_dict so dtypes are comparable
    result = DataFrame.from_dict(result, orient="index")[cols]
    expected = DataFrame.from_dict(expected, orient="index")[cols]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_to_dict_numeric_names(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/24940
|
||||
df = DataFrame({str(i): [i] for i in range(5)})
|
||||
result = set(df.to_dict("records")[0].keys())
|
||||
expected = set(df.columns)
|
||||
assert result == expected
|
||||
|
||||
def test_to_dict_wide(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/24939
|
||||
df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)})
|
||||
result = df.to_dict("records")[0]
|
||||
expected = {"A_{:d}".format(i): i for i in range(256)}
|
||||
assert result == expected
|
||||
1202
venv/lib/python3.6/site-packages/pandas/tests/frame/test_dtypes.py
Normal file
1202
venv/lib/python3.6/site-packages/pandas/tests/frame/test_dtypes.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,479 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Series
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_duplicated_with_misspelled_column_name(subset):
    """A subset naming a missing column raises KeyError for both
    duplicated and drop_duplicates (GH 19730)."""
    frame = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
    msg = re.escape("Index(['a'], dtype='object')")

    with pytest.raises(KeyError, match=msg):
        frame.duplicated(subset)

    with pytest.raises(KeyError, match=msg):
        frame.drop_duplicates(subset)
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
    """duplicated must not fail on a frame with very many columns (gh-21524)."""
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {
        "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100)
    }
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicates produce the bool Series as a result and don't fail during
    # calculation. Actual values doesn't matter here, though usually it's all
    # False in this case
    assert isinstance(result, Series)
    # NOTE(review): np.bool is a deprecated alias for the builtin bool
    # (removed in NumPy >= 1.24); modern code would compare to np.bool_.
    assert result.dtype == np.bool
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_keep(keep, expected):
    """duplicated honours each of the three ``keep`` options."""
    frame = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]})
    tm.assert_series_equal(frame.duplicated(keep=keep), expected)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal")
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, False, True])),
        ("last", Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ],
)
def test_duplicated_nan_none(keep, expected):
    """Known failure: duplicated treats np.nan and None as equal (GH#21720)."""
    df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object)

    result = df.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keep", ["first", "last", False])
@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"])
def test_duplicated_subset(subset, keep):
    """duplicated(subset=...) agrees with duplicated() on the sliced frame."""
    df = DataFrame(
        {
            "A": [0, 1, 1, 2, 0],
            "B": ["a", "b", "b", "c", "a"],
            "C": [np.nan, 3, 3, None, np.nan],
        }
    )

    if subset is None:
        subset = list(df.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        subset = [subset]

    expected = df[subset].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_drop_duplicates():
    """drop_duplicates over single/multiple columns, all keep options, and
    assorted dtypes (ints, int8, near-overflow int64; GH 11376, GH 11864)."""
    df = DataFrame(
        {
            "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )
    # single column
    result = df.drop_duplicates("AAA")
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep="last")
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(["AAA", "B"]))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(["AAA", "B"])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AAA", "B"), keep="last")
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AAA", "B"), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ["AAA", "B", "C"]]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(["AAA", "B"])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep="last")
    expected = df2.drop_duplicates(["AAA", "B"], keep="last")
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(["AAA", "B"], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates("C")
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates("C", keep="last")
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # same values, narrower dtype (int8) must behave identically
    df["E"] = df["C"].astype("int8")
    result = df.drop_duplicates("E")
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates("E", keep="last")
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # values near the int64 limit (hash-collision regression, GH 11376)
    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    df = df.append([[1] + [0] * 8], ignore_index=True)

    for keep in ["first", "last", False]:
        assert df.duplicated(keep=keep).sum() == 0
|
||||
|
||||
|
||||
def test_duplicated_on_empty_frame():
    """duplicated on an empty frame yields a mask that selects nothing
    (GH 25184)."""
    empty = DataFrame(columns=["a", "b"])
    mask = empty.duplicated("a")
    tm.assert_frame_equal(empty[mask], empty.copy())
|
||||
|
||||
|
||||
def test_drop_duplicates_with_duplicate_column_names():
    """drop_duplicates works when two columns share a name (GH17836)."""
    frame = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])

    # considering all columns, every row is unique
    tm.assert_frame_equal(frame.drop_duplicates(), frame)

    # restricting to the duplicated name drops the repeated ("a","a") pair
    tm.assert_frame_equal(frame.drop_duplicates("a"), frame[:2])
|
||||
|
||||
|
||||
def test_drop_duplicates_for_take_all():
    """drop_duplicates where some keys are unique, for all keep options."""
    df = DataFrame(
        {
            "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )
    # single column
    result = df.drop_duplicates("AAA")
    expected = df.iloc[[0, 1, 2, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep="last")
    expected = df.iloc[[2, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep=False)
    expected = df.iloc[[2, 6]]
    tm.assert_frame_equal(result, expected)

    # multiple columns
    result = df.drop_duplicates(["AAA", "B"])
    expected = df.iloc[[0, 1, 2, 3, 4, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["AAA", "B"], keep="last")
    expected = df.iloc[[0, 1, 2, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["AAA", "B"], keep=False)
    expected = df.iloc[[0, 1, 2, 6]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_drop_duplicates_tuple():
    """drop_duplicates accepts a tuple as a single column label."""
    df = DataFrame(
        {
            ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )
    # single column
    result = df.drop_duplicates(("AA", "AB"))
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AA", "AB"), keep="last")
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AA", "AB"), keep=False)
    expected = df.loc[[]]  # empty df
    assert len(result) == 0
    tm.assert_frame_equal(result, expected)

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates((("AA", "AB"), "B"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "df",
    [
        DataFrame(),
        DataFrame(columns=[]),
        DataFrame(columns=["A", "B", "C"]),
        DataFrame(index=[]),
        DataFrame(index=["A", "B", "C"]),
    ],
)
def test_drop_duplicates_empty(df):
    """drop_duplicates is a no-op on empty frames, plain and inplace (GH 20516)."""
    result = df.drop_duplicates()
    tm.assert_frame_equal(result, df)

    result = df.copy()
    result.drop_duplicates(inplace=True)
    tm.assert_frame_equal(result, df)
|
||||
|
||||
|
||||
def test_drop_duplicates_NA():
    """drop_duplicates treats None and np.nan keys like any other value."""
    # none
    df = DataFrame(
        {
            "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
            "D": range(8),
        }
    )
    # single column
    result = df.drop_duplicates("A")
    expected = df.loc[[0, 2, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("A", keep="last")
    expected = df.loc[[1, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("A", keep=False)
    expected = df.loc[[]]  # empty df
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    result = df.drop_duplicates(["A", "B"])
    expected = df.loc[[0, 2, 3, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["A", "B"], keep="last")
    expected = df.loc[[1, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["A", "B"], keep=False)
    expected = df.loc[[6]]
    tm.assert_frame_equal(result, expected)

    # nan
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0],
            "D": range(8),
        }
    )
    # single column
    result = df.drop_duplicates("C")
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("C", keep="last")
    expected = df.loc[[3, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("C", keep=False)
    expected = df.loc[[]]  # empty df
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    result = df.drop_duplicates(["C", "B"])
    expected = df.loc[[0, 1, 2, 4]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["C", "B"], keep="last")
    expected = df.loc[[1, 3, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(["C", "B"], keep=False)
    expected = df.loc[[1]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_drop_duplicates_NA_for_take_all():
    """drop_duplicates with NA keys where some values are unique."""
    # none
    df = DataFrame(
        {
            "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
            "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
        }
    )

    # single column
    result = df.drop_duplicates("A")
    expected = df.iloc[[0, 2, 3, 5, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("A", keep="last")
    expected = df.iloc[[1, 4, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("A", keep=False)
    expected = df.iloc[[5, 7]]
    tm.assert_frame_equal(result, expected)

    # nan

    # single column
    result = df.drop_duplicates("C")
    expected = df.iloc[[0, 1, 5, 6]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("C", keep="last")
    expected = df.iloc[[3, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("C", keep=False)
    expected = df.iloc[[5, 6]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_drop_duplicates_inplace():
    """drop_duplicates(inplace=True) mirrors the non-inplace results."""
    orig = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )
    # single column
    df = orig.copy()
    df.drop_duplicates("A", inplace=True)
    expected = orig[:2]
    result = df
    tm.assert_frame_equal(result, expected)

    df = orig.copy()
    df.drop_duplicates("A", keep="last", inplace=True)
    expected = orig.loc[[6, 7]]
    result = df
    tm.assert_frame_equal(result, expected)

    df = orig.copy()
    df.drop_duplicates("A", keep=False, inplace=True)
    expected = orig.loc[[]]
    result = df
    tm.assert_frame_equal(result, expected)
    assert len(df) == 0

    # multi column
    df = orig.copy()
    df.drop_duplicates(["A", "B"], inplace=True)
    expected = orig.loc[[0, 1, 2, 3]]
    result = df
    tm.assert_frame_equal(result, expected)

    df = orig.copy()
    df.drop_duplicates(["A", "B"], keep="last", inplace=True)
    expected = orig.loc[[0, 5, 6, 7]]
    result = df
    tm.assert_frame_equal(result, expected)

    df = orig.copy()
    df.drop_duplicates(["A", "B"], keep=False, inplace=True)
    expected = orig.loc[[0]]
    result = df
    tm.assert_frame_equal(result, expected)

    # consider everything
    orig2 = orig.loc[:, ["A", "B", "C"]].copy()

    df2 = orig2.copy()
    df2.drop_duplicates(inplace=True)
    # in this case only
    expected = orig2.drop_duplicates(["A", "B"])
    result = df2
    tm.assert_frame_equal(result, expected)

    df2 = orig2.copy()
    df2.drop_duplicates(keep="last", inplace=True)
    expected = orig2.drop_duplicates(["A", "B"], keep="last")
    result = df2
    tm.assert_frame_equal(result, expected)

    df2 = orig2.copy()
    df2.drop_duplicates(keep=False, inplace=True)
    expected = orig2.drop_duplicates(["A", "B"], keep=False)
    result = df2
    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,120 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas.util import testing as tm
|
||||
|
||||
|
||||
def test_error():
    """explode rejects multi-column input and non-unique column labels."""
    frame = pd.DataFrame(
        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
    )

    # more than one column is not supported
    with pytest.raises(ValueError):
        frame.explode(list("AA"))

    # a duplicated column label is rejected as well
    frame.columns = list("AA")
    with pytest.raises(ValueError):
        frame.explode("A")
|
||||
|
||||
|
||||
def test_basic():
    """explode expands list-likes row-wise; NaN and empty list stay as NaN."""
    df = pd.DataFrame(
        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
    )
    result = df.explode("A")
    # index labels repeat for exploded rows; other columns are broadcast
    expected = pd.DataFrame(
        {
            "A": pd.Series(
                [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
            ),
            "B": 1,
        }
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_index_rows():
    """explode repeats MultiIndex row labels for each exploded element."""
    df = pd.DataFrame(
        {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1},
        index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]),
    )

    result = df.explode("A")
    expected = pd.DataFrame(
        {
            "A": pd.Series(
                [0, 1, 2, np.nan, np.nan, 3, 4],
                index=pd.MultiIndex.from_tuples(
                    [
                        ("a", 1),
                        ("a", 1),
                        ("a", 1),
                        ("a", 2),
                        ("b", 1),
                        ("b", 2),
                        ("b", 2),
                    ]
                ),
                dtype=object,
            ),
            "B": 1,
        }
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_index_columns():
    # a tuple label selects a single column under MultiIndex columns
    df = pd.DataFrame(
        {
            ("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object),
            ("A", 2): 1,
        }
    )

    result = df.explode(("A", 1))

    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=pd.Index([0, 0, 0, 1, 2, 3, 3]),
        dtype=object,
    )
    expected = pd.DataFrame({("A", 1): exploded, ("A", 2): 1})
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecase():
    # explode a single column of a frame with a non-default index
    # gh-10511
    df = pd.DataFrame(
        [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC")
    ).set_index("C")

    result = df.explode("B")

    expected = pd.DataFrame(
        {
            "A": [11, 11, 11, 11, 11, 22, 22, 22],
            "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
            "C": [10, 10, 10, 10, 10, 20, 20, 20],
        },
        columns=list("ABC"),
    ).set_index("C")
    tm.assert_frame_equal(result, expected)

    # gh-8517: split-then-explode round trip on text data keeps the
    # originating row label for every produced token
    df = pd.DataFrame(
        [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
        columns=["dt", "name", "text"],
    )
    result = df.assign(text=df.text.str.split(" ")).explode("text")
    expected = pd.DataFrame(
        [
            ["2014-01-01", "Alice", "A"],
            ["2014-01-01", "Alice", "B"],
            ["2014-01-02", "Bob", "C"],
            ["2014-01-02", "Bob", "D"],
        ],
        columns=["dt", "name", "text"],
        index=[0, 0, 1, 1],
    )
    tm.assert_frame_equal(result, expected)
|
||||
3867
venv/lib/python3.6/site-packages/pandas/tests/frame/test_indexing.py
Normal file
3867
venv/lib/python3.6/site-packages/pandas/tests/frame/test_indexing.py
Normal file
File diff suppressed because it is too large
Load Diff
195
venv/lib/python3.6/site-packages/pandas/tests/frame/test_join.py
Normal file
195
venv/lib/python3.6/site-packages/pandas/tests/frame/test_join.py
Normal file
@@ -0,0 +1,195 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Index, period_range
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def frame_with_period_index():
    # 4x5 frame of consecutive ints indexed by four annual periods
    idx = period_range(start="2000", freq="A", periods=4)
    return DataFrame(
        data=np.arange(20).reshape(4, 5), columns=list("abcde"), index=idx
    )
|
||||
|
||||
|
||||
@pytest.fixture
def left():
    # left operand for the join tests: values descending, index reversed
    return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0])
|
||||
|
||||
|
||||
@pytest.fixture
def right():
    # right operand for the join tests: partially overlapping index {1, 2, 3}
    return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "how, sort, expected",
    [
        ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
        ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
        (
            "left",
            False,
            DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
        ),
        (
            "left",
            True,
            DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
        ),
        (
            "right",
            False,
            DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]),
        ),
        (
            "right",
            True,
            DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]),
        ),
        (
            "outer",
            False,
            DataFrame(
                {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
                index=[0, 1, 2, 3],
            ),
        ),
        (
            "outer",
            True,
            DataFrame(
                {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
                index=[0, 1, 2, 3],
            ),
        ),
    ],
)
def test_join(left, right, how, sort, expected):
    # index join of the two fixture frames must match the precomputed
    # expectation for every how/sort combination
    joined = left.join(right, how=how, sort=sort)
    tm.assert_frame_equal(joined, expected)
|
||||
|
||||
|
||||
def test_join_index(float_frame):
    # index joins: left / right / inner / outer, plus the error cases
    lhs = float_frame.loc[float_frame.index[:10], ["A", "B"]]
    rhs = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1]
    expected_columns = Index(["A", "B", "C", "D"])

    # default join keeps the calling frame's index
    joined = lhs.join(rhs)
    tm.assert_index_equal(lhs.index, joined.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    joined = lhs.join(rhs, how="left")
    tm.assert_index_equal(joined.index, lhs.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    joined = lhs.join(rhs, how="right")
    tm.assert_index_equal(joined.index, rhs.index)
    tm.assert_index_equal(joined.columns, expected_columns)

    # inner: intersection of the two indexes

    joined = lhs.join(rhs, how="inner")
    tm.assert_index_equal(joined.index, lhs.index[5:10])
    tm.assert_index_equal(joined.columns, expected_columns)

    # outer: sorted union of the two indexes

    joined = lhs.join(rhs, how="outer")
    tm.assert_index_equal(joined.index, float_frame.index.sort_values())
    tm.assert_index_equal(joined.columns, expected_columns)

    with pytest.raises(ValueError, match="join method"):
        lhs.join(rhs, how="foo")

    # corner case - overlapping columns without suffixes must raise
    msg = "columns overlap but no suffix"
    for how in ("outer", "left", "inner"):
        with pytest.raises(ValueError, match=msg):
            float_frame.join(float_frame, how=how)
|
||||
|
||||
|
||||
def test_join_index_more(float_frame):
    # joining against an every-other-row slice leaves NaN in the gaps
    ab = float_frame.loc[:, ["A", "B"]]
    cd = float_frame.loc[::2, ["C", "D"]]

    expected = ab.copy()
    expected["C"] = float_frame["C"][::2]
    expected["D"] = float_frame["D"][::2]

    tm.assert_frame_equal(ab.join(cd), expected)

    # right join restricts to the sparser index
    tm.assert_frame_equal(ab.join(cd, how="right"), expected[::2])

    result = cd.join(ab, how="right")
    tm.assert_frame_equal(result, expected.loc[:, result.columns])
|
||||
|
||||
|
||||
def test_join_index_series(float_frame):
    # a named Series joins like a single-column frame
    df = float_frame.copy()
    ser = df.pop(float_frame.columns[-1])
    joined = df.join(ser)

    # TODO should this check_names ?
    tm.assert_frame_equal(joined, float_frame, check_names=False)

    # an unnamed Series cannot be joined
    ser.name = None
    with pytest.raises(ValueError, match="must have a name"):
        df.join(ser)
|
||||
|
||||
|
||||
def test_join_overlap(float_frame):
    # overlapping columns get the caller-supplied suffixes; the rest pass
    # through unchanged
    df1 = float_frame.loc[:, ["A", "B", "C"]]
    df2 = float_frame.loc[:, ["B", "C", "D"]]

    joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2")

    df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1")
    df2_suf = df2.loc[:, ["B", "C"]].add_suffix("_df2")
    no_overlap = float_frame.loc[:, ["A", "D"]]
    expected = df1_suf.join(df2_suf).join(no_overlap)

    # column order not necessarily sorted
    tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
|
||||
|
||||
|
||||
def test_join_period_index(frame_with_period_index):
    # joining a frame with a PeriodIndex preserves that index
    other = frame_with_period_index.rename(
        columns=lambda x: "{key}{key}".format(key=x)
    )

    joined = frame_with_period_index.join(other)

    expected = DataFrame(
        data=np.concatenate([frame_with_period_index.values] * 2, axis=1),
        columns=frame_with_period_index.columns.append(other.columns),
        index=frame_with_period_index.index,
    )
    tm.assert_frame_equal(joined, expected)
|
||||
|
||||
|
||||
def test_join_left_sequence_non_unique_index():
    # https://github.com/pandas-dev/pandas/issues/19607
    # joining a sequence of frames when one has duplicate index labels
    df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3])
    df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2])
    df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4])

    joined = df1.join([df2, df3], how="left")

    # label 2 appears twice in df3, so its row is duplicated in the result
    expected = DataFrame(
        {
            "a": [0, 10, 10, 20],
            "b": [np.nan, 300, 300, 200],
            "c": [np.nan, 400, 500, np.nan],
        },
        index=[1, 2, 2, 3],
    )
    tm.assert_frame_equal(joined, expected)
|
||||
@@ -0,0 +1,972 @@
|
||||
import datetime
|
||||
|
||||
import dateutil
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import Categorical, DataFrame, Series, Timestamp, date_range
|
||||
from pandas.tests.frame.common import _check_mixed_float
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
def _skip_if_no_pchip():
    """Skip the calling test when scipy's pchip interpolator is unavailable.

    Fix: the original re-imported ``pytest`` inside the ``except`` branch even
    though the module already imports it at the top (see the module imports);
    the redundant local import is removed.
    """
    try:
        from scipy.interpolate import pchip_interpolate  # noqa
    except ImportError:
        pytest.skip("scipy.interpolate.pchip missing")
|
||||
|
||||
|
||||
class TestDataFrameMissingData:
|
||||
def test_dropEmptyRows(self, float_frame):
|
||||
N = len(float_frame.index)
|
||||
mat = np.random.randn(N)
|
||||
mat[:5] = np.nan
|
||||
|
||||
frame = DataFrame({"foo": mat}, index=float_frame.index)
|
||||
original = Series(mat, index=float_frame.index, name="foo")
|
||||
expected = original.dropna()
|
||||
inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
|
||||
|
||||
smaller_frame = frame.dropna(how="all")
|
||||
# check that original was preserved
|
||||
assert_series_equal(frame["foo"], original)
|
||||
inplace_frame1.dropna(how="all", inplace=True)
|
||||
assert_series_equal(smaller_frame["foo"], expected)
|
||||
assert_series_equal(inplace_frame1["foo"], expected)
|
||||
|
||||
smaller_frame = frame.dropna(how="all", subset=["foo"])
|
||||
inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
|
||||
assert_series_equal(smaller_frame["foo"], expected)
|
||||
assert_series_equal(inplace_frame2["foo"], expected)
|
||||
|
||||
def test_dropIncompleteRows(self, float_frame):
|
||||
N = len(float_frame.index)
|
||||
mat = np.random.randn(N)
|
||||
mat[:5] = np.nan
|
||||
|
||||
frame = DataFrame({"foo": mat}, index=float_frame.index)
|
||||
frame["bar"] = 5
|
||||
original = Series(mat, index=float_frame.index, name="foo")
|
||||
inp_frame1, inp_frame2 = frame.copy(), frame.copy()
|
||||
|
||||
smaller_frame = frame.dropna()
|
||||
assert_series_equal(frame["foo"], original)
|
||||
inp_frame1.dropna(inplace=True)
|
||||
|
||||
exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
|
||||
tm.assert_series_equal(smaller_frame["foo"], exp)
|
||||
tm.assert_series_equal(inp_frame1["foo"], exp)
|
||||
|
||||
samesize_frame = frame.dropna(subset=["bar"])
|
||||
assert_series_equal(frame["foo"], original)
|
||||
assert (frame["bar"] == 5).all()
|
||||
inp_frame2.dropna(subset=["bar"], inplace=True)
|
||||
tm.assert_index_equal(samesize_frame.index, float_frame.index)
|
||||
tm.assert_index_equal(inp_frame2.index, float_frame.index)
|
||||
|
||||
def test_dropna(self):
|
||||
df = DataFrame(np.random.randn(6, 4))
|
||||
df[2][:2] = np.nan
|
||||
|
||||
dropped = df.dropna(axis=1)
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=1, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=0)
|
||||
expected = df.loc[list(range(2, 6))]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
# threshold
|
||||
dropped = df.dropna(axis=1, thresh=5)
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=1, thresh=5, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=0, thresh=4)
|
||||
expected = df.loc[range(2, 6)]
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, thresh=4, inplace=True)
|
||||
assert_frame_equal(dropped, expected)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
dropped = df.dropna(axis=1, thresh=4)
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
dropped = df.dropna(axis=1, thresh=3)
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
# subset
|
||||
dropped = df.dropna(axis=0, subset=[0, 1, 3])
|
||||
inp = df.copy()
|
||||
inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
|
||||
assert_frame_equal(dropped, df)
|
||||
assert_frame_equal(inp, df)
|
||||
|
||||
# all
|
||||
dropped = df.dropna(axis=1, how="all")
|
||||
assert_frame_equal(dropped, df)
|
||||
|
||||
df[2] = np.nan
|
||||
dropped = df.dropna(axis=1, how="all")
|
||||
expected = df.loc[:, [0, 1, 3]]
|
||||
assert_frame_equal(dropped, expected)
|
||||
|
||||
# bad input
|
||||
msg = "No axis named 3 for object type <class 'pandas.core.frame.DataFrame'>"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.dropna(axis=3)
|
||||
|
||||
def test_drop_and_dropna_caching(self):
|
||||
# tst that cacher updates
|
||||
original = Series([1, 2, np.nan], name="A")
|
||||
expected = Series([1, 2], dtype=original.dtype, name="A")
|
||||
df = pd.DataFrame({"A": original.values.copy()})
|
||||
df2 = df.copy()
|
||||
df["A"].dropna()
|
||||
assert_series_equal(df["A"], original)
|
||||
df["A"].dropna(inplace=True)
|
||||
assert_series_equal(df["A"], expected)
|
||||
df2["A"].drop([1])
|
||||
assert_series_equal(df2["A"], original)
|
||||
df2["A"].drop([1], inplace=True)
|
||||
assert_series_equal(df2["A"], original.drop([1]))
|
||||
|
||||
def test_dropna_corner(self, float_frame):
|
||||
# bad input
|
||||
msg = "invalid how option: foo"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
float_frame.dropna(how="foo")
|
||||
msg = "must specify how or thresh"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
float_frame.dropna(how=None)
|
||||
# non-existent column - 8303
|
||||
with pytest.raises(KeyError, match=r"^\['X'\]$"):
|
||||
float_frame.dropna(subset=["A", "X"])
|
||||
|
||||
def test_dropna_multiple_axes(self):
|
||||
df = DataFrame(
|
||||
[
|
||||
[1, np.nan, 2, 3],
|
||||
[4, np.nan, 5, 6],
|
||||
[np.nan, np.nan, np.nan, np.nan],
|
||||
[7, np.nan, 8, 9],
|
||||
]
|
||||
)
|
||||
cp = df.copy()
|
||||
|
||||
# GH20987
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result = df.dropna(how="all", axis=[0, 1])
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
result2 = df.dropna(how="all", axis=(0, 1))
|
||||
expected = df.dropna(how="all").dropna(how="all", axis=1)
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
assert_frame_equal(result2, expected)
|
||||
assert_frame_equal(df, cp)
|
||||
|
||||
inp = df.copy()
|
||||
with tm.assert_produces_warning(FutureWarning):
|
||||
inp.dropna(how="all", axis=(0, 1), inplace=True)
|
||||
assert_frame_equal(inp, expected)
|
||||
|
||||
def test_dropna_tz_aware_datetime(self):
|
||||
# GH13407
|
||||
df = DataFrame()
|
||||
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
|
||||
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
|
||||
df["Time"] = [dt1]
|
||||
result = df.dropna(axis=0)
|
||||
expected = DataFrame({"Time": [dt1]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# Ex2
|
||||
df = DataFrame({"Time": [dt1, None, np.nan, dt2]})
|
||||
result = df.dropna(axis=0)
|
||||
expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_dropna_categorical_interval_index(self):
|
||||
# GH 25087
|
||||
ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28])
|
||||
ci = pd.CategoricalIndex(ii)
|
||||
df = pd.DataFrame({"A": list("abc")}, index=ci)
|
||||
|
||||
expected = df
|
||||
result = df.dropna()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_datetime(self, datetime_frame):
|
||||
tf = datetime_frame
|
||||
tf.loc[tf.index[:5], "A"] = np.nan
|
||||
tf.loc[tf.index[-5:], "A"] = np.nan
|
||||
|
||||
zero_filled = datetime_frame.fillna(0)
|
||||
assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all()
|
||||
|
||||
padded = datetime_frame.fillna(method="pad")
|
||||
assert np.isnan(padded.loc[padded.index[:5], "A"]).all()
|
||||
assert (
|
||||
padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"]
|
||||
).all()
|
||||
|
||||
msg = "Must specify a fill 'value' or 'method'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
datetime_frame.fillna()
|
||||
msg = "Cannot specify both 'value' and 'method'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
datetime_frame.fillna(5, method="ffill")
|
||||
|
||||
def test_fillna_mixed_type(self, float_string_frame):
|
||||
|
||||
mf = float_string_frame
|
||||
mf.loc[mf.index[5:20], "foo"] = np.nan
|
||||
mf.loc[mf.index[-10:], "A"] = np.nan
|
||||
# TODO: make stronger assertion here, GH 25640
|
||||
mf.fillna(value=0)
|
||||
mf.fillna(method="pad")
|
||||
|
||||
def test_fillna_mixed_float(self, mixed_float_frame):
|
||||
|
||||
# mixed numeric (but no float16)
|
||||
mf = mixed_float_frame.reindex(columns=["A", "B", "D"])
|
||||
mf.loc[mf.index[-10:], "A"] = np.nan
|
||||
result = mf.fillna(value=0)
|
||||
_check_mixed_float(result, dtype=dict(C=None))
|
||||
|
||||
result = mf.fillna(method="pad")
|
||||
_check_mixed_float(result, dtype=dict(C=None))
|
||||
|
||||
def test_fillna_empty(self):
    # empty frame (GH #2778): fillna with a method must not raise,
    # both in-place and copying
    df = DataFrame(columns=["x"])
    for method in ("pad", "backfill"):
        df.x.fillna(method=method, inplace=True)
        df.x.fillna(method=method)
|
||||
|
||||
def test_fillna_different_dtype(self):
|
||||
# with different dtype (GH#3386)
|
||||
df = DataFrame(
|
||||
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
|
||||
)
|
||||
|
||||
result = df.fillna({2: "foo"})
|
||||
expected = DataFrame(
|
||||
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df.fillna({2: "foo"}, inplace=True)
|
||||
assert_frame_equal(df, expected)
|
||||
|
||||
def test_fillna_limit_and_value(self):
    # limit=1 fills at most one NaN per column, starting from the top
    df = DataFrame(np.random.randn(10, 3))
    df.iloc[2:7, 0] = np.nan
    df.iloc[3:5, 2] = np.nan

    expected = df.copy()
    expected.iloc[2, 0] = 999
    expected.iloc[3, 2] = 999

    result = df.fillna(999, limit=1)
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_datelike(self):
|
||||
# with datelike
|
||||
# GH#6344
|
||||
df = DataFrame(
|
||||
{
|
||||
"Date": [pd.NaT, Timestamp("2014-1-1")],
|
||||
"Date2": [Timestamp("2013-1-1"), pd.NaT],
|
||||
}
|
||||
)
|
||||
|
||||
expected = df.copy()
|
||||
expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"])
|
||||
result = df.fillna(value={"Date": df["Date2"]})
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_tzaware(self):
|
||||
# with timezone
|
||||
# GH#15855
|
||||
df = pd.DataFrame({"A": [pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]})
|
||||
exp = pd.DataFrame(
|
||||
{
|
||||
"A": [
|
||||
pd.Timestamp("2012-11-11 00:00:00+01:00"),
|
||||
pd.Timestamp("2012-11-11 00:00:00+01:00"),
|
||||
]
|
||||
}
|
||||
)
|
||||
assert_frame_equal(df.fillna(method="pad"), exp)
|
||||
|
||||
df = pd.DataFrame({"A": [pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]})
|
||||
exp = pd.DataFrame(
|
||||
{
|
||||
"A": [
|
||||
pd.Timestamp("2012-11-11 00:00:00+01:00"),
|
||||
pd.Timestamp("2012-11-11 00:00:00+01:00"),
|
||||
]
|
||||
}
|
||||
)
|
||||
assert_frame_equal(df.fillna(method="bfill"), exp)
|
||||
|
||||
def test_fillna_tzaware_different_column(self):
|
||||
# with timezone in another column
|
||||
# GH#15522
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"A": pd.date_range("20130101", periods=4, tz="US/Eastern"),
|
||||
"B": [1, 2, np.nan, np.nan],
|
||||
}
|
||||
)
|
||||
result = df.fillna(method="pad")
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"A": pd.date_range("20130101", periods=4, tz="US/Eastern"),
|
||||
"B": [1.0, 2.0, 2.0, 2.0],
|
||||
}
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_na_actions_categorical(self):
|
||||
|
||||
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
|
||||
vals = ["a", "b", np.nan, "d"]
|
||||
df = DataFrame({"cats": cat, "vals": vals})
|
||||
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
|
||||
vals2 = ["a", "b", "b", "d"]
|
||||
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
|
||||
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
|
||||
vals3 = ["a", "b", np.nan]
|
||||
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
|
||||
cat4 = Categorical([1, 2], categories=[1, 2, 3])
|
||||
vals4 = ["a", "b"]
|
||||
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
|
||||
|
||||
# fillna
|
||||
res = df.fillna(value={"cats": 3, "vals": "b"})
|
||||
tm.assert_frame_equal(res, df_exp_fill)
|
||||
|
||||
with pytest.raises(ValueError, match=("fill value must be in categories")):
|
||||
df.fillna(value={"cats": 4, "vals": "c"})
|
||||
|
||||
res = df.fillna(method="pad")
|
||||
tm.assert_frame_equal(res, df_exp_fill)
|
||||
|
||||
# dropna
|
||||
res = df.dropna(subset=["cats"])
|
||||
tm.assert_frame_equal(res, df_exp_drop_cats)
|
||||
|
||||
res = df.dropna()
|
||||
tm.assert_frame_equal(res, df_exp_drop_all)
|
||||
|
||||
# make sure that fillna takes missing values into account
|
||||
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
|
||||
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
|
||||
|
||||
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
|
||||
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
|
||||
|
||||
res = df.fillna("a")
|
||||
tm.assert_frame_equal(res, df_exp)
|
||||
|
||||
def test_fillna_categorical_nan(self):
|
||||
# GH 14021
|
||||
# np.nan should always be a valid filler
|
||||
cat = Categorical([np.nan, 2, np.nan])
|
||||
val = Categorical([np.nan, np.nan, np.nan])
|
||||
df = DataFrame({"cats": cat, "vals": val})
|
||||
res = df.fillna(df.median())
|
||||
v_exp = [np.nan, np.nan, np.nan]
|
||||
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category")
|
||||
tm.assert_frame_equal(res, df_exp)
|
||||
|
||||
result = df.cats.fillna(np.nan)
|
||||
tm.assert_series_equal(result, df.cats)
|
||||
result = df.vals.fillna(np.nan)
|
||||
tm.assert_series_equal(result, df.vals)
|
||||
|
||||
idx = pd.DatetimeIndex(
|
||||
["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT]
|
||||
)
|
||||
df = DataFrame({"a": Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
idx = pd.PeriodIndex(
|
||||
["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M"
|
||||
)
|
||||
df = DataFrame({"a": Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT])
|
||||
df = DataFrame({"a": Categorical(idx)})
|
||||
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||||
|
||||
def test_fillna_downcast(self):
    # GH 15277: downcast="infer" converts the filled float64 column
    # back to int64 when all values are integral
    df = pd.DataFrame({"a": [1.0, np.nan]})
    expected = pd.DataFrame({"a": [1, 0]})

    result = df.fillna(0, downcast="infer")
    assert_frame_equal(result, expected)

    # same inference when the fill value comes from a dict
    df = pd.DataFrame({"a": [1.0, np.nan]})
    result = df.fillna({"a": 0}, downcast="infer")
    assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_dtype_conversion(self):
|
||||
# make sure that fillna on an empty frame works
|
||||
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||||
result = df.dtypes
|
||||
expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5])
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.fillna(1)
|
||||
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# empty block
|
||||
df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
|
||||
result = df.fillna("nan")
|
||||
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# equiv of replace
|
||||
df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0]))
|
||||
for v in ["", 1, np.nan, 1.0]:
|
||||
expected = df.replace(np.nan, v)
|
||||
result = df.fillna(v)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_datetime_columns(self):
|
||||
# GH 7095
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"A": [-1, -2, np.nan],
|
||||
"B": date_range("20130101", periods=3),
|
||||
"C": ["foo", "bar", None],
|
||||
"D": ["foo2", "bar2", None],
|
||||
},
|
||||
index=date_range("20130110", periods=3),
|
||||
)
|
||||
result = df.fillna("?")
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"A": [-1, -2, "?"],
|
||||
"B": date_range("20130101", periods=3),
|
||||
"C": ["foo", "bar", "?"],
|
||||
"D": ["foo2", "bar2", "?"],
|
||||
},
|
||||
index=date_range("20130110", periods=3),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"A": [-1, -2, np.nan],
|
||||
"B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), pd.NaT],
|
||||
"C": ["foo", "bar", None],
|
||||
"D": ["foo2", "bar2", None],
|
||||
},
|
||||
index=date_range("20130110", periods=3),
|
||||
)
|
||||
result = df.fillna("?")
|
||||
expected = pd.DataFrame(
|
||||
{
|
||||
"A": [-1, -2, "?"],
|
||||
"B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), "?"],
|
||||
"C": ["foo", "bar", "?"],
|
||||
"D": ["foo2", "bar2", "?"],
|
||||
},
|
||||
index=pd.date_range("20130110", periods=3),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_ffill(self, datetime_frame):
|
||||
datetime_frame["A"][:5] = np.nan
|
||||
datetime_frame["A"][-5:] = np.nan
|
||||
|
||||
assert_frame_equal(
|
||||
datetime_frame.ffill(), datetime_frame.fillna(method="ffill")
|
||||
)
|
||||
|
||||
def test_bfill(self, datetime_frame):
|
||||
datetime_frame["A"][:5] = np.nan
|
||||
datetime_frame["A"][-5:] = np.nan
|
||||
|
||||
assert_frame_equal(
|
||||
datetime_frame.bfill(), datetime_frame.fillna(method="bfill")
|
||||
)
|
||||
|
||||
def test_frame_pad_backfill_limit(self):
|
||||
index = np.arange(10)
|
||||
df = DataFrame(np.random.randn(10, 4), index=index)
|
||||
|
||||
result = df[:2].reindex(index, method="pad", limit=5)
|
||||
|
||||
expected = df[:2].reindex(index).fillna(method="pad")
|
||||
expected.values[-3:] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df[-2:].reindex(index, method="backfill", limit=5)
|
||||
|
||||
expected = df[-2:].reindex(index).fillna(method="backfill")
|
||||
expected.values[:3] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_fillna_limit(self):
|
||||
index = np.arange(10)
|
||||
df = DataFrame(np.random.randn(10, 4), index=index)
|
||||
|
||||
result = df[:2].reindex(index)
|
||||
result = result.fillna(method="pad", limit=5)
|
||||
|
||||
expected = df[:2].reindex(index).fillna(method="pad")
|
||||
expected.values[-3:] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df[-2:].reindex(index)
|
||||
result = result.fillna(method="backfill", limit=5)
|
||||
|
||||
expected = df[-2:].reindex(index).fillna(method="backfill")
|
||||
expected.values[:3] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_skip_certain_blocks(self):
|
||||
# don't try to fill boolean, int blocks
|
||||
|
||||
df = DataFrame(np.random.randn(10, 4).astype(int))
|
||||
|
||||
# it works!
|
||||
df.fillna(np.nan)
|
||||
|
||||
@pytest.mark.parametrize("type", [int, float])
|
||||
def test_fillna_positive_limit(self, type):
|
||||
df = DataFrame(np.random.randn(10, 4)).astype(type)
|
||||
|
||||
msg = "Limit must be greater than 0"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.fillna(0, limit=-5)
|
||||
|
||||
@pytest.mark.parametrize("type", [int, float])
|
||||
def test_fillna_integer_limit(self, type):
|
||||
df = DataFrame(np.random.randn(10, 4)).astype(type)
|
||||
|
||||
msg = "Limit must be an integer"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.fillna(0, limit=0.5)
|
||||
|
||||
def test_fillna_inplace(self):
|
||||
df = DataFrame(np.random.randn(10, 4))
|
||||
df[1][:4] = np.nan
|
||||
df[3][-4:] = np.nan
|
||||
|
||||
expected = df.fillna(value=0)
|
||||
assert expected is not df
|
||||
|
||||
df.fillna(value=0, inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
expected = df.fillna(value={0: 0}, inplace=True)
|
||||
assert expected is None
|
||||
|
||||
df[1][:4] = np.nan
|
||||
df[3][-4:] = np.nan
|
||||
expected = df.fillna(method="ffill")
|
||||
assert expected is not df
|
||||
|
||||
df.fillna(method="ffill", inplace=True)
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_fillna_dict_series(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [np.nan, 1, 2, np.nan, np.nan],
|
||||
"b": [1, 2, 3, np.nan, np.nan],
|
||||
"c": [np.nan, 1, 2, 3, 4],
|
||||
}
|
||||
)
|
||||
|
||||
result = df.fillna({"a": 0, "b": 5})
|
||||
|
||||
expected = df.copy()
|
||||
expected["a"] = expected["a"].fillna(0)
|
||||
expected["b"] = expected["b"].fillna(5)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# it works
|
||||
result = df.fillna({"a": 0, "b": 5, "d": 7})
|
||||
|
||||
# Series treated same as dict
|
||||
result = df.fillna(df.max())
|
||||
expected = df.fillna(df.max().to_dict())
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# disable this for now
|
||||
with pytest.raises(NotImplementedError, match="column by column"):
|
||||
df.fillna(df.max(1), axis=1)
|
||||
|
||||
def test_fillna_dataframe(self):
|
||||
# GH 8377
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [np.nan, 1, 2, np.nan, np.nan],
|
||||
"b": [1, 2, 3, np.nan, np.nan],
|
||||
"c": [np.nan, 1, 2, 3, 4],
|
||||
},
|
||||
index=list("VWXYZ"),
|
||||
)
|
||||
|
||||
# df2 may have different index and columns
|
||||
df2 = DataFrame(
|
||||
{
|
||||
"a": [np.nan, 10, 20, 30, 40],
|
||||
"b": [50, 60, 70, 80, 90],
|
||||
"foo": ["bar"] * 5,
|
||||
},
|
||||
index=list("VWXuZ"),
|
||||
)
|
||||
|
||||
result = df.fillna(df2)
|
||||
|
||||
# only those columns and indices which are shared get filled
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [np.nan, 1, 2, np.nan, 40],
|
||||
"b": [1, 2, 3, np.nan, 90],
|
||||
"c": [np.nan, 1, 2, 3, 4],
|
||||
},
|
||||
index=list("VWXYZ"),
|
||||
)
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_columns(self):
|
||||
df = DataFrame(np.random.randn(10, 10))
|
||||
df.values[:, ::2] = np.nan
|
||||
|
||||
result = df.fillna(method="ffill", axis=1)
|
||||
expected = df.T.fillna(method="pad").T
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
df.insert(6, "foo", 5)
|
||||
result = df.fillna(method="ffill", axis=1)
|
||||
expected = df.astype(float).fillna(method="ffill", axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_fillna_invalid_method(self, float_frame):
|
||||
with pytest.raises(ValueError, match="ffil"):
|
||||
float_frame.fillna(method="ffil")
|
||||
|
||||
def test_fillna_invalid_value(self, float_frame):
|
||||
# list
|
||||
msg = '"value" parameter must be a scalar or dict, but you passed' ' a "{}"'
|
||||
with pytest.raises(TypeError, match=msg.format("list")):
|
||||
float_frame.fillna([1, 2])
|
||||
# tuple
|
||||
with pytest.raises(TypeError, match=msg.format("tuple")):
|
||||
float_frame.fillna((1, 2))
|
||||
# frame with series
|
||||
msg = (
|
||||
'"value" parameter must be a scalar, dict or Series, but you'
|
||||
' passed a "DataFrame"'
|
||||
)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
float_frame.iloc[:, 0].fillna(float_frame)
|
||||
|
||||
def test_fillna_col_reordering(self):
    """ffill must preserve column order even when columns are not sorted."""
    cols = ["COL.{}".format(n) for n in range(5, 0, -1)]  # COL.5 ... COL.1
    values = np.random.rand(20, 5)
    df = DataFrame(index=range(20), columns=cols, data=values)
    filled = df.fillna(method="ffill")
    assert df.columns.tolist() == filled.columns.tolist()
||||
def test_fill_corner(self, float_frame, float_string_frame):
    """Corner cases: filling a mixed float/string frame, and a zero-column frame."""
    # Poke NaN holes into both an object (string) column and a float column
    # of the shared fixture; the fixture is mutated in place here.
    mf = float_string_frame
    mf.loc[mf.index[5:20], "foo"] = np.nan
    mf.loc[mf.index[-10:], "A"] = np.nan

    # A scalar fill must reach the object-dtype column as well.
    filled = float_string_frame.fillna(value=0)
    assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
    del float_string_frame["foo"]

    # Reindexing to an empty column set, then filling, must not raise.
    empty_float = float_frame.reindex(columns=[])

    # TODO(wesm): unused?
    result = empty_float.fillna(value=0)  # noqa
||||
def test_fill_value_when_combine_const(self):
|
||||
# GH12723
|
||||
dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float")
|
||||
df = DataFrame({"foo": dat}, index=range(6))
|
||||
|
||||
exp = df.fillna(0).add(2)
|
||||
res = df.add(2, fill_value=0)
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
|
||||
class TestDataFrameInterpolate:
    """Tests for DataFrame.interpolate: linear default, scipy methods, axes, inplace."""

    def test_interp_basic(self):
        """Default (linear) interpolation fills interior/trailing NaNs; non-numeric columns pass through."""
        df = DataFrame(
            {
                "A": [1, 2, np.nan, 4],
                "B": [1, 4, 9, np.nan],
                "C": [1, 2, 3, 5],
                "D": list("abcd"),
            }
        )
        expected = DataFrame(
            {
                "A": [1.0, 2.0, 3.0, 4.0],
                "B": [1.0, 4.0, 9.0, 9.0],  # trailing NaN is forward-filled
                "C": [1, 2, 3, 5],
                "D": list("abcd"),
            }
        )
        result = df.interpolate()
        assert_frame_equal(result, expected)

        # With "C" as the index the same holes are filled positionally.
        result = df.set_index("C").interpolate()
        expected = df.set_index("C")
        expected.loc[3, "A"] = 3
        expected.loc[5, "B"] = 9
        assert_frame_equal(result, expected)

    def test_interp_bad_method(self):
        """An unrecognized method name raises ValueError."""
        df = DataFrame(
            {
                "A": [1, 2, np.nan, 4],
                "B": [1, 4, 9, np.nan],
                "C": [1, 2, 3, 5],
                "D": list("abcd"),
            }
        )
        with pytest.raises(ValueError):
            df.interpolate(method="not_a_method")

    def test_interp_combo(self):
        """Series interpolation from a frame column, with and without downcast='infer'."""
        df = DataFrame(
            {
                "A": [1.0, 2.0, np.nan, 4.0],
                "B": [1, 4, 9, np.nan],
                "C": [1, 2, 3, 5],
                "D": list("abcd"),
            }
        )

        result = df["A"].interpolate()
        expected = Series([1.0, 2.0, 3.0, 4.0], name="A")
        assert_series_equal(result, expected)

        # downcast="infer" should produce an integer result when exact.
        result = df["A"].interpolate(downcast="infer")
        expected = Series([1, 2, 3, 4], name="A")
        assert_series_equal(result, expected)

    def test_interp_nan_idx(self):
        """method='values' with NaN in the index is not implemented."""
        df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]})
        df = df.set_index("A")
        with pytest.raises(NotImplementedError):
            df.interpolate(method="values")

    @td.skip_if_no_scipy
    def test_interp_various(self):
        """Spot-check each scipy-backed method against precomputed values."""
        df = DataFrame(
            {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
        )
        df = df.set_index("C")
        expected = df.copy()
        result = df.interpolate(method="polynomial", order=1)

        expected.A.loc[3] = 2.66666667
        expected.A.loc[13] = 5.76923076
        assert_frame_equal(result, expected)

        result = df.interpolate(method="cubic")
        # GH #15662.
        expected.A.loc[3] = 2.81547781
        expected.A.loc[13] = 5.52964175
        assert_frame_equal(result, expected)

        result = df.interpolate(method="nearest")
        expected.A.loc[3] = 2
        expected.A.loc[13] = 5
        assert_frame_equal(result, expected, check_dtype=False)

        result = df.interpolate(method="quadratic")
        expected.A.loc[3] = 2.82150771
        expected.A.loc[13] = 6.12648668
        assert_frame_equal(result, expected)

        result = df.interpolate(method="slinear")
        expected.A.loc[3] = 2.66666667
        expected.A.loc[13] = 5.76923077
        assert_frame_equal(result, expected)

        result = df.interpolate(method="zero")
        expected.A.loc[3] = 2.0
        expected.A.loc[13] = 5
        assert_frame_equal(result, expected, check_dtype=False)

    @td.skip_if_no_scipy
    def test_interp_alt_scipy(self):
        """barycentric / krogh / pchip scipy methods, plus downcast='infer'."""
        df = DataFrame(
            {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
        )
        result = df.interpolate(method="barycentric")
        expected = df.copy()
        expected.loc[2, "A"] = 3
        expected.loc[5, "A"] = 6
        assert_frame_equal(result, expected)

        result = df.interpolate(method="barycentric", downcast="infer")
        assert_frame_equal(result, expected.astype(np.int64))

        result = df.interpolate(method="krogh")
        expectedk = df.copy()
        expectedk["A"] = expected["A"]
        assert_frame_equal(result, expectedk)

        # NOTE(review): _skip_if_no_pchip is defined elsewhere in this module
        # (not visible here) — presumably skips when scipy lacks pchip.
        _skip_if_no_pchip()

        result = df.interpolate(method="pchip")
        expected.loc[2, "A"] = 3
        expected.loc[5, "A"] = 6.0

        assert_frame_equal(result, expected)

    def test_interp_rowwise(self):
        """axis=1 interpolation fills across each row; axis=0 equals the default."""
        df = DataFrame(
            {
                0: [1, 2, np.nan, 4],
                1: [2, 3, 4, np.nan],
                2: [np.nan, 4, 5, 6],
                3: [4, np.nan, 6, 7],
                4: [1, 2, 3, 4],
            }
        )
        result = df.interpolate(axis=1)
        expected = df.copy()
        expected.loc[3, 1] = 5
        expected.loc[0, 2] = 3
        expected.loc[1, 3] = 3
        # column 4 has no NaNs but is upcast to float by the row-wise pass
        expected[4] = expected[4].astype(np.float64)
        assert_frame_equal(result, expected)

        result = df.interpolate(axis=1, method="values")
        assert_frame_equal(result, expected)

        result = df.interpolate(axis=0)
        expected = df.interpolate()
        assert_frame_equal(result, expected)

    def test_rowwise_alt(self):
        """Smoke test: axis=0 interpolation on uneven data must not raise."""
        df = DataFrame(
            {
                0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64],
                1: [1, 2, 3, 4, 3, 2, 1, 0, -1],
            }
        )
        df.interpolate(axis=0)

    @pytest.mark.parametrize(
        "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)]
    )
    def test_interp_leading_nans(self, check_scipy):
        """Leading NaNs are left untouched by interpolation."""
        df = DataFrame(
            {"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]}
        )
        result = df.interpolate()
        expected = df.copy()
        # chained assignment on a fresh copy — mutates `expected` as intended here
        expected["B"].loc[3] = -3.75
        assert_frame_equal(result, expected)

        if check_scipy:
            result = df.interpolate(method="polynomial", order=1)
            assert_frame_equal(result, expected)

    def test_interp_raise_on_only_mixed(self):
        """Row-wise interpolation over mixed dtypes raises TypeError."""
        df = DataFrame(
            {
                "A": [1, 2, np.nan, 4],
                "B": ["a", "b", "c", "d"],
                "C": [np.nan, 2, 5, 7],
                "D": [np.nan, np.nan, 9, 9],
                "E": [1, 2, 3, 4],
            }
        )
        with pytest.raises(TypeError):
            df.interpolate(axis=1)

    def test_interp_raise_on_all_object_dtype(self):
        """All-object frames cannot be interpolated (GH 22985)."""
        # GH 22985
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object")
        msg = (
            "Cannot interpolate with all object-dtype columns "
            "in the DataFrame. Try setting at least one "
            "column to a numeric dtype."
        )
        with pytest.raises(TypeError, match=msg):
            df.interpolate()

    def test_interp_inplace(self):
        """inplace=True on a column view writes back into the parent frame."""
        df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]})
        expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
        result = df.copy()
        result["a"].interpolate(inplace=True)
        assert_frame_equal(result, expected)

        result = df.copy()
        result["a"].interpolate(inplace=True, downcast="infer")
        assert_frame_equal(result, expected.astype("int64"))

    def test_interp_inplace_row(self):
        """inplace row-wise interpolation matches the non-inplace result (GH 10395)."""
        # GH 10395
        result = DataFrame(
            {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]}
        )
        expected = result.interpolate(method="linear", axis=1, inplace=False)
        result.interpolate(method="linear", axis=1, inplace=True)
        assert_frame_equal(result, expected)

    def test_interp_ignore_all_good(self):
        """Columns without NaNs keep their dtype when downcast=None."""
        # GH
        df = DataFrame(
            {
                "A": [1, 2, np.nan, 4],
                "B": [1, 2, 3, 4],
                "C": [1.0, 2.0, np.nan, 4.0],
                "D": [1.0, 2.0, 3.0, 4.0],
            }
        )
        expected = DataFrame(
            {
                "A": np.array([1, 2, 3, 4], dtype="float64"),
                "B": np.array([1, 2, 3, 4], dtype="int64"),
                "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"),
                "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"),
            }
        )

        result = df.interpolate(downcast=None)
        assert_frame_equal(result, expected)

        # all good
        result = df[["B", "D"]].interpolate(downcast=None)
        assert_frame_equal(result, df[["B", "D"]])
@@ -0,0 +1,287 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PY36
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex, Series
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
# Column add, remove, delete.
|
||||
|
||||
|
||||
class TestDataFrameMutateColumns:
    """Tests for column mutation: assign, insert, __delitem__, pop."""

    def test_assign(self):
        """assign returns a new frame; callables and array-likes; overwrite semantics."""
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        original = df.copy()
        result = df.assign(C=df.B / df.A)
        expected = df.copy()
        expected["C"] = [4, 2.5, 2]
        assert_frame_equal(result, expected)

        # lambda syntax
        result = df.assign(C=lambda x: x.B / x.A)
        assert_frame_equal(result, expected)

        # original is unmodified
        assert_frame_equal(df, original)

        # Non-Series array-like
        result = df.assign(C=[4, 2.5, 2])
        assert_frame_equal(result, expected)
        # original is unmodified
        assert_frame_equal(df, original)

        # overwriting B keeps column position, so the expected frame is
        # the C-version with "B" dropped and "C" renamed to "B"
        result = df.assign(B=df.B / df.A)
        expected = expected.drop("B", axis=1).rename(columns={"C": "B"})
        assert_frame_equal(result, expected)

        # overwrite
        result = df.assign(A=df.A + df.B)
        expected = df.copy()
        expected["A"] = [5, 7, 9]
        assert_frame_equal(result, expected)

        # lambda
        result = df.assign(A=lambda x: x.A + x.B)
        assert_frame_equal(result, expected)

    def test_assign_multiple(self):
        """Multiple kwargs in one assign call, mixing values and callables."""
        df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"])
        result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
        expected = DataFrame(
            [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE")
        )
        assert_frame_equal(result, expected)

    def test_assign_order(self):
        """Column order follows kwarg order on 3.6+, sorted order before (GH 9818)."""
        # GH 9818
        df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
        result = df.assign(D=df.A + df.B, C=df.A - df.B)

        if PY36:
            expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC"))
        else:
            expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
        assert_frame_equal(result, expected)
        result = df.assign(C=df.A - df.B, D=df.A + df.B)

        expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))

        assert_frame_equal(result, expected)

    def test_assign_bad(self):
        """Positional args and references to not-yet-existing columns fail."""
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        # non-keyword argument
        with pytest.raises(TypeError):
            df.assign(lambda x: x.A)
        # df.C does not exist when the kwargs are evaluated
        with pytest.raises(AttributeError):
            df.assign(C=df.A, D=df.A + df.C)

    @pytest.mark.skipif(
        PY36,
        reason="""Issue #14207: valid for python
        3.6 and above""",
    )
    def test_assign_dependent_old_python(self):
        """Pre-3.6: kwarg order is not preserved, so dependent assigns raise."""
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        # Key C does not exist at definition time of df
        with pytest.raises(KeyError, match="^'C'$"):
            df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
        with pytest.raises(KeyError, match="^'C'$"):
            df.assign(C=df.A, D=lambda x: x["A"] + x["C"])

    @pytest.mark.skipif(
        not PY36,
        reason="""Issue #14207: not valid for
        python 3.5 and below""",
    )
    def test_assign_dependent(self):
        """3.6+: later kwargs may reference columns created by earlier ones."""
        df = DataFrame({"A": [1, 2], "B": [3, 4]})

        result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"])
        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
        assert_frame_equal(result, expected)

        result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
        assert_frame_equal(result, expected)

    def test_insert_error_msmgs(self):
        """Descriptive errors when the inserted column's index cannot align."""
        # GH 7432
        df = DataFrame(
            {"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": ["d", "e", "f"]}
        ).set_index("foo")
        s = DataFrame(
            {"foo": ["a", "b", "c", "a"], "fiz": ["g", "h", "i", "j"]}
        ).set_index("foo")
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df["newcol"] = s

        # GH 4107, more descriptive error message
        df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"])

        msg = "incompatible index of inserted column with frame index"
        with pytest.raises(TypeError, match=msg):
            df["gr"] = df.groupby(["b", "c"]).count()

    def test_insert_benchmark(self):
        """Repeatedly inserting the same column builds the expected frame."""
        # from the vb_suite/frame_methods/frame_insert_columns
        N = 10
        K = 5
        df = DataFrame(index=range(N))
        new_col = np.random.randn(N)
        for i in range(K):
            df[i] = new_col
        expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N))
        assert_frame_equal(df, expected)

    def test_insert(self):
        """insert position/dtype handling, duplicate-name errors, columns.name, GH 13522."""
        df = DataFrame(
            np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
        )

        df.insert(0, "foo", df["a"])
        tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"]))
        tm.assert_series_equal(df["a"], df["foo"], check_names=False)

        df.insert(2, "bar", df["c"])
        tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"]))
        tm.assert_almost_equal(df["c"], df["bar"], check_names=False)

        # diff dtype

        # new item
        df["x"] = df["a"].astype("float32")
        result = df.dtypes
        expected = Series(
            [np.dtype("float64")] * 5 + [np.dtype("float32")],
            index=["foo", "c", "bar", "b", "a", "x"],
        )
        tm.assert_series_equal(result, expected)

        # replacing current (in different block)
        df["a"] = df["a"].astype("float32")
        result = df.dtypes
        expected = Series(
            [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2,
            index=["foo", "c", "bar", "b", "a", "x"],
        )
        tm.assert_series_equal(result, expected)

        df["y"] = df["a"].astype("int32")
        result = df.dtypes
        expected = Series(
            [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")],
            index=["foo", "c", "bar", "b", "a", "x", "y"],
        )
        tm.assert_series_equal(result, expected)

        with pytest.raises(ValueError, match="already exists"):
            df.insert(1, "a", df["b"])
        msg = "cannot insert c, already exists"
        with pytest.raises(ValueError, match=msg):
            df.insert(1, "c", df["b"])

        df.columns.name = "some_name"
        # preserve columns name field
        df.insert(0, "baz", df["c"])
        assert df.columns.name == "some_name"

        # GH 13522
        df = DataFrame(index=["A", "B", "C"])
        df["X"] = df.index
        df["X"] = ["x", "y", "z"]
        exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"])
        assert_frame_equal(df, exp)

    def test_delitem(self, float_frame):
        """del removes the column from the frame."""
        del float_frame["A"]
        assert "A" not in float_frame

    def test_delitem_multiindex(self):
        """Deleting a top level of a MultiIndex column; re-deleting raises KeyError."""
        midx = MultiIndex.from_product([["A", "B"], [1, 2]])
        df = DataFrame(np.random.randn(4, 4), columns=midx)
        assert len(df.columns) == 4
        assert ("A",) in df.columns
        assert "A" in df.columns

        result = df["A"]
        assert isinstance(result, DataFrame)
        del df["A"]

        assert len(df.columns) == 2

        # A still in the levels, BUT get a KeyError if trying
        # to delete
        assert ("A",) not in df.columns
        with pytest.raises(KeyError, match=re.escape("('A',)")):
            del df[("A",)]

        # behavior of dropped/deleted MultiIndex levels changed from
        # GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
        # levels which are dropped/deleted
        assert "A" not in df.columns
        with pytest.raises(KeyError, match=re.escape("('A',)")):
            del df["A"]

    def test_pop(self, float_frame):
        """pop removes and returns a column; the returned Series is a copy (gh-10912)."""
        float_frame.columns.name = "baz"

        float_frame.pop("A")
        assert "A" not in float_frame

        float_frame["foo"] = "bar"
        float_frame.pop("foo")
        assert "foo" not in float_frame
        assert float_frame.columns.name == "baz"

        # gh-10912: inplace ops cause caching issue
        a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"])
        b = a.pop("B")
        b += 1

        # original frame
        expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"])
        tm.assert_frame_equal(a, expected)

        # result
        expected = Series([2, 5], index=["X", "Y"], name="B") + 1
        tm.assert_series_equal(b, expected)

    def test_pop_non_unique_cols(self):
        """pop of a duplicated label returns a DataFrame of all matching columns."""
        df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
        df.columns = ["a", "b", "a"]

        res = df.pop("a")
        assert type(res) == DataFrame
        assert len(res) == 2
        assert len(df.columns) == 1
        assert "b" in df.columns
        assert "a" not in df.columns
        assert len(df.index) == 2

    def test_insert_column_bug_4032(self):
        """insert followed by (no-op) rename must not corrupt the frame (GH4032)."""
        # GH4032, inserting a column and renaming causing errors
        df = DataFrame({"b": [1.1, 2.2]})
        df = df.rename(columns={})
        df.insert(0, "a", [1, 2])

        result = df.rename(columns={})
        str(result)  # repr must not raise
        expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"])
        assert_frame_equal(result, expected)
        df.insert(0, "c", [1.3, 2.3])

        result = df.rename(columns={})
        str(result)  # repr must not raise

        expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"])
        assert_frame_equal(result, expected)
@@ -0,0 +1,528 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, MultiIndex, Series, date_range
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameNonuniqueIndexes(TestData):
|
||||
    def test_column_dups_operations(self):
        """Assignment, insert, delete, rename, reindex, drop, describe and
        arithmetic on frames with duplicated column labels."""

        def check(result, expected=None):
            # Compare when an expected frame is given, then smoke-test that
            # dtypes and repr work on the (possibly duplicated-label) result.
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = list(range(2))
        df = DataFrame(arr, columns=["A", "A"])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range("20130101", periods=4, freq="Q-NOV")
        df = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
        )
        df.columns = idx
        expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
            columns=["foo", "bar", "foo", "hello"],
        )
        df["string"] = "bah"
        expected = DataFrame(
            [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
            columns=["foo", "bar", "foo", "hello", "string"],
        )
        check(df, expected)
        with pytest.raises(ValueError, match="Length of value"):
            df.insert(0, "AnotherColumn", range(len(df.index) - 1))

        # insert same dtype
        df["foo2"] = 3
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # set (non-dup)
        df["foo2"] = 4
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)
        df["foo2"] = 3

        # delete (non dup)
        del df["bar"]
        expected = DataFrame(
            [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
            columns=["foo", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # try to delete again (its not consolidated)
        del df["hello"]
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
            columns=["foo", "foo", "string", "foo2"],
        )
        check(df, expected)

        # consolidate
        df = df._consolidate()
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
            columns=["foo", "foo", "string", "foo2"],
        )
        check(df, expected)

        # insert
        df.insert(2, "new_col", 5.0)
        expected = DataFrame(
            [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
            columns=["foo", "foo", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # insert a dup
        with pytest.raises(ValueError, match="cannot insert"):
            df.insert(2, "new_col", 4.0)

        df.insert(2, "new_col", 4.0, allow_duplicates=True)
        expected = DataFrame(
            [
                [1, 1, 4.0, 5.0, "bah", 3],
                [1, 2, 4.0, 5.0, "bah", 3],
                [2, 3, 4.0, 5.0, "bah", 3],
            ],
            columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # delete (dup)
        del df["foo"]
        expected = DataFrame(
            [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
            columns=["new_col", "new_col", "string", "foo2"],
        )
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame(
            [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
            columns=["foo", "bar", "foo", "hello"],
        )
        check(df)

        df["foo2"] = 7.0
        expected = DataFrame(
            [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        # selecting a duplicated label returns a frame of all matches
        result = df["foo"]
        expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
        check(result, expected)

        # multiple replacements
        df["foo"] = "string"
        expected = DataFrame(
            [
                ["string", 1, "string", 5, 7.0],
                ["string", 1, "string", 5, 7.0],
                ["string", 1, "string", 5, 7.0],
            ],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        del df["foo"]
        expected = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
        )
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        assert (result == expected).all().all()

        # rename, GH 4403
        df4 = DataFrame(
            {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
            index=MultiIndex.from_tuples(
                [(600809, 20130331)], names=["STK_ID", "RPT_Date"]
            ),
        )

        df5 = DataFrame(
            {
                "RPT_Date": [20120930, 20121231, 20130331],
                "STK_ID": [600809] * 3,
                "STK_Name": ["饡驦", "饡驦", "饡驦"],
                "TClose": [38.05, 41.66, 30.01],
            },
            index=MultiIndex.from_tuples(
                [(600809, 20120930), (600809, 20121231), (600809, 20130331)],
                names=["STK_ID", "RPT_Date"],
            ),
        )

        # merge suffixes the duplicated TClose columns; rename restores them
        k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
        result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
        str(result)
        result.dtypes

        expected = DataFrame(
            [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
            columns=[
                "RT",
                "TClose",
                "TExg",
                "RPT_Date",
                "STK_ID",
                "STK_Name",
                "QT_Close",
            ],
        ).set_index(["STK_ID", "RPT_Date"], drop=False)
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
        )
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=["bar"])
        with pytest.raises(ValueError, match=msg):
            df.reindex(columns=["bar", "foo"])

        # drop
        df = DataFrame(
            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
        )
        result = df.drop(["a"], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=["bar"])
        check(result, expected)
        result = df.drop("a", axis=1)
        check(result, expected)

        # describe
        df = DataFrame(
            [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
            columns=["bar", "a", "a"],
            dtype="float64",
        )
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(
            np.random.randn(5, 3),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "A"],
        )
        for index in [df.index, pd.Index(list("edcba"))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            # NOTE(review): the dict literal repeats key "A", so only one
            # entry survives; the explicit columns list re-expands it to
            # two identical "A" columns.
            expected_df = DataFrame(
                {"A": expected_ser, "B": this_df["B"], "A": expected_ser},
                columns=["A", "B", "A"],
            )
            this_df["A"] = index
            check(this_df, expected_df)

        # operations
        for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ["A", "A"]
            df.columns = ["A", "A"]
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
        expected = DataFrame(1.0, index=range(5), columns=["that", "that"])

        df["that"] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
        expected = DataFrame(1, index=range(5), columns=["that", "that"])

        df["that"] = 1
        check(df, expected)
||||
    def test_column_dups2(self):
        """drop and dropna on frames whose columns contain duplicate labels."""

        # drop buggy GH 6240
        df = DataFrame(
            {
                "A": np.random.randn(5),
                "B": np.random.randn(5),
                "C": np.random.randn(5),
                "D": ["a", "b", "c", "d", "e"],
            }
        )

        # take duplicates columns, then dropping "C" should leave A, B, B
        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop("C", axis=1)
        assert_frame_equal(result, expected)

        # dropna
        df = DataFrame(
            {
                "A": np.random.randn(5),
                "B": np.random.randn(5),
                "C": np.random.randn(5),
                "D": ["a", "b", "c", "d", "e"],
            }
        )
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        # compute expectation on unique labels, then relabel with a dup
        expected = df.dropna(subset=["A", "B", "C"], how="all")
        expected.columns = ["A", "A", "B", "C"]

        df.columns = ["A", "A", "B", "C"]

        # subset=["A", "C"] now covers both "A" columns plus "C"
        result = df.dropna(subset=["A", "C"], how="all")
        assert_frame_equal(result, expected)
||||
    def test_column_dups_indexing(self):
        """Boolean indexing, where, alignment, equality and .loc with duplicate labels."""

        def check(result, expected=None):
            # Compare when expected is given; always smoke-test dtypes/repr.
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # boolean indexing
        # GH 4879
        dups = ["A", "A", "C", "D"]
        df = DataFrame(
            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
        )
        expected = df[df.C > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
        result = df[df.C > 6]
        check(result, expected)

        # where
        df = DataFrame(
            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
        )
        expected = df[df > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
        result = df[df > 6]
        check(result, expected)

        # boolean with the duplicate raises
        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df[df.A > 6]

        # dup aligning operations should work
        # GH 5185
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        assert_frame_equal(result, expected)

        # equality
        df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
        df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])

        # not-comparing like-labelled
        msg = "Can only compare identically-labeled DataFrame objects"
        with pytest.raises(ValueError, match=msg):
            df1 == df2

        df1r = df1.reindex_like(df2)
        result = df1r == df2
        expected = DataFrame(
            [[False, True], [True, False], [False, False], [True, False]],
            columns=["A", "A"],
        )
        assert_frame_equal(result, expected)

        # mixed column selection
        # GH 5639
        dfbool = DataFrame(
            {
                "one": Series([True, True, False], index=["a", "b", "c"]),
                "two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
                "three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
            }
        )
        expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
        result = dfbool[["one", "three", "one"]]
        check(result, expected)

        # multi-axis dups
        # GH 6121
        df = DataFrame(
            np.arange(25.0).reshape(5, 5),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "C", "D", "E"],
        )
        z = df[["A", "C", "A"]].copy()
        expected = z.loc[["a", "c", "a"]]

        # same selection without the explicit copy must give the same answer
        df = DataFrame(
            np.arange(25.0).reshape(5, 5),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "C", "D", "E"],
        )
        z = df[["A", "C", "A"]]
        result = z.loc[["a", "c", "a"]]
        check(result, expected)
||||
def test_column_dups_indexing2(self):
    """Arithmetic between columns must align correctly on a non-unique index."""
    # GH 8363: datetime ops with a non-unique index
    dup_index = [2, 2, 3, 3, 4]

    df = DataFrame(
        {"A": np.arange(5, dtype="int64"), "B": np.arange(1, 6, dtype="int64")},
        index=dup_index,
    )
    # B is always A + 1, so the difference is a constant 1
    assert_series_equal(df.B - df.A, Series(1, index=dup_index))

    df = DataFrame(
        {
            "A": date_range("20130101", periods=5),
            "B": date_range("20130101 09:00:00", periods=5),
        },
        index=dup_index,
    )
    # the two datetime columns differ by a fixed 9-hour offset
    assert_series_equal(
        df.B - df.A, Series(pd.Timedelta("9 hours"), index=dup_index)
    )
def test_columns_with_dups(self):
    """Frames with duplicate column labels rename, repr and index correctly."""
    # GH 3468 related

    # simple renames of duplicated labels (incl. renaming onto a new dup pair)
    rename_cases = [
        ([[1, 2]], ["a", "a"], ["a", "a.1"]),
        ([[1, 2, 3]], ["b", "a", "a"], ["b", "a", "a.1"]),
        ([[1, 2]], ["a", "a"], ["b", "b"]),
    ]
    for data, dup_cols, new_cols in rename_cases:
        frame = DataFrame(data, columns=dup_cols)
        frame.columns = new_cols
        str(frame)  # repr must not blow up
        assert_frame_equal(frame, DataFrame(data, columns=new_cols))

    # multi-dtype frame with several duplicated labels
    frame = DataFrame(
        [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
        columns=["a", "a", "b", "b", "d", "c", "c"],
    )
    frame.columns = list("ABCDEFG")
    str(frame)
    assert_frame_equal(
        frame,
        DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG")),
    )

    # every label duplicated
    frame = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"])
    frame.columns = ["a", "a.1", "a.2", "a.3"]
    str(frame)
    assert_frame_equal(
        frame,
        DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"]),
    )

    # dups across blocks
    df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
    df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
    df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
    df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
    df_dt = DataFrame(
        pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns
    )
    df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

    # internal block bookkeeping must track one entry per column
    assert len(df._data._blknos) == len(df.columns)
    assert len(df._data._blklocs) == len(df.columns)

    # every positional column is reachable via iloc
    for pos in range(len(df.columns)):
        df.iloc[:, pos]

    # dup columns across dtype GH 2079/2194
    vals = [[1, -1, 2.0], [2, -2, 3.0]]
    rs = DataFrame(vals, columns=["A", "A", "B"])
    xp = DataFrame(vals)
    xp.columns = ["A", "A", "B"]
    assert_frame_equal(rs, xp)
def test_values_duplicates(self):
    """`.values` on a frame with duplicate labels returns every column once."""
    df = DataFrame(
        [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"]
    )

    # mixed dtypes coerce to a single object ndarray, duplicates preserved
    expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
    tm.assert_numpy_array_equal(df.values, expected)
def test_set_value_by_index(self):
    """Setting one duplicated column positionally must not disturb its twins."""
    # See gh-12344
    df = DataFrame(np.arange(9).reshape(3, 3).T)
    df.columns = list("AAA")
    untouched = df.iloc[:, 2]

    df.iloc[:, 0] = 3
    assert_series_equal(df.iloc[:, 2], untouched)

    # same check when labels are equal after hashing but of mixed type
    df = DataFrame(np.arange(9).reshape(3, 3).T)
    df.columns = [2, float(2), str(2)]
    untouched = df.iloc[:, 1]

    df.iloc[:, 0] = 3
    assert_series_equal(df.iloc[:, 1], untouched)
def test_insert_with_columns_dups(self):
    """`insert(..., allow_duplicates=True)` stacks same-named columns in order."""
    # GH 14291
    df = pd.DataFrame()
    # each insert at position 0 pushes the previous "A" columns right
    for values in (["g", "h", "i"], ["d", "e", "f"], ["a", "b", "c"]):
        df.insert(0, "A", values, allow_duplicates=True)
    exp = pd.DataFrame(
        [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
    )
    assert_frame_equal(df, exp)
@@ -0,0 +1,884 @@
|
||||
from decimal import Decimal
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, MultiIndex, Series
|
||||
import pandas.core.common as com
|
||||
from pandas.tests.frame.common import _check_mixed_float
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_frame_equal,
|
||||
assert_numpy_array_equal,
|
||||
assert_series_equal,
|
||||
)
|
||||
|
||||
|
||||
class TestDataFrameUnaryOperators:
    """Tests for the DataFrame unary operators __pos__, __neg__, __invert__."""

    @pytest.mark.parametrize(
        "df,expected",
        [
            (pd.DataFrame({"a": [-1, 1]}), pd.DataFrame({"a": [1, -1]})),
            (pd.DataFrame({"a": [False, True]}), pd.DataFrame({"a": [True, False]})),
            (
                pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}),
                pd.DataFrame({"a": pd.Series(pd.to_timedelta([1, -1]))}),
            ),
        ],
    )
    def test_neg_numeric(self, df, expected):
        # negation flips sign for numeric, bool and timedelta data
        assert_frame_equal(-df, expected)
        assert_series_equal(-df["a"], expected["a"])

    @pytest.mark.parametrize(
        "df, expected",
        [
            (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
            ([Decimal("1.0"), Decimal("2.0")], [Decimal("-1.0"), Decimal("-2.0")]),
        ],
    )
    def test_neg_object(self, df, expected):
        # GH#21380: negation of object-dtype columns applies elementwise
        df = pd.DataFrame({"a": df})
        expected = pd.DataFrame({"a": expected})
        assert_frame_equal(-df, expected)
        assert_series_equal(-df["a"], expected["a"])

    @pytest.mark.parametrize(
        "df",
        [
            pd.DataFrame({"a": ["a", "b"]}),
            pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}),
        ],
    )
    def test_neg_raises(self, df):
        # strings and datetimes have no meaningful negation
        with pytest.raises(TypeError):
            (-df)
        with pytest.raises(TypeError):
            (-df["a"])

    def test_invert(self, float_frame):
        frame = float_frame
        # on a boolean frame, arithmetic negation and bitwise invert agree
        assert_frame_equal(-(frame < 0), ~(frame < 0))

    @pytest.mark.parametrize(
        "df",
        [
            pd.DataFrame({"a": [-1, 1]}),
            pd.DataFrame({"a": [False, True]}),
            pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}),
        ],
    )
    def test_pos_numeric(self, df):
        # GH#16073: unary plus is a no-op for numeric-like data
        assert_frame_equal(+df, df)
        assert_series_equal(+df["a"], df["a"])

    @pytest.mark.parametrize(
        "df",
        [
            # numpy changing behavior in the future
            pytest.param(
                pd.DataFrame({"a": ["a", "b"]}),
                marks=[pytest.mark.filterwarnings("ignore")],
            ),
            pd.DataFrame({"a": np.array([-1, 2], dtype=object)}),
            pd.DataFrame({"a": [Decimal("-1.0"), Decimal("2.0")]}),
        ],
    )
    def test_pos_object(self, df):
        # GH#21380: unary plus passes object columns through unchanged
        assert_frame_equal(+df, df)
        assert_series_equal(+df["a"], df["a"])

    @pytest.mark.parametrize(
        "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})]
    )
    def test_pos_raises(self, df):
        # unary plus on datetimes is invalid
        with pytest.raises(TypeError):
            (+df)
        with pytest.raises(TypeError):
            (+df["a"])
||||
class TestDataFrameLogicalOperators:
    """Tests for the DataFrame logical operators &, | and ^."""

    def test_logical_ops_empty_frame(self):
        # GH#5808: empty frames, non-mixed dtype
        empty = DataFrame(index=[1])

        assert_frame_equal(empty & empty, empty)
        assert_frame_equal(empty | empty, empty)

        # alignment against a longer empty frame follows the union index
        longer = DataFrame(index=[1, 2])
        assert_frame_equal(empty & longer, longer)

        one_col = DataFrame(index=[1], columns=["A"])
        assert_frame_equal(one_col & one_col, one_col)

    def test_logical_ops_bool_frame(self):
        # GH#5808: all-bool frames round-trip through & and |
        all_true = DataFrame(True, index=[1], columns=["A"])

        assert_frame_equal(all_true & all_true, all_true)
        assert_frame_equal(all_true | all_true, all_true)

    def test_logical_ops_int_frame(self):
        # GH#5808: int frame | bool frame keeps the int result
        ones = DataFrame(1, index=[1], columns=["A"])
        trues = DataFrame(True, index=[1], columns=["A"])

        assert_frame_equal(ones | trues, ones)

    def test_logical_ops_invalid(self):
        # GH#5808: float | bool and str | bool are type errors
        df1 = DataFrame(1.0, index=[1], columns=["A"])
        df2 = DataFrame(True, index=[1], columns=["A"])
        with pytest.raises(TypeError):
            df1 | df2

        df1 = DataFrame("foo", index=[1], columns=["A"])
        df2 = DataFrame(True, index=[1], columns=["A"])
        with pytest.raises(TypeError):
            df1 | df2

    def test_logical_operators(self):
        def assert_bin_op(op):
            # frame-level result matches the op applied to the raw ndarrays
            result = op(df1, df2)
            expected = DataFrame(
                op(df1.values, df2.values), index=df1.index, columns=df1.columns
            )
            assert result.values.dtype == np.bool_
            assert_frame_equal(result, expected)

        def assert_unary_op(op):
            result = op(df1)
            expected = DataFrame(op(df1.values), index=df1.index, columns=df1.columns)
            assert result.values.dtype == np.bool_
            assert_frame_equal(result, expected)

        df1 = DataFrame(
            {
                "a": {"a": True, "b": False, "c": False, "d": True, "e": True},
                "b": {"a": False, "b": True, "c": False, "d": False, "e": False},
                "c": {"a": False, "b": False, "c": True, "d": False, "e": False},
                "d": {"a": True, "b": False, "c": False, "d": True, "e": True},
                "e": {"a": True, "b": False, "c": False, "d": True, "e": True},
            }
        )
        df2 = DataFrame(
            {
                "a": {"a": True, "b": False, "c": True, "d": False, "e": False},
                "b": {"a": False, "b": True, "c": False, "d": False, "e": False},
                "c": {"a": True, "b": False, "c": True, "d": False, "e": False},
                "d": {"a": False, "b": False, "c": False, "d": True, "e": False},
                "e": {"a": False, "b": False, "c": False, "d": False, "e": True},
            }
        )

        assert_bin_op(operator.and_)
        assert_bin_op(operator.or_)
        assert_bin_op(operator.xor)

        assert_unary_op(operator.inv)  # TODO: belongs elsewhere

    def test_logical_with_nas(self):
        d = DataFrame({"a": [np.nan, False], "b": [True, True]})

        # GH4947: bool comparisons should return bool
        result = d["a"] | d["b"]
        assert_series_equal(result, Series([False, True]))

        # GH4604: automatic casting here
        result = d["a"].fillna(False) | d["b"]
        assert_series_equal(result, Series([True, True]))

        result = d["a"].fillna(False, downcast=False) | d["b"]
        assert_series_equal(result, Series([True, True]))
||||
class TestDataFrameOperators:
    """Tests for binary arithmetic/comparison operators on DataFrame."""

    @pytest.mark.parametrize(
        "op", [operator.add, operator.sub, operator.mul, operator.truediv]
    )
    def test_operators_none_as_na(self, op):
        # None entries in object columns behave as NA under arithmetic
        df = DataFrame(
            {"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object
        )

        # since filling converts dtypes from object, changed expected to be
        # object
        filled = df.fillna(np.nan)

        result = op(df, 3)
        expected = op(filled, 3).astype(object)
        expected[com.isna(expected)] = None
        assert_frame_equal(result, expected)

        result = op(df, df)
        expected = op(filled, filled).astype(object)
        expected[com.isna(expected)] = None
        assert_frame_equal(result, expected)

        # NA entries propagate regardless of which side holds them
        assert_frame_equal(op(df, df.fillna(7)), expected)
        assert_frame_equal(op(df.fillna(7), df), expected, check_dtype=False)
||||
@pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)])
# TODO: not sure what's correct here.
@pytest.mark.filterwarnings("ignore:elementwise:FutureWarning")
def test_logical_typeerror_with_non_valid(self, op, res, float_frame):
    """Comparing a float frame against a string yields all-False/all-True."""
    # we are comparing floats vs a string
    outcome = getattr(float_frame, op)("foo")
    assert bool(outcome.all().all()) is res
||||
def test_binary_ops_align(self):
    """Binary ops with `level=` align on the requested MultiIndex level."""
    # GH 6681
    index = MultiIndex.from_product(
        [list("abc"), ["one", "two", "three"], [1, 2, 3]],
        names=["first", "second", "third"],
    )

    df = DataFrame(
        np.arange(27 * 3).reshape(27, 3),
        index=index,
        columns=["value1", "value2", "value3"],
    ).sort_index()

    slc = pd.IndexSlice
    for op in ["add", "sub", "mul", "div", "truediv"]:
        bound_op = getattr(operator, op, None)
        if bound_op is None:
            # "div" has no operator-module counterpart here; skip it
            continue

        # align a 3-element series on the innermost level
        x = Series([1.0, 10.0, 100.0], [1, 2, 3])
        result = getattr(df, op)(x, level="third", axis=0)

        expected = pd.concat(
            [bound_op(df.loc[slc[:, :, i], :], v) for i, v in x.items()]
        ).sort_index()
        assert_frame_equal(result, expected)

        # align a 2-element series on the middle level
        x = Series([1.0, 10.0], ["two", "three"])
        result = getattr(df, op)(x, level="second", axis=0)

        expected = (
            pd.concat([bound_op(df.loc[slc[:, i], :], v) for i, v in x.items()])
            .reindex_like(df)
            .sort_index()
        )
        assert_frame_equal(result, expected)

    # GH9463 (alignment level of dataframe with series)
    midx = MultiIndex.from_product([["A", "B"], ["a", "b"]])
    df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx)
    s = pd.Series({"a": 1, "b": 2})

    df2 = df.copy()
    df2.columns.names = ["lvl0", "lvl1"]
    s2 = s.copy()
    s2.index.name = "lvl1"

    # different cases of integer/string level names:
    res1 = df.mul(s, axis=1, level=1)
    res2 = df.mul(s2, axis=1, level=1)
    res3 = df2.mul(s, axis=1, level=1)
    res4 = df2.mul(s2, axis=1, level=1)
    res5 = df2.mul(s, axis=1, level="lvl1")
    res6 = df2.mul(s2, axis=1, level="lvl1")

    exp = DataFrame(
        np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx
    )

    for res in [res1, res2]:
        assert_frame_equal(res, exp)

    exp.columns.names = ["lvl0", "lvl1"]
    for res in [res3, res4, res5, res6]:
        assert_frame_equal(res, exp)
||||
def test_dti_tz_convert_to_utc(self):
    """Addition aligns tz-aware indexes by their UTC instants."""
    base = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC")
    idx1 = base.tz_convert("Asia/Tokyo")[:2]
    idx2 = base.tz_convert("US/Eastern")[1:]

    left = DataFrame({"A": [1, 2]}, index=idx1)
    right = DataFrame({"A": [1, 1]}, index=idx2)
    # only the middle instant overlaps; the others become NaN
    exp = DataFrame({"A": [np.nan, 3, np.nan]}, index=base)
    assert_frame_equal(left + right, exp)
||||
def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame):
    """Frame + frame arithmetic aligns index/columns and upcasts dtypes.

    Bug fix: the original used chained indexing
    (``frame_copy["C"][:5] = np.nan``), which writes to an intermediate
    object and is not guaranteed to modify ``frame_copy`` (SettingWithCopy /
    copy-on-write semantics). Use a single ``.loc`` call instead.
    """
    frame_copy = float_frame.reindex(float_frame.index[::2])

    del frame_copy["D"]
    # single .loc assignment instead of chained indexing
    frame_copy.loc[frame_copy.index[:5], "C"] = np.nan

    added = float_frame + frame_copy

    indexer = added["A"].dropna().index
    exp = (float_frame["A"] * 2).copy()

    tm.assert_series_equal(added["A"].dropna(), exp.loc[indexer])

    exp.loc[~exp.index.isin(indexer)] = np.nan
    tm.assert_series_equal(added["A"], exp.loc[added["A"].index])

    # the NaN-filled slice of "C" stays NaN after addition
    assert np.isnan(added["C"].reindex(frame_copy.index)[:5]).all()

    # "D" exists only on one side, so the aligned sum is all-NaN
    assert np.isnan(added["D"]).all()

    self_added = float_frame + float_frame
    tm.assert_index_equal(self_added.index, float_frame.index)

    added_rev = frame_copy + float_frame
    assert np.isnan(added["D"]).all()
    assert np.isnan(added_rev["D"]).all()

    # corner cases

    # empty
    plus_empty = float_frame + DataFrame()
    assert np.isnan(plus_empty.values).all()

    empty_plus = DataFrame() + float_frame
    assert np.isnan(empty_plus.values).all()

    empty_empty = DataFrame() + DataFrame()
    assert empty_empty.empty

    # out of order columns still align by label
    reverse = float_frame.reindex(columns=float_frame.columns[::-1])

    assert_frame_equal(reverse + float_frame, float_frame * 2)

    # mix vs float64, upcast
    added = float_frame + mixed_float_frame
    _check_mixed_float(added, dtype="float64")
    added = mixed_float_frame + float_frame
    _check_mixed_float(added, dtype="float64")

    # mix vs mix
    added = mixed_float_frame + mixed_float_frame
    _check_mixed_float(added, dtype=dict(C=None))

    # with int
    added = float_frame + mixed_int_frame
    _check_mixed_float(added, dtype="float64")
||||
def test_combineSeries(
    self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame
):
    """Frame + Series broadcasting: columnwise by default, rowwise via axis."""

    # broadcasting a row Series across the columns
    row = float_frame.xs(float_frame.index[0])

    summed = float_frame + row
    for key, col in summed.items():
        assert_series_equal(col, float_frame[key] + row[key])

    # a Series with an extra label introduces an all-NaN column
    extended = row.to_dict()
    extended["E"] = 1
    extended = Series(extended)
    larger_added = float_frame + extended

    for key, col in float_frame.items():
        assert_series_equal(larger_added[key], col + row[key])
    assert "E" in larger_added
    assert np.isnan(larger_added["E"]).all()

    # no upcast needed
    result = mixed_float_frame + row
    _check_mixed_float(result)

    # vs mix (upcast) as needed
    result = mixed_float_frame + row.astype("float32")
    _check_mixed_float(result, dtype=dict(C=None))
    result = mixed_float_frame + row.astype("float16")
    _check_mixed_float(result, dtype=dict(C=None))

    # these raise with numexpr.....as we are adding an int64 to an
    # uint64....weird vs int

    # added = mixed_int_frame + (100*series).astype('int64')
    # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C =
    # 'int64', D = 'int64'))
    # added = mixed_int_frame + (100*series).astype('int32')
    # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C =
    # 'int32', D = 'int64'))

    # TimeSeries
    ts = datetime_frame["A"]

    # 10890
    # we no longer allow auto timeseries broadcasting
    # and require explicit broadcasting
    added = datetime_frame.add(ts, axis="index")

    for key, col in datetime_frame.items():
        expected_col = col + ts
        assert_series_equal(added[key], expected_col, check_names=False)
        assert added[key].name == key
        # the result name survives only when both operands share it
        if col.name == ts.name:
            assert expected_col.name == "A"
        else:
            assert expected_col.name is None

    smaller_frame = datetime_frame[:-5]
    smaller_added = smaller_frame.add(ts, axis="index")

    tm.assert_index_equal(smaller_added.index, datetime_frame.index)

    smaller_ts = ts[:-5]
    smaller_added2 = datetime_frame.add(smaller_ts, axis="index")
    assert_frame_equal(smaller_added, smaller_added2)

    # length 0, result is all-nan
    result = datetime_frame.add(ts[:0], axis="index")
    expected = DataFrame(
        np.nan, index=datetime_frame.index, columns=datetime_frame.columns
    )
    assert_frame_equal(result, expected)

    # Frame is all-nan
    result = datetime_frame[:0].add(ts, axis="index")
    expected = DataFrame(
        np.nan, index=datetime_frame.index, columns=datetime_frame.columns
    )
    assert_frame_equal(result, expected)

    # empty but with non-empty index
    frame = datetime_frame[:1].reindex(columns=[])
    result = frame.mul(ts, axis="index")
    assert len(result) == len(ts)
||||
def test_combineFunc(self, float_frame, mixed_float_frame):
    """Scalar multiplication applies elementwise across all dtypes."""
    doubled = float_frame * 2
    tm.assert_numpy_array_equal(doubled.values, float_frame.values * 2)

    # vs mix: each column is doubled and dtypes are preserved/upcast as needed
    doubled = mixed_float_frame * 2
    for col, values in doubled.items():
        tm.assert_numpy_array_equal(values.values, mixed_float_frame[col].values * 2)
    _check_mixed_float(doubled, dtype=dict(C=None))

    # degenerate: empty frame stays empty
    empty_result = DataFrame() * 2
    assert empty_result.index.equals(DataFrame().index)
    assert len(empty_result.columns) == 0
||||
def test_comparisons(self, simple_frame, float_frame):
    """All six comparison operators agree with the raw-ndarray comparison."""
    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame()

    row = simple_frame.xs("a")
    ndim_5 = np.ones(df1.shape + (1, 1, 1))

    def check_op(func):
        # frame vs frame matches ndarray vs ndarray
        result = func(df1, df2)
        tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values))

        # >2-dimensional operands are rejected
        with pytest.raises(ValueError, match="dim must be <= 2"):
            func(df1, ndim_5)

        # frame vs row broadcasts along columns
        result2 = func(simple_frame, row)
        tm.assert_numpy_array_equal(
            result2.values, func(simple_frame.values, row.values)
        )

        # frame vs scalar
        result3 = func(float_frame, 0)
        tm.assert_numpy_array_equal(result3.values, func(float_frame.values, 0))

        # misaligned frames cannot be compared
        msg = "Can only compare identically-labeled DataFrame"
        with pytest.raises(ValueError, match=msg):
            func(simple_frame, simple_frame[:2])

    for comparison in (
        operator.eq,
        operator.ne,
        operator.lt,
        operator.gt,
        operator.ge,
        operator.le,
    ):
        check_op(comparison)
||||
def test_comparison_protected_from_errstate(self):
    """NaN comparisons must not trip an active invalid-FP errstate.

    Bug fix: the original used chained indexing
    (``missing_df.iloc[0]["A"] = np.nan``), which assigns into the row
    object returned by ``iloc[0]`` — potentially a copy, so the NaN may
    never land in the frame. Use a single positional ``.iloc[row, col]``
    assignment instead.
    """
    missing_df = tm.makeDataFrame()
    # single positional assignment instead of chained indexing
    missing_df.iloc[0, missing_df.columns.get_loc("A")] = np.nan
    with np.errstate(invalid="ignore"):
        expected = missing_df.values < 0
    # pandas must suppress the invalid-value FP warning internally
    with np.errstate(invalid="raise"):
        result = (missing_df < 0).values
    tm.assert_numpy_array_equal(result, expected)
||||
def test_boolean_comparison(self):
    """Comparisons with array-likes: ndarray broadcasting vs list rejection."""
    # GH 4576: boolean comparisons with a tuple/list give unexpected results
    df = DataFrame(np.arange(6).reshape((3, 2)))
    vec = np.array([2, 2])
    row2d = np.atleast_2d([2, 2])  # shape (1, 2) -> broadcasts per row
    col2d = row2d.T  # shape (2, 1) -> wrong length
    lst = [2, 2, 2]
    tup = tuple(lst)

    # gt
    expected = DataFrame([[False, False], [False, True], [True, True]])
    result = df > vec
    assert_frame_equal(result, expected)

    result = df.values > vec
    assert_numpy_array_equal(result, expected.values)

    msg1d = "Unable to coerce to Series, length must be 2: given 3"
    msg2d = "Unable to coerce to DataFrame, shape must be"
    msg2db = "operands could not be broadcast together with shapes"
    with pytest.raises(ValueError, match=msg1d):
        # wrong shape
        df > lst

    with pytest.raises(ValueError, match=msg1d):
        # wrong shape
        df > tup

    # broadcasts like ndarray (GH#23000)
    result = df > row2d
    assert_frame_equal(result, expected)

    result = df.values > row2d
    assert_numpy_array_equal(result, expected.values)

    with pytest.raises(ValueError, match=msg2d):
        df > col2d

    with pytest.raises(ValueError, match=msg2db):
        df.values > col2d

    # ==
    expected = DataFrame([[False, False], [True, False], [False, False]])
    result = df == vec
    assert_frame_equal(result, expected)

    with pytest.raises(ValueError, match=msg1d):
        df == lst

    with pytest.raises(ValueError, match=msg1d):
        df == tup

    # broadcasts like ndarray (GH#23000)
    result = df == row2d
    assert_frame_equal(result, expected)

    result = df.values == row2d
    assert_numpy_array_equal(result, expected.values)

    with pytest.raises(ValueError, match=msg2d):
        df == col2d

    assert df.values.shape != col2d.shape

    # with alignment: labelled frames reject wrong-length list/tuple too
    df = DataFrame(
        np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc")
    )
    expected.index = df.index
    expected.columns = df.columns

    with pytest.raises(ValueError, match=msg1d):
        df == lst

    with pytest.raises(ValueError, match=msg1d):
        df == tup
||||
def test_combine_generic(self, float_frame):
    """`combine` with np.add matches doubling where both frames overlap."""
    full = float_frame
    partial = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]]

    combined = full.combine(partial, np.add)
    combined2 = partial.combine(full, np.add)
    # column "D" is missing from `partial`, so its combination is all-NA
    assert combined["D"].isna().all()
    assert combined2["D"].isna().all()

    chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]]
    chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]]

    exp = (
        float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk)
        * 2
    )
    assert_frame_equal(chunk, exp)
    assert_frame_equal(chunk2, exp)
||||
def test_inplace_ops_alignment(self):
    """In-place column-subset ops align exactly like their out-of-place forms."""
    # GH 8511
    columns = list("abcdefg")
    X_orig = DataFrame(
        np.arange(10 * len(columns)).reshape(-1, len(columns)),
        columns=columns,
        index=range(10),
    )
    Z = 100 * X_orig.iloc[:, 1:-1].copy()
    block1 = list("bedcf")  # deliberately out of order
    subs = list("bcdef")

    # add
    work = X_orig.copy()
    out_of_place = (work[block1] + Z).reindex(columns=subs)

    work[block1] += Z
    in_place = work.reindex(columns=subs)

    work = X_orig.copy()
    out_of_place_sub = (work[block1] + Z[block1]).reindex(columns=subs)

    work[block1] += Z[block1]
    in_place_sub = work.reindex(columns=subs)

    assert_frame_equal(out_of_place, in_place)
    assert_frame_equal(out_of_place, out_of_place_sub)
    assert_frame_equal(out_of_place, in_place_sub)

    # sub
    work = X_orig.copy()
    out_of_place = (work[block1] - Z).reindex(columns=subs)

    work[block1] -= Z
    in_place = work.reindex(columns=subs)

    work = X_orig.copy()
    out_of_place_sub = (work[block1] - Z[block1]).reindex(columns=subs)

    work[block1] -= Z[block1]
    in_place_sub = work.reindex(columns=subs)

    assert_frame_equal(out_of_place, in_place)
    assert_frame_equal(out_of_place, out_of_place_sub)
    assert_frame_equal(out_of_place, in_place_sub)
||||
def test_inplace_ops_identity(self):
    """In-place operators mutate the existing object rather than rebinding."""
    # GH 5104: make sure that we are actually changing the object
    s_orig = Series([1, 2, 3])
    df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

    # no dtype change: identity (and internal data) must be preserved
    s = s_orig.copy()
    alias = s
    s += 1
    assert_series_equal(s, alias)
    assert_series_equal(s_orig + 1, s)
    assert s is alias
    assert s._data is alias._data

    df = df_orig.copy()
    df_alias = df
    df += 1
    assert_frame_equal(df, df_alias)
    assert_frame_equal(df_orig + 1, df)
    assert df is df_alias
    assert df._data is df_alias._data

    # dtype change (int -> float)
    s = s_orig.copy()
    alias = s
    s += 1.5
    assert_series_equal(s, alias)
    assert_series_equal(s_orig + 1.5, s)

    df = df_orig.copy()
    df_alias = df
    df += 1.5
    assert_frame_equal(df, df_alias)
    assert_frame_equal(df_orig + 1.5, df)
    assert df is df_alias
    assert df._data is df_alias._data

    # mixed dtype frame: in-place on one column keeps shared internals
    arr = np.random.randint(0, 10, size=5)
    df_orig = DataFrame({"A": arr.copy(), "B": "foo"})
    df = df_orig.copy()
    df_alias = df
    df["A"] += 1
    expected = DataFrame({"A": arr.copy() + 1, "B": "foo"})
    assert_frame_equal(df, expected)
    assert_frame_equal(df_alias, expected)
    assert df._data is df_alias._data

    df = df_orig.copy()
    df_alias = df
    df["A"] += 1.5
    expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"})
    assert_frame_equal(df, expected)
    assert_frame_equal(df_alias, expected)
    assert df._data is df_alias._data
||||
@pytest.mark.parametrize(
    "op",
    [
        "add",
        "and",
        "div",
        "floordiv",
        "mod",
        "mul",
        "or",
        "pow",
        "sub",
        "truediv",
        "xor",
    ],
)
def test_inplace_ops_identity2(self, op):
    """`__iop__` leaves the frame's identity intact and matches `__op__`."""
    if op == "div":
        # "__idiv__" / "__div__" do not exist on py3
        return

    df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]})

    operand = 2
    if op in ("and", "or", "xor"):
        # cannot use floats for boolean ops
        df["a"] = [True, False, True]

    df_copy = df.copy()
    iop_name = "__i{}__".format(op)
    op_name = "__{}__".format(op)

    # no id change and value is correct
    getattr(df, iop_name)(operand)
    expected = getattr(df_copy, op_name)(operand)
    assert_frame_equal(df, expected)
    expected_id = id(df)
    assert id(df) == expected_id
||||
def test_alignment_non_pandas(self):
    """`_align_method_FRAME` coerces list-likes to Series/DataFrame by axis."""
    index = ["A", "B", "C"]
    columns = ["X", "Y", "Z"]
    df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns)

    align = pd.core.ops._align_method_FRAME

    # 1-D list-likes of the right length become a Series on the given axis
    for other in [
        [1, 2, 3],
        (1, 2, 3),
        np.array([1, 2, 3], dtype=np.int64),
        range(1, 4),
    ]:
        tm.assert_series_equal(
            align(df, other, "index"), Series([1, 2, 3], index=df.index)
        )
        tm.assert_series_equal(
            align(df, other, "columns"), Series([1, 2, 3], index=df.columns)
        )

    # length mismatch
    msg = "Unable to coerce to Series, length must be 3: given 2"
    for other in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]:
        with pytest.raises(ValueError, match=msg):
            align(df, other, "index")

        with pytest.raises(ValueError, match=msg):
            align(df, other, "columns")

    # 2-D arrays of matching shape become a DataFrame
    other = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    tm.assert_frame_equal(
        align(df, other, "index"),
        DataFrame(other, index=df.index, columns=df.columns),
    )
    tm.assert_frame_equal(
        align(df, other, "columns"),
        DataFrame(other, index=df.index, columns=df.columns),
    )

    # shape mismatch
    msg = "Unable to coerce to DataFrame, shape must be"
    other = np.array([[1, 2, 3], [4, 5, 6]])
    with pytest.raises(ValueError, match=msg):
        align(df, other, "index")

    with pytest.raises(ValueError, match=msg):
        align(df, other, "columns")

    # >2 dimensions are rejected outright
    other = np.zeros((3, 3, 3))
    with pytest.raises(ValueError):
        align(df, other, "index")
    with pytest.raises(ValueError):
        align(df, other, "columns")
||||
def test_no_warning(self, all_arithmetic_operators):
    """Arithmetic against a NaN-holding column must not emit warnings."""
    df = pd.DataFrame({"A": [0.0, 0.0], "B": [0.0, None]})
    other = df["B"]
    with tm.assert_produces_warning(None):
        getattr(df, all_arithmetic_operators)(other, 0)
||||
|
||||
class TestTranspose:
    """Transpose must preserve tz-aware dtypes where possible (GH#26825)."""

    def test_transpose_tzaware_1col_single_tz(self):
        # GH#26825
        dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")

        frame = pd.DataFrame(dti)
        assert (frame.dtypes == dti.dtype).all()
        transposed = frame.T
        # a single tz keeps the datetime64tz dtype through .T
        assert (transposed.dtypes == dti.dtype).all()

    def test_transpose_tzaware_2col_single_tz(self):
        # GH#26825
        dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")

        frame = pd.DataFrame({"A": dti, "B": dti})
        assert (frame.dtypes == dti.dtype).all()
        transposed = frame.T
        assert (transposed.dtypes == dti.dtype).all()

    def test_transpose_tzaware_2col_mixed_tz(self):
        # GH#26825
        dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")
        dti2 = dti.tz_convert("US/Pacific")

        frame = pd.DataFrame({"A": dti, "B": dti2})
        assert (frame.dtypes == [dti.dtype, dti2.dtype]).all()
        # mixed timezones cannot share a column dtype after .T -> object
        assert (frame.T.dtypes == object).all()
        tm.assert_frame_equal(frame.T.T, frame)

    def test_transpose_object_to_tzaware_mixed_tz(self):
        # GH#26825
        dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC")
        dti2 = dti.tz_convert("US/Pacific")

        # mixed all-tzaware dtypes: object rows recover tz dtypes on .T
        frame = pd.DataFrame([dti, dti2])
        assert (frame.dtypes == object).all()
        transposed = frame.T
        assert (transposed.dtypes == [dti.dtype, dti2.dtype]).all()
||||
@@ -0,0 +1,156 @@
|
||||
from datetime import timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
PeriodIndex,
|
||||
Timedelta,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def _permute(obj):
|
||||
return obj.take(np.random.permutation(len(obj)))
|
||||
|
||||
|
||||
class TestPeriodIndex:
|
||||
def test_as_frame_columns(self):
|
||||
rng = period_range("1/1/2000", periods=5)
|
||||
df = DataFrame(np.random.randn(10, 5), columns=rng)
|
||||
|
||||
ts = df[rng[0]]
|
||||
tm.assert_series_equal(ts, df.iloc[:, 0])
|
||||
|
||||
# GH # 1211
|
||||
repr(df)
|
||||
|
||||
ts = df["1/1/2000"]
|
||||
tm.assert_series_equal(ts, df.iloc[:, 0])
|
||||
|
||||
def test_frame_setitem(self):
|
||||
rng = period_range("1/1/2000", periods=5, name="index")
|
||||
df = DataFrame(np.random.randn(5, 3), index=rng)
|
||||
|
||||
df["Index"] = rng
|
||||
rs = Index(df["Index"])
|
||||
tm.assert_index_equal(rs, rng, check_names=False)
|
||||
assert rs.name == "Index"
|
||||
assert rng.name == "index"
|
||||
|
||||
rs = df.reset_index().set_index("index")
|
||||
assert isinstance(rs.index, PeriodIndex)
|
||||
tm.assert_index_equal(rs.index, rng)
|
||||
|
||||
def test_frame_to_time_stamp(self):
|
||||
K = 5
|
||||
index = period_range(freq="A", start="1/1/2001", end="12/1/2009")
|
||||
df = DataFrame(np.random.randn(len(index), K), index=index)
|
||||
df["mix"] = "a"
|
||||
|
||||
exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC")
|
||||
exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns")
|
||||
result = df.to_timestamp("D", "end")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
tm.assert_numpy_array_equal(result.values, df.values)
|
||||
|
||||
exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN")
|
||||
result = df.to_timestamp("D", "start")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
def _get_with_delta(delta, freq="A-DEC"):
|
||||
return date_range(
|
||||
to_datetime("1/1/2001") + delta,
|
||||
to_datetime("12/31/2009") + delta,
|
||||
freq=freq,
|
||||
)
|
||||
|
||||
delta = timedelta(hours=23)
|
||||
result = df.to_timestamp("H", "end")
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
delta = timedelta(hours=23, minutes=59)
|
||||
result = df.to_timestamp("T", "end")
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
result = df.to_timestamp("S", "end")
|
||||
delta = timedelta(hours=23, minutes=59, seconds=59)
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.index, exp_index)
|
||||
|
||||
# columns
|
||||
df = df.T
|
||||
|
||||
exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC")
|
||||
exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns")
|
||||
result = df.to_timestamp("D", "end", axis=1)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
tm.assert_numpy_array_equal(result.values, df.values)
|
||||
|
||||
exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN")
|
||||
result = df.to_timestamp("D", "start", axis=1)
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
delta = timedelta(hours=23)
|
||||
result = df.to_timestamp("H", "end", axis=1)
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
delta = timedelta(hours=23, minutes=59)
|
||||
result = df.to_timestamp("T", "end", axis=1)
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
result = df.to_timestamp("S", "end", axis=1)
|
||||
delta = timedelta(hours=23, minutes=59, seconds=59)
|
||||
exp_index = _get_with_delta(delta)
|
||||
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
|
||||
tm.assert_index_equal(result.columns, exp_index)
|
||||
|
||||
# invalid axis
|
||||
with pytest.raises(ValueError, match="axis"):
|
||||
df.to_timestamp(axis=2)
|
||||
|
||||
result1 = df.to_timestamp("5t", axis=1)
|
||||
result2 = df.to_timestamp("t", axis=1)
|
||||
expected = pd.date_range("2001-01-01", "2009-01-01", freq="AS")
|
||||
assert isinstance(result1.columns, DatetimeIndex)
|
||||
assert isinstance(result2.columns, DatetimeIndex)
|
||||
tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8)
|
||||
tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8)
|
||||
# PeriodIndex.to_timestamp always use 'infer'
|
||||
assert result1.columns.freqstr == "AS-JAN"
|
||||
assert result2.columns.freqstr == "AS-JAN"
|
||||
|
||||
def test_frame_index_to_string(self):
|
||||
index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M")
|
||||
frame = DataFrame(np.random.randn(3, 4), index=index)
|
||||
|
||||
# it works!
|
||||
frame.to_string()
|
||||
|
||||
def test_align_frame(self):
|
||||
rng = period_range("1/1/2000", "1/1/2010", freq="A")
|
||||
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
|
||||
|
||||
result = ts + ts[::2]
|
||||
expected = ts + ts
|
||||
expected.values[1::2] = np.nan
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = ts + _permute(ts[::2])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,469 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Series, Timestamp
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameQuantile:
|
||||
def test_quantile(self, datetime_frame):
|
||||
from numpy import percentile
|
||||
|
||||
df = datetime_frame
|
||||
q = df.quantile(0.1, axis=0)
|
||||
assert q["A"] == percentile(df["A"], 10)
|
||||
tm.assert_index_equal(q.index, df.columns)
|
||||
|
||||
q = df.quantile(0.9, axis=1)
|
||||
assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90)
|
||||
tm.assert_index_equal(q.index, df.index)
|
||||
|
||||
# test degenerate case
|
||||
q = DataFrame({"x": [], "y": []}).quantile(0.1, axis=0)
|
||||
assert np.isnan(q["x"]) and np.isnan(q["y"])
|
||||
|
||||
# non-numeric exclusion
|
||||
df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
|
||||
rs = df.quantile(0.5)
|
||||
xp = df.median().rename(0.5)
|
||||
assert_series_equal(rs, xp)
|
||||
|
||||
# axis
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
||||
result = df.quantile(0.5, axis=1)
|
||||
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.quantile([0.5, 0.75], axis=1)
|
||||
expected = DataFrame(
|
||||
{1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75]
|
||||
)
|
||||
assert_frame_equal(result, expected, check_index_type=True)
|
||||
|
||||
# We may want to break API in the future to change this
|
||||
# so that we exclude non-numeric along the same axis
|
||||
# See GH #7312
|
||||
df = DataFrame([[1, 2, 3], ["a", "b", 4]])
|
||||
result = df.quantile(0.5, axis=1)
|
||||
expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
def test_quantile_axis_mixed(self):
|
||||
|
||||
# mixed on axis=1
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [1, 2, 3],
|
||||
"B": [2.0, 3.0, 4.0],
|
||||
"C": pd.date_range("20130101", periods=3),
|
||||
"D": ["foo", "bar", "baz"],
|
||||
}
|
||||
)
|
||||
result = df.quantile(0.5, axis=1)
|
||||
expected = Series([1.5, 2.5, 3.5], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# must raise
|
||||
with pytest.raises(TypeError):
|
||||
df.quantile(0.5, axis=1, numeric_only=False)
|
||||
|
||||
def test_quantile_axis_parameter(self):
|
||||
# GH 9543/9544
|
||||
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
||||
|
||||
result = df.quantile(0.5, axis=0)
|
||||
|
||||
expected = Series([2.0, 3.0], index=["A", "B"], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
expected = df.quantile(0.5, axis="index")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.quantile(0.5, axis=1)
|
||||
|
||||
expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df.quantile(0.5, axis="columns")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
msg = "No axis named -1 for object type <class 'pandas.core.frame.DataFrame'>"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.quantile(0.1, axis=-1)
|
||||
msg = (
|
||||
"No axis named column for object type"
|
||||
" <class 'pandas.core.frame.DataFrame'>"
|
||||
)
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.quantile(0.1, axis="column")
|
||||
|
||||
def test_quantile_interpolation(self):
|
||||
# see gh-10174
|
||||
|
||||
# interpolation method other than default linear
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
|
||||
result = df.quantile(0.5, axis=1, interpolation="nearest")
|
||||
expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# cross-check interpolation=nearest results in original dtype
|
||||
exp = np.percentile(
|
||||
np.array([[1, 2, 3], [2, 3, 4]]), 0.5, axis=0, interpolation="nearest"
|
||||
)
|
||||
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# float
|
||||
df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3])
|
||||
result = df.quantile(0.5, axis=1, interpolation="nearest")
|
||||
expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5)
|
||||
tm.assert_series_equal(result, expected)
|
||||
exp = np.percentile(
|
||||
np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]),
|
||||
0.5,
|
||||
axis=0,
|
||||
interpolation="nearest",
|
||||
)
|
||||
expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64")
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# axis
|
||||
result = df.quantile([0.5, 0.75], axis=1, interpolation="lower")
|
||||
expected = DataFrame(
|
||||
{1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75]
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# test degenerate case
|
||||
df = DataFrame({"x": [], "y": []})
|
||||
q = df.quantile(0.1, axis=0, interpolation="higher")
|
||||
assert np.isnan(q["x"]) and np.isnan(q["y"])
|
||||
|
||||
# multi
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
|
||||
result = df.quantile([0.25, 0.5], interpolation="midpoint")
|
||||
|
||||
# https://github.com/numpy/numpy/issues/7163
|
||||
expected = DataFrame(
|
||||
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
|
||||
index=[0.25, 0.5],
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_interpolation_datetime(self, datetime_frame):
|
||||
# see gh-10174
|
||||
|
||||
# interpolation = linear (default case)
|
||||
df = datetime_frame
|
||||
q = df.quantile(0.1, axis=0, interpolation="linear")
|
||||
assert q["A"] == np.percentile(df["A"], 10)
|
||||
|
||||
def test_quantile_interpolation_int(self, int_frame):
|
||||
# see gh-10174
|
||||
|
||||
df = int_frame
|
||||
# interpolation = linear (default case)
|
||||
q = df.quantile(0.1)
|
||||
assert q["A"] == np.percentile(df["A"], 10)
|
||||
|
||||
# test with and without interpolation keyword
|
||||
q1 = df.quantile(0.1, axis=0, interpolation="linear")
|
||||
assert q1["A"] == np.percentile(df["A"], 10)
|
||||
tm.assert_series_equal(q, q1)
|
||||
|
||||
def test_quantile_multi(self):
|
||||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
|
||||
result = df.quantile([0.25, 0.5])
|
||||
expected = DataFrame(
|
||||
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
|
||||
index=[0.25, 0.5],
|
||||
columns=["a", "b", "c"],
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
result = df.quantile([0.25, 0.5], axis=1)
|
||||
expected = DataFrame(
|
||||
[[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2]
|
||||
)
|
||||
|
||||
# empty
|
||||
result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0)
|
||||
expected = DataFrame(
|
||||
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_quantile_datetime(self):
|
||||
df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]})
|
||||
|
||||
# exclude datetime
|
||||
result = df.quantile(0.5)
|
||||
expected = Series([2.5], index=["b"])
|
||||
|
||||
# datetime
|
||||
result = df.quantile(0.5, numeric_only=False)
|
||||
expected = Series(
|
||||
[Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5
|
||||
)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
# datetime w/ multi
|
||||
result = df.quantile([0.5], numeric_only=False)
|
||||
expected = DataFrame(
|
||||
[[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"]
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# axis = 1
|
||||
df["c"] = pd.to_datetime(["2011", "2012"])
|
||||
result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False)
|
||||
expected = Series(
|
||||
[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")],
|
||||
index=[0, 1],
|
||||
name=0.5,
|
||||
)
|
||||
assert_series_equal(result, expected)
|
||||
|
||||
result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False)
|
||||
expected = DataFrame(
|
||||
[[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]],
|
||||
index=[0.5],
|
||||
columns=[0, 1],
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# empty when numeric_only=True
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# result = df[['a', 'c']].quantile(.5)
|
||||
# result = df[['a', 'c']].quantile([.5])
|
||||
|
||||
def test_quantile_invalid(self, datetime_frame):
|
||||
msg = "percentiles should all be in the interval \\[0, 1\\]"
|
||||
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
datetime_frame.quantile(invalid)
|
||||
|
||||
def test_quantile_box(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [
|
||||
pd.Timestamp("2011-01-01"),
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.Timestamp("2011-01-03"),
|
||||
],
|
||||
"B": [
|
||||
pd.Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timestamp("2011-01-03", tz="US/Eastern"),
|
||||
],
|
||||
"C": [
|
||||
pd.Timedelta("1 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("3 days"),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
|
||||
exp = pd.Series(
|
||||
[
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timedelta("2 days"),
|
||||
],
|
||||
name=0.5,
|
||||
index=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = pd.DataFrame(
|
||||
[
|
||||
[
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timedelta("2 days"),
|
||||
]
|
||||
],
|
||||
index=[0.5],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# DatetimeBlock may be consolidated and contain NaT in different loc
|
||||
df = DataFrame(
|
||||
{
|
||||
"A": [
|
||||
pd.Timestamp("2011-01-01"),
|
||||
pd.NaT,
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.Timestamp("2011-01-03"),
|
||||
],
|
||||
"a": [
|
||||
pd.Timestamp("2011-01-01"),
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.NaT,
|
||||
pd.Timestamp("2011-01-03"),
|
||||
],
|
||||
"B": [
|
||||
pd.Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
pd.NaT,
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timestamp("2011-01-03", tz="US/Eastern"),
|
||||
],
|
||||
"b": [
|
||||
pd.Timestamp("2011-01-01", tz="US/Eastern"),
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.NaT,
|
||||
pd.Timestamp("2011-01-03", tz="US/Eastern"),
|
||||
],
|
||||
"C": [
|
||||
pd.Timedelta("1 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("3 days"),
|
||||
pd.NaT,
|
||||
],
|
||||
"c": [
|
||||
pd.NaT,
|
||||
pd.Timedelta("1 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("3 days"),
|
||||
],
|
||||
},
|
||||
columns=list("AaBbCc"),
|
||||
)
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = pd.Series(
|
||||
[
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
],
|
||||
name=0.5,
|
||||
index=list("AaBbCc"),
|
||||
)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = pd.DataFrame(
|
||||
[
|
||||
[
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.Timestamp("2011-01-02"),
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timestamp("2011-01-02", tz="US/Eastern"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
]
|
||||
],
|
||||
index=[0.5],
|
||||
columns=list("AaBbCc"),
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_nan(self):
|
||||
|
||||
# GH 14357 - float block where some cols have missing values
|
||||
df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
|
||||
df.iloc[-1, 1] = np.nan
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([3.0, 2.5], index=["a", "b"], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75])
|
||||
exp = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]}, index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.quantile(0.5, axis=1)
|
||||
exp = Series(np.arange(1.0, 6.0), name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75], axis=1)
|
||||
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# full-nan column
|
||||
df["b"] = np.nan
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([3.0, np.nan], index=["a", "b"], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5, 0.75])
|
||||
exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_nat(self):
|
||||
|
||||
# full NaT column
|
||||
df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]})
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = Series([pd.NaT], index=["a"], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = DataFrame({"a": [pd.NaT]}, index=[0.5])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# mixed non-null / full null column
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [
|
||||
pd.Timestamp("2012-01-01"),
|
||||
pd.Timestamp("2012-01-02"),
|
||||
pd.Timestamp("2012-01-03"),
|
||||
],
|
||||
"b": [pd.NaT, pd.NaT, pd.NaT],
|
||||
}
|
||||
)
|
||||
|
||||
res = df.quantile(0.5, numeric_only=False)
|
||||
exp = Series([pd.Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5], numeric_only=False)
|
||||
exp = DataFrame(
|
||||
[[pd.Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"]
|
||||
)
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_quantile_empty(self):
|
||||
|
||||
# floats
|
||||
df = DataFrame(columns=["a", "b"], dtype="float64")
|
||||
|
||||
res = df.quantile(0.5)
|
||||
exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.quantile([0.5])
|
||||
exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5])
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# res = df.quantile(0.5, axis=1)
|
||||
# res = df.quantile([0.5], axis=1)
|
||||
|
||||
# ints
|
||||
df = DataFrame(columns=["a", "b"], dtype="int64")
|
||||
|
||||
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
||||
# res = df.quantile(0.5)
|
||||
|
||||
# datetimes
|
||||
df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]")
|
||||
|
||||
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
|
||||
# res = df.quantile(0.5, numeric_only=False)
|
||||
File diff suppressed because it is too large
Load Diff
316
venv/lib/python3.6/site-packages/pandas/tests/frame/test_rank.py
Normal file
316
venv/lib/python3.6/site-packages/pandas/tests/frame/test_rank.py
Normal file
@@ -0,0 +1,316 @@
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Series
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
class TestRank:
|
||||
s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
|
||||
df = DataFrame({"A": s, "B": s})
|
||||
|
||||
results = {
|
||||
"average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
|
||||
"min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
|
||||
"max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
|
||||
"first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
|
||||
"dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
|
||||
}
|
||||
|
||||
@pytest.fixture(params=["average", "min", "max", "first", "dense"])
|
||||
def method(self, request):
|
||||
"""
|
||||
Fixture for trying all rank methods
|
||||
"""
|
||||
return request.param
|
||||
|
||||
def test_rank(self, float_frame):
|
||||
rankdata = pytest.importorskip("scipy.stats.rankdata")
|
||||
|
||||
float_frame["A"][::2] = np.nan
|
||||
float_frame["B"][::3] = np.nan
|
||||
float_frame["C"][::4] = np.nan
|
||||
float_frame["D"][::5] = np.nan
|
||||
|
||||
ranks0 = float_frame.rank()
|
||||
ranks1 = float_frame.rank(1)
|
||||
mask = np.isnan(float_frame.values)
|
||||
|
||||
fvals = float_frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fvals)
|
||||
exp0[mask] = np.nan
|
||||
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fvals)
|
||||
exp1[mask] = np.nan
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# integers
|
||||
df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
|
||||
|
||||
result = df.rank()
|
||||
exp = df.astype(float).rank()
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
result = df.rank(1)
|
||||
exp = df.astype(float).rank(1)
|
||||
tm.assert_frame_equal(result, exp)
|
||||
|
||||
def test_rank2(self):
|
||||
df = DataFrame([[1, 3, 2], [1, 2, 3]])
|
||||
expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
|
||||
result = df.rank(1, pct=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([[1, 3, 2], [1, 2, 3]])
|
||||
expected = df.rank(0) / 2.0
|
||||
result = df.rank(0, pct=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
|
||||
expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
|
||||
result = df.rank(1, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
|
||||
result = df.rank(0, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
|
||||
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
|
||||
result = df.rank(1, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
|
||||
result = df.rank(0, numeric_only=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# f7u12, this does not work without extensive workaround
|
||||
data = [
|
||||
[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
|
||||
[datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
|
||||
]
|
||||
df = DataFrame(data)
|
||||
|
||||
# check the rank
|
||||
expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
|
||||
result = df.rank(1, numeric_only=False, ascending=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
|
||||
result = df.rank(1, numeric_only=False, ascending=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
|
||||
exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
|
||||
tm.assert_frame_equal(df.rank(), exp)
|
||||
|
||||
def test_rank_mixed_frame(self, float_string_frame):
|
||||
float_string_frame["datetime"] = datetime.now()
|
||||
float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
|
||||
|
||||
result = float_string_frame.rank(1)
|
||||
expected = float_string_frame.rank(1, numeric_only=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_rank_na_option(self, float_frame):
|
||||
rankdata = pytest.importorskip("scipy.stats.rankdata")
|
||||
|
||||
float_frame["A"][::2] = np.nan
|
||||
float_frame["B"][::3] = np.nan
|
||||
float_frame["C"][::4] = np.nan
|
||||
float_frame["D"][::5] = np.nan
|
||||
|
||||
# bottom
|
||||
ranks0 = float_frame.rank(na_option="bottom")
|
||||
ranks1 = float_frame.rank(1, na_option="bottom")
|
||||
|
||||
fvals = float_frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fvals)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fvals)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# top
|
||||
ranks0 = float_frame.rank(na_option="top")
|
||||
ranks1 = float_frame.rank(1, na_option="top")
|
||||
|
||||
fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
|
||||
fval1 = float_frame.T
|
||||
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
|
||||
fval1 = fval1.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, fval0)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, fval1)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# descending
|
||||
|
||||
# bottom
|
||||
ranks0 = float_frame.rank(na_option="top", ascending=False)
|
||||
ranks1 = float_frame.rank(1, na_option="top", ascending=False)
|
||||
|
||||
fvals = float_frame.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, -fvals)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, -fvals)
|
||||
|
||||
tm.assert_almost_equal(ranks0.values, exp0)
|
||||
tm.assert_almost_equal(ranks1.values, exp1)
|
||||
|
||||
# descending
|
||||
|
||||
# top
|
||||
ranks0 = float_frame.rank(na_option="bottom", ascending=False)
|
||||
ranks1 = float_frame.rank(1, na_option="bottom", ascending=False)
|
||||
|
||||
fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
|
||||
fval1 = float_frame.T
|
||||
fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
|
||||
fval1 = fval1.fillna(np.inf).values
|
||||
|
||||
exp0 = np.apply_along_axis(rankdata, 0, -fval0)
|
||||
exp1 = np.apply_along_axis(rankdata, 1, -fval1)
|
||||
|
||||
tm.assert_numpy_array_equal(ranks0.values, exp0)
|
||||
tm.assert_numpy_array_equal(ranks1.values, exp1)
|
||||
|
||||
# bad values throw error
|
||||
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
float_frame.rank(na_option="bad", ascending=False)
|
||||
|
||||
# invalid type
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
float_frame.rank(na_option=True, ascending=False)
|
||||
|
||||
def test_rank_axis(self):
|
||||
# check if using axes' names gives the same result
|
||||
df = DataFrame([[2, 1], [4, 3]])
|
||||
tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index"))
|
||||
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns"))
|
||||
|
||||
def test_rank_methods_frame(self):
|
||||
pytest.importorskip("scipy.stats.special")
|
||||
rankdata = pytest.importorskip("scipy.stats.rankdata")
|
||||
|
||||
xs = np.random.randint(0, 21, (100, 26))
|
||||
xs = (xs - 10.0) / 10.0
|
||||
cols = [chr(ord("z") - i) for i in range(xs.shape[1])]
|
||||
|
||||
for vals in [xs, xs + 1e6, xs * 1e-6]:
|
||||
df = DataFrame(vals, columns=cols)
|
||||
|
||||
for ax in [0, 1]:
|
||||
for m in ["average", "min", "max", "first", "dense"]:
|
||||
result = df.rank(axis=ax, method=m)
|
||||
sprank = np.apply_along_axis(
|
||||
rankdata, ax, vals, m if m != "first" else "ordinal"
|
||||
)
|
||||
sprank = sprank.astype(np.float64)
|
||||
expected = DataFrame(sprank, columns=cols).astype("float64")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
|
||||
def test_rank_descending(self, method, dtype):
|
||||
|
||||
if "i" in dtype:
|
||||
df = self.df.dropna()
|
||||
else:
|
||||
df = self.df.astype(dtype)
|
||||
|
||||
res = df.rank(ascending=False)
|
||||
expected = (df.max() - df).rank()
|
||||
assert_frame_equal(res, expected)
|
||||
|
||||
if method == "first" and dtype == "O":
|
||||
return
|
||||
|
||||
expected = (df.max() - df).rank(method=method)
|
||||
|
||||
if dtype != "O":
|
||||
res2 = df.rank(method=method, ascending=False, numeric_only=True)
|
||||
assert_frame_equal(res2, expected)
|
||||
|
||||
res3 = df.rank(method=method, ascending=False, numeric_only=False)
|
||||
assert_frame_equal(res3, expected)
|
||||
|
||||
@pytest.mark.parametrize("axis", [0, 1])
|
||||
@pytest.mark.parametrize("dtype", [None, object])
|
||||
def test_rank_2d_tie_methods(self, method, axis, dtype):
|
||||
df = self.df
|
||||
|
||||
def _check2d(df, expected, method="average", axis=0):
|
||||
exp_df = DataFrame({"A": expected, "B": expected})
|
||||
|
||||
if axis == 1:
|
||||
df = df.T
|
||||
exp_df = exp_df.T
|
||||
|
||||
result = df.rank(method=method, axis=axis)
|
||||
assert_frame_equal(result, exp_df)
|
||||
|
||||
disabled = {(object, "first")}
|
||||
if (dtype, method) in disabled:
|
||||
return
|
||||
frame = df if dtype is None else df.astype(dtype)
|
||||
_check2d(frame, self.results[method], method=method, axis=axis)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"method,exp",
|
||||
[
|
||||
("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]),
|
||||
(
|
||||
"min",
|
||||
[
|
||||
[1.0 / 3, 1.0, 1.0],
|
||||
[1.0 / 3, 1.0 / 3, 2.0 / 3],
|
||||
[1.0 / 3, 1.0 / 3, 1.0 / 3],
|
||||
],
|
||||
),
|
||||
(
|
||||
"max",
|
||||
[[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]],
|
||||
),
|
||||
(
|
||||
"average",
|
||||
[[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]],
|
||||
),
|
||||
(
|
||||
"first",
|
||||
[
|
||||
[1.0 / 3, 1.0, 1.0],
|
||||
[2.0 / 3, 1.0 / 3, 2.0 / 3],
|
||||
[3.0 / 3, 2.0 / 3, 1.0 / 3],
|
||||
],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_rank_pct_true(self, method, exp):
|
||||
# see gh-15630.
|
||||
|
||||
df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
|
||||
result = df.rank(method=method, pct=True)
|
||||
|
||||
expected = DataFrame(exp)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.single
@pytest.mark.high_memory
def test_pct_max_many_rows(self):
    """pct=True ranks must top out at exactly 1.0 even beyond 2**24 rows (GH 18271).

    Guards against float truncation when the row count exceeds the float32
    mantissa range; marked high_memory because it allocates two 2**24+1 arrays.
    """
    df = DataFrame(
        {"A": np.arange(2 ** 24 + 1), "B": np.arange(2 ** 24 + 1, 0, -1)}
    )
    result = df.rank(pct=True).max()
    assert (result == 1).all()
|
||||
1292
venv/lib/python3.6/site-packages/pandas/tests/frame/test_replace.py
Normal file
1292
venv/lib/python3.6/site-packages/pandas/tests/frame/test_replace.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,549 @@
|
||||
from datetime import datetime, timedelta
|
||||
from io import StringIO
|
||||
import re
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PYPY
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
option_context,
|
||||
period_range,
|
||||
)
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
|
||||
import pandas.io.formats.format as fmt
|
||||
|
||||
# Segregated collection of methods that require the BlockManager internal data
|
||||
# structure
|
||||
|
||||
|
||||
class TestDataFrameReprInfoEtc(TestData):
|
||||
def test_repr_empty(self):
    """repr() must not raise for empty frames, with or without an index."""
    # empty
    foo = repr(self.empty)  # noqa

    # empty with index
    frame = DataFrame(index=np.arange(1000))
    foo = repr(frame)  # noqa
|
||||
|
||||
def test_repr_mixed(self):
    """repr() and info() must not raise on a mixed-dtype frame."""
    buf = StringIO()

    # mixed
    foo = repr(self.mixed_frame)  # noqa
    self.mixed_frame.info(verbose=False, buf=buf)
|
||||
|
||||
@pytest.mark.slow
def test_repr_mixed_big(self):
    """repr() must not raise on a big mixed-dtype frame containing NaNs."""
    # big mixed
    biggie = DataFrame(
        {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200)
    )
    # Inject missing values into the head of both columns.
    biggie.loc[:20, "A"] = np.nan
    biggie.loc[:20, "B"] = np.nan

    foo = repr(biggie)  # noqa
|
||||
|
||||
def test_repr(self):
    """Exercise repr()/info() corner cases; control characters must be escaped."""
    buf = StringIO()

    # small one
    foo = repr(self.frame)
    self.frame.info(verbose=False, buf=buf)

    # even smaller
    self.frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
    self.frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)

    # exhausting cases in DataFrame.info

    # columns but no index
    no_index = DataFrame(columns=[0, 1, 3])
    foo = repr(no_index)  # noqa

    # no columns or index
    self.empty.info(buf=buf)

    # \t, \r and embedded newlines must not leak into the repr verbatim.
    df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
    assert "\t" not in repr(df)
    assert "\r" not in repr(df)
    assert "a\n" not in repr(df)
|
||||
|
||||
def test_repr_dimensions(self):
|
||||
df = DataFrame([[1, 2], [3, 4]])
|
||||
with option_context("display.show_dimensions", True):
|
||||
assert "2 rows x 2 columns" in repr(df)
|
||||
|
||||
with option_context("display.show_dimensions", False):
|
||||
assert "2 rows x 2 columns" not in repr(df)
|
||||
|
||||
with option_context("display.show_dimensions", "truncate"):
|
||||
assert "2 rows x 2 columns" not in repr(df)
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_repr_big(self):
|
||||
# big one
|
||||
biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200))
|
||||
repr(biggie)
|
||||
|
||||
def test_repr_unsortable(self):
    """repr() works with unsortable mixed-type column labels and display options."""
    # columns are not sortable
    import warnings

    # Save the warning filters so the FutureWarning suppression below can be
    # undone at the end of the test.
    warn_filters = warnings.filters
    warnings.filterwarnings("ignore", category=FutureWarning, module=".*format")

    # str and datetime column labels cannot be compared, so the columns are
    # unsortable.
    unsortable = DataFrame(
        {
            "foo": [1] * 50,
            datetime.today(): [1] * 50,
            "bar": ["bar"] * 50,
            datetime.today() + timedelta(1): ["bar"] * 50,
        },
        index=np.arange(50),
    )
    repr(unsortable)

    # Exercise repr under several display-option combinations.
    fmt.set_option("display.precision", 3, "display.column_space", 10)
    repr(self.frame)

    fmt.set_option("display.max_rows", 10, "display.max_columns", 2)
    repr(self.frame)

    fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000)
    repr(self.frame)

    # Restore global state mutated above.
    tm.reset_display_options()

    warnings.filters = warn_filters
|
||||
|
||||
def test_repr_unicode(self):
    """The header line of a unicode-valued column repr is the bare column name.

    NOTE(review): ``ex_top = " A"`` looks like it lost padding whitespace in
    transit (the header is right-aligned) — confirm against the original file.
    """
    uval = "\u03c3\u03c3\u03c3\u03c3"

    # TODO(wesm): is this supposed to be used?
    bval = uval.encode("utf-8")  # noqa

    df = DataFrame({"A": [uval, uval]})

    result = repr(df)
    ex_top = " A"
    assert result.split("\n")[0].rstrip() == ex_top

    # Repeat with a freshly built frame to guard against repr caching effects.
    df = DataFrame({"A": [uval, uval]})
    result = repr(df)
    assert result.split("\n")[0].rstrip() == ex_top
|
||||
|
||||
def test_unicode_string_with_unicode(self):
|
||||
df = DataFrame({"A": ["\u05d0"]})
|
||||
str(df)
|
||||
|
||||
def test_str_to_bytes_raises(self):
|
||||
# GH 26447
|
||||
df = DataFrame({"A": ["abc"]})
|
||||
msg = "^'str' object cannot be interpreted as an integer$"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
bytes(df)
|
||||
|
||||
def test_very_wide_info_repr(self):
    """repr() must not raise for a frame with many random-string column names."""
    df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20))
    repr(df)
|
||||
|
||||
def test_repr_column_name_unicode_truncation_bug(self):
|
||||
# #1906
|
||||
df = DataFrame(
|
||||
{
|
||||
"Id": [7117434],
|
||||
"StringCol": (
|
||||
"Is it possible to modify drop plot code"
|
||||
" so that the output graph is displayed "
|
||||
"in iphone simulator, Is it possible to "
|
||||
"modify drop plot code so that the "
|
||||
"output graph is \xe2\x80\xa8displayed "
|
||||
"in iphone simulator.Now we are adding "
|
||||
"the CSV file externally. I want to Call"
|
||||
" the File through the code.."
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
with option_context("display.max_columns", 20):
|
||||
assert "StringCol" in repr(df)
|
||||
|
||||
def test_latex_repr(self):
    """_repr_latex_ emits a tabular when display.latex.repr is on, else None.

    NOTE(review): the expected literal reflects this pandas version's
    ``to_latex`` layout; internal column padding may have been lost in transit.
    """
    result = r"""\begin{tabular}{llll}
\toprule
{} & 0 & 1 & 2 \\
\midrule
0 & $\alpha$ & b & c \\
1 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""
    with option_context("display.latex.escape", False, "display.latex.repr", True):
        df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]])
        assert result == df._repr_latex_()

    # GH 12182: with the option back off, no LaTeX repr is produced.
    assert df._repr_latex_() is None
|
||||
|
||||
def test_info(self):
    """info() runs on the fixture frames and with default/quiet verbosity."""
    io = StringIO()
    self.frame.info(buf=io)
    self.tsframe.info(buf=io)

    frame = DataFrame(np.random.randn(5, 3))

    # Default buf=None writes to stdout; both paths must not raise.
    frame.info()
    frame.info(verbose=False)
|
||||
|
||||
def test_info_memory(self):
    """info() footer reports the frame's exact memory usage in bytes.

    https://github.com/pandas-dev/pandas/issues/21056

    NOTE(review): the expected literal matches this pandas version's info()
    layout; intra-line spacing may have been lost in transit.
    """
    df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")})
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    # Renamed from ``bytes``: don't shadow the builtin of the same name.
    mem_bytes = float(df.memory_usage().sum())

    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
        a 2 non-null int64
        dtypes: int64(1)
        memory usage: {} bytes
        """.format(
            mem_bytes
        )
    )

    assert result == expected
|
||||
|
||||
def test_info_wide(self):
|
||||
from pandas import set_option, reset_option
|
||||
|
||||
io = StringIO()
|
||||
df = DataFrame(np.random.randn(5, 101))
|
||||
df.info(buf=io)
|
||||
|
||||
io = StringIO()
|
||||
df.info(buf=io, max_cols=101)
|
||||
rs = io.getvalue()
|
||||
assert len(rs.splitlines()) > 100
|
||||
xp = rs
|
||||
|
||||
set_option("display.max_info_columns", 101)
|
||||
io = StringIO()
|
||||
df.info(buf=io)
|
||||
assert rs == xp
|
||||
reset_option("display.max_info_columns")
|
||||
|
||||
def test_info_duplicate_columns(self):
|
||||
io = StringIO()
|
||||
|
||||
# it works!
|
||||
frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
|
||||
frame.info(buf=io)
|
||||
|
||||
def test_info_duplicate_columns_shows_correct_dtypes(self):
    """Each duplicate column label reports its own dtype in info() (GH 11761).

    NOTE(review): the expected lines depend on this pandas version's exact
    info() layout and may have lost alignment whitespace in transit.
    """
    io = StringIO()

    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    io.seek(0)
    lines = io.readlines()
    assert "a 1 non-null int64\n" == lines[3]
    assert "a 1 non-null float64\n" == lines[4]
|
||||
|
||||
def test_info_shows_column_dtypes(self):
    """info() lists every column together with its non-null count and dtype."""
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    # One column per dtype, all cast from the same small random ints.
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    for i, dtype in enumerate(dtypes):
        name = "{i:d} {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype)
        assert name in res
|
||||
|
||||
def test_info_max_cols(self):
    """verbose/max_cols/max_info_columns interplay controls info() line count.

    len_ == 5 corresponds to the summarized output, len_ == 10 to the full
    per-column listing, for a 10x5 frame.
    """
    df = DataFrame(np.random.randn(10, 5))
    for len_, verbose in [(5, None), (5, False), (10, True)]:
        # For verbose always ^ setting ^ summarize ^ full output
        with option_context("max_info_columns", 4):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

    for len_, verbose in [(10, None), (5, False), (10, True)]:

        # max_cols no exceeded
        with option_context("max_info_columns", 5):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

    for len_, max_cols in [(10, 5), (5, 4)]:
        # setting truncates
        with option_context("max_info_columns", 4):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

        # setting wouldn't truncate
        with option_context("max_info_columns", 5):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_
|
||||
|
||||
def test_info_memory_usage(self):
    """memory_usage reporting in info(): presence, '+' qualifier, and totals."""
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert "memory usage: " in res[-1]

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    res = buf.getvalue().splitlines()
    assert "memory usage: " not in res[-1]

    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # memory usage is a lower bound, so print it as XYZ+ MB
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # excluded column with object dtype, so estimate is accurate
    assert not re.match(r"memory usage: [^+]+\+", res[-1])

    # Test a DataFrame with duplicate columns
    dtypes = ["int64", "int64", "int64", "float64"]
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes

    # Object index without deep introspection keeps the '+' qualifier ...
    df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
    df_with_object_index.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    # ... while deep introspection gives an exact number (no '+').
    df_with_object_index.info(buf=buf, memory_usage="deep")
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+$", res[-1])

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    assert df_size == exp_size

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    assert size_df == np.size(df.memory_usage())

    # assert deep works only on object
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # test for validity
    DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
    DataFrame(1, index=["a"], columns=["A"]).index.nbytes
    df = DataFrame(
        data=1,
        index=pd.MultiIndex.from_product([["a"], range(1000)]),
        columns=["A"],
    )
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    mem = df.memory_usage(deep=True).sum()
    assert mem > 0
|
||||
|
||||
@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
|
||||
def test_info_memory_usage_deep_not_pypy(self):
|
||||
df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
|
||||
assert (
|
||||
df_with_object_index.memory_usage(index=True, deep=True).sum()
|
||||
> df_with_object_index.memory_usage(index=True).sum()
|
||||
)
|
||||
|
||||
df_object = pd.DataFrame({"a": ["a"]})
|
||||
assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
|
||||
|
||||
@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy(self):
    """On PyPy, deep introspection adds nothing, so deep and shallow sums match."""
    df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
    assert (
        df_with_object_index.memory_usage(index=True, deep=True).sum()
        == df_with_object_index.memory_usage(index=True).sum()
    )

    df_object = pd.DataFrame({"a": ["a"]})
    assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
|
||||
|
||||
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof(self):
    """sys.getsizeof(df) tracks memory_usage(deep=True) up to small GC overhead."""
    df = DataFrame(
        data=1,
        index=pd.MultiIndex.from_product([["a"], range(1000)]),
        columns=["A"],
    )
    mem = df.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = mem - sys.getsizeof(df)
    assert abs(diff) < 100
|
||||
|
||||
def test_info_memory_usage_qualified(self):
    """The '+' lower-bound qualifier appears iff an index level has object dtype."""

    # integer index -> exact figure, no '+'
    buf = StringIO()
    df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
    df.info(buf=buf)
    assert "+" not in buf.getvalue()

    # object (string) index -> lower bound, '+'
    buf = StringIO()
    df = DataFrame(1, columns=list("ab"), index=list("ABC"))
    df.info(buf=buf)
    assert "+" in buf.getvalue()

    # all-integer MultiIndex -> exact
    buf = StringIO()
    df = DataFrame(
        1,
        columns=list("ab"),
        index=pd.MultiIndex.from_product([range(3), range(3)]),
    )
    df.info(buf=buf)
    assert "+" not in buf.getvalue()

    # MultiIndex with a string level -> lower bound, '+'
    buf = StringIO()
    df = DataFrame(
        1,
        columns=list("ab"),
        index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]),
    )
    df.info(buf=buf)
    assert "+" in buf.getvalue()
|
||||
|
||||
def test_info_memory_usage_bug_on_multiindex(self):
    # GH 14308
    # memory usage introspection should not materialize .values
    """memory_usage on a MultiIndex frame must not materialize .values (GH 14308)."""

    from string import ascii_uppercase as uppercase

    def memory_usage(f):
        # Total deep memory of frame plus index.
        return f.memory_usage(deep=True).sum()

    N = 100
    M = len(uppercase)
    index = pd.MultiIndex.from_product(
        [list(uppercase), pd.date_range("20160101", periods=N)],
        names=["id", "date"],
    )
    df = DataFrame({"value": np.random.randn(N * M)}, index=index)

    # Unstacking moves one level into the columns: same data payload, but the
    # compact MultiIndex codes are gone, so reported usage must not shrink.
    unstacked = df.unstack("id")
    assert df.values.nbytes == unstacked.values.nbytes
    assert memory_usage(df) > memory_usage(unstacked)

    # high upper bound
    assert memory_usage(unstacked) - memory_usage(df) < 2000
|
||||
|
||||
def test_info_categorical(self):
|
||||
# GH14298
|
||||
idx = pd.CategoricalIndex(["a", "b"])
|
||||
df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
|
||||
|
||||
buf = StringIO()
|
||||
df.info(buf=buf)
|
||||
|
||||
def test_info_categorical_column(self):
|
||||
|
||||
# make sure it works
|
||||
n = 2500
|
||||
df = DataFrame({"int64": np.random.randint(100, size=n)})
|
||||
df["category"] = Series(
|
||||
np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
|
||||
).astype("category")
|
||||
df.isna()
|
||||
buf = StringIO()
|
||||
df.info(buf=buf)
|
||||
|
||||
df2 = df[df["category"] == "d"]
|
||||
buf = StringIO()
|
||||
df2.info(buf=buf)
|
||||
|
||||
def test_repr_categorical_dates_periods(self):
    """Datetime/Period columns repr identically whether plain or Categorical.

    NOTE(review): the expected literal's column alignment appears to have lost
    padding whitespace in transit — confirm against the original file.
    """
    # normal DataFrame
    dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
    p = period_range("2011-01", freq="M", periods=5)
    df = DataFrame({"dt": dt, "p": p})
    exp = """ dt p
0 2011-01-01 09:00:00-05:00 2011-01
1 2011-01-01 10:00:00-05:00 2011-02
2 2011-01-01 11:00:00-05:00 2011-03
3 2011-01-01 12:00:00-05:00 2011-04
4 2011-01-01 13:00:00-05:00 2011-05"""

    assert repr(df) == exp

    # Wrapping the same values in Categorical must not change the repr.
    df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)})
    assert repr(df2) == exp
|
||||
|
||||
@pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64])
@pytest.mark.parametrize(
    "box, expected",
    # NOTE(review): the expected strings appear to have lost alignment
    # whitespace in transit — confirm against the original file.
    [[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]],
)
def test_repr_np_nat_with_object(self, arg, box, expected):
    """np.datetime64/timedelta64 NaT in an object container reprs as 'NaT' (GH 25445)."""
    result = repr(box([arg("NaT")], dtype=object))
    assert result == expected
|
||||
1105
venv/lib/python3.6/site-packages/pandas/tests/frame/test_reshape.py
Normal file
1105
venv/lib/python3.6/site-packages/pandas/tests/frame/test_reshape.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,93 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import PerformanceWarning
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.util import testing as tm
|
||||
from pandas.util.testing import assert_frame_equal
|
||||
|
||||
|
||||
@pytest.fixture
def df_none():
    """Baseline unindexed frame for the sort-label tests.

    The ``("B", 5)`` tuple label exercises non-string column keys.
    """
    return DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 2, 2, 1, 1],
            "A": np.arange(6, 0, -1),
            ("B", 5): ["one", "one", "two", "two", "one", "one"],
        }
    )
|
||||
|
||||
|
||||
@pytest.fixture(params=[["outer"], ["outer", "inner"]])
def df_idx(request, df_none):
    """df_none with one or two of its columns moved into the index."""
    levels = request.param
    return df_none.set_index(levels)
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        "inner",  # index level
        ["outer"],  # list of index level
        "A",  # column
        [("B", 5)],  # list of column
        ["inner", "outer"],  # two index levels
        [("B", 5), "outer"],  # index level and column
        ["A", ("B", 5)],  # Two columns
        # NOTE(review): duplicate of the ["inner", "outer"] param above; the
        # comment claims a column is included, so a label such as "A" may be
        # missing here — confirm against upstream.
        ["inner", "outer"],  # two index levels and column
    ]
)
def sort_names(request):
    """Label (or list of labels) forwarded as ``by=`` to sort_values."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def ascending(request):
    """Sort direction flag forwarded to sort_values."""
    return request.param
|
||||
|
||||
|
||||
def test_sort_index_level_and_column_label(df_none, df_idx, sort_names, ascending):
    """sort_values(by=...) may mix index levels and column labels (GH 14353)."""

    # Get index levels from df_idx
    levels = df_idx.index.names

    # Compute expected by sorting on columns and the setting index
    expected = df_none.sort_values(
        by=sort_names, ascending=ascending, axis=0
    ).set_index(levels)

    # Compute result sorting on mix on columns and index levels
    result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0)

    assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_sort_column_level_and_index_label(df_none, df_idx, sort_names, ascending):
    """axis=1 sort_values may mix column levels and index labels (GH 14353)."""

    # Get levels from df_idx
    levels = df_idx.index.names

    # Compute expected by sorting on axis=0, setting index levels, and then
    # transposing. For some cases this will result in a frame with
    # multiple column levels
    expected = (
        df_none.sort_values(by=sort_names, ascending=ascending, axis=0)
        .set_index(levels)
        .T
    )

    # Compute result by transposing and sorting on axis=1.
    result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1)

    if len(levels) > 1:
        # Accessing multi-level columns that are not lexsorted raises a
        # performance warning
        with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False):
            assert_frame_equal(result, expected)
    else:
        assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,739 @@
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
IntervalIndex,
|
||||
MultiIndex,
|
||||
NaT,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
from pandas.api.types import CategoricalDtype
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameSorting(TestData):
|
||||
def test_sort_values(self):
    """sort_values: column-wise and row-wise sorts, list args, and error cases.

    NOTE(review): the first ``msg`` literal matches this pandas version's
    error wording exactly — it changes across versions.
    """
    frame = DataFrame(
        [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC")
    )

    # by column (axis=0)
    sorted_df = frame.sort_values(by="A")
    indexer = frame["A"].argsort().values
    expected = frame.loc[frame.index[indexer]]
    assert_frame_equal(sorted_df, expected)

    sorted_df = frame.sort_values(by="A", ascending=False)
    indexer = indexer[::-1]
    expected = frame.loc[frame.index[indexer]]
    assert_frame_equal(sorted_df, expected)

    sorted_df = frame.sort_values(by="A", ascending=False)
    assert_frame_equal(sorted_df, expected)

    # GH4839
    sorted_df = frame.sort_values(by=["A"], ascending=[False])
    assert_frame_equal(sorted_df, expected)

    # multiple bys
    sorted_df = frame.sort_values(by=["B", "C"])
    expected = frame.loc[[2, 1, 3]]
    assert_frame_equal(sorted_df, expected)

    sorted_df = frame.sort_values(by=["B", "C"], ascending=False)
    assert_frame_equal(sorted_df, expected[::-1])

    sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False])
    assert_frame_equal(sorted_df, expected)

    # axis out of range must raise.
    msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>"
    with pytest.raises(ValueError, match=msg):
        frame.sort_values(by=["A", "B"], axis=2, inplace=True)

    # by row (axis=1): GH 10806
    sorted_df = frame.sort_values(by=3, axis=1)
    expected = frame
    assert_frame_equal(sorted_df, expected)

    sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
    expected = frame.reindex(columns=["C", "B", "A"])
    assert_frame_equal(sorted_df, expected)

    sorted_df = frame.sort_values(by=[1, 2], axis="columns")
    expected = frame.reindex(columns=["B", "A", "C"])
    assert_frame_equal(sorted_df, expected)

    sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False])
    assert_frame_equal(sorted_df, expected)

    sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
    expected = frame.reindex(columns=["C", "B", "A"])
    assert_frame_equal(sorted_df, expected)

    # ascending list length must match the by list length.
    msg = r"Length of ascending \(5\) != length of by \(2\)"
    with pytest.raises(ValueError, match=msg):
        frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
|
||||
|
||||
def test_sort_values_inplace(self):
|
||||
frame = DataFrame(
|
||||
np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]
|
||||
)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by="A", inplace=True)
|
||||
expected = frame.sort_values(by="A")
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by=1, axis=1, inplace=True)
|
||||
expected = frame.sort_values(by=1, axis=1)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by="A", ascending=False, inplace=True)
|
||||
expected = frame.sort_values(by="A", ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
sorted_df = frame.copy()
|
||||
sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True)
|
||||
expected = frame.sort_values(by=["A", "B"], ascending=False)
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_nan(self):
    """NaN placement under na_position/ascending combinations (GH 3917)."""
    # GH3917
    nan = np.nan
    df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]})

    # sort one column only
    expected = DataFrame(
        {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]},
        index=[2, 0, 3, 1, 6, 4, 5],
    )
    sorted_df = df.sort_values(["A"], na_position="first")
    assert_frame_equal(sorted_df, expected)

    expected = DataFrame(
        {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]},
        index=[2, 5, 4, 6, 1, 0, 3],
    )
    sorted_df = df.sort_values(["A"], na_position="first", ascending=False)
    assert_frame_equal(sorted_df, expected)

    # Row-wise sort: NaN-free row 1 drives the column order.
    expected = df.reindex(columns=["B", "A"])
    sorted_df = df.sort_values(by=1, axis=1, na_position="first")
    assert_frame_equal(sorted_df, expected)

    # na_position='last', order
    expected = DataFrame(
        {"A": [1, 1, 2, 4, 6, 8, nan], "B": [2, 9, nan, 5, 5, 4, 5]},
        index=[3, 0, 1, 6, 4, 5, 2],
    )
    sorted_df = df.sort_values(["A", "B"])
    assert_frame_equal(sorted_df, expected)

    # na_position='first', order
    expected = DataFrame(
        {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, nan, 5, 5, 4]},
        index=[2, 3, 0, 1, 6, 4, 5],
    )
    sorted_df = df.sort_values(["A", "B"], na_position="first")
    assert_frame_equal(sorted_df, expected)

    # na_position='first', not order
    expected = DataFrame(
        {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]},
        index=[2, 0, 3, 1, 6, 4, 5],
    )
    sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first")
    assert_frame_equal(sorted_df, expected)

    # na_position='last', not order
    expected = DataFrame(
        {"A": [8, 6, 4, 2, 1, 1, nan], "B": [4, 5, 5, nan, 2, 9, 5]},
        index=[5, 4, 6, 1, 3, 0, 2],
    )
    sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last")
    assert_frame_equal(sorted_df, expected)

    # Test DataFrame with nan label
    df = DataFrame(
        {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]},
        index=[1, 2, 3, 4, 5, 6, nan],
    )

    # NaN label, ascending=True, na_position='last'
    sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last")
    expected = DataFrame(
        {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]},
        index=[1, 2, 3, 4, 5, 6, nan],
    )
    assert_frame_equal(sorted_df, expected)

    # NaN label, ascending=True, na_position='first'
    sorted_df = df.sort_index(na_position="first")
    expected = DataFrame(
        {"A": [4, 1, 2, nan, 1, 6, 8], "B": [5, 9, nan, 5, 2, 5, 4]},
        index=[nan, 1, 2, 3, 4, 5, 6],
    )
    assert_frame_equal(sorted_df, expected)

    # NaN label, ascending=False, na_position='last'
    sorted_df = df.sort_index(kind="quicksort", ascending=False)
    expected = DataFrame(
        {"A": [8, 6, 1, nan, 2, 1, 4], "B": [4, 5, 2, 5, nan, 9, 5]},
        index=[6, 5, 4, 3, 2, 1, nan],
    )
    assert_frame_equal(sorted_df, expected)

    # NaN label, ascending=False, na_position='first'
    sorted_df = df.sort_index(
        kind="quicksort", ascending=False, na_position="first"
    )
    expected = DataFrame(
        {"A": [4, 8, 6, 1, nan, 2, 1], "B": [5, 4, 5, 2, 5, nan, 9]},
        index=[nan, 6, 5, 4, 3, 2, 1],
    )
    assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_stable_descending_sort(self):
|
||||
# GH #6399
|
||||
df = DataFrame(
|
||||
[[2, "first"], [2, "second"], [1, "a"], [1, "b"]],
|
||||
columns=["sort_col", "order"],
|
||||
)
|
||||
sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False)
|
||||
assert_frame_equal(df, sorted_df)
|
||||
|
||||
def test_stable_descending_multicolumn_sort(self):
    """Mixed-direction multi-column mergesort stays stable and honors na_position."""
    nan = np.nan
    df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]})
    # test stable mergesort
    expected = DataFrame(
        {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 2, 9]},
        index=[2, 5, 4, 6, 1, 3, 0],
    )
    sorted_df = df.sort_values(
        ["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort"
    )
    assert_frame_equal(sorted_df, expected)

    # Both keys descending: B order within the A ties flips.
    expected = DataFrame(
        {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]},
        index=[2, 5, 4, 6, 1, 0, 3],
    )
    sorted_df = df.sort_values(
        ["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort"
    )
    assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_multi_index(self):
|
||||
# GH 25775, testing that sorting by index works with a multi-index.
|
||||
df = DataFrame(
|
||||
{"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")}
|
||||
)
|
||||
result = df.set_index(list("abc")).sort_index(level=list("ba"))
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")}
|
||||
)
|
||||
expected = expected.set_index(list("abc"))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_stable_categorial(self):
|
||||
# GH 16793
|
||||
df = DataFrame({"x": pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)})
|
||||
expected = df.copy()
|
||||
sorted_df = df.sort_values("x", kind="mergesort")
|
||||
assert_frame_equal(sorted_df, expected)
|
||||
|
||||
def test_sort_datetimes(self):
|
||||
|
||||
# GH 3461, argsort / lexsort differences for a datetime column
|
||||
df = DataFrame(
|
||||
["a", "a", "a", "b", "c", "d", "e", "f", "g"],
|
||||
columns=["A"],
|
||||
index=date_range("20130101", periods=9),
|
||||
)
|
||||
dts = [
|
||||
Timestamp(x)
|
||||
for x in [
|
||||
"2004-02-11",
|
||||
"2004-01-21",
|
||||
"2004-01-26",
|
||||
"2005-09-20",
|
||||
"2010-10-04",
|
||||
"2009-05-12",
|
||||
"2008-11-12",
|
||||
"2010-09-28",
|
||||
"2010-09-28",
|
||||
]
|
||||
]
|
||||
df["B"] = dts[::2] + dts[1::2]
|
||||
df["C"] = 2.0
|
||||
df["A1"] = 3.0
|
||||
|
||||
df1 = df.sort_values(by="A")
|
||||
df2 = df.sort_values(by=["A"])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
df1 = df.sort_values(by="B")
|
||||
df2 = df.sort_values(by=["B"])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
df1 = df.sort_values(by="B")
|
||||
|
||||
df2 = df.sort_values(by=["C", "B"])
|
||||
assert_frame_equal(df1, df2)
|
||||
|
||||
def test_frame_column_inplace_sort_exception(self):
    """In-place sort of a Series that is a view into a frame must raise."""
    # self.frame comes from the TestData base fixture; selecting a column
    # yields a view, which cannot be sorted in place safely.
    s = self.frame["A"]
    with pytest.raises(ValueError, match="This Series is a view"):
        s.sort_values(inplace=True)

    # a copy is independent of the frame, so sorting it is fine
    cp = s.copy()
    cp.sort_values()  # it works!
def test_sort_nat_values_in_int_column(self):
|
||||
|
||||
# GH 14922: "sorting with large float and multiple columns incorrect"
|
||||
|
||||
# cause was that the int64 value NaT was considered as "na". Which is
|
||||
# only correct for datetime64 columns.
|
||||
|
||||
int_values = (2, int(NaT))
|
||||
float_values = (2.0, -1.797693e308)
|
||||
|
||||
df = DataFrame(
|
||||
dict(int=int_values, float=float_values), columns=["int", "float"]
|
||||
)
|
||||
|
||||
df_reversed = DataFrame(
|
||||
dict(int=int_values[::-1], float=float_values[::-1]),
|
||||
columns=["int", "float"],
|
||||
index=[1, 0],
|
||||
)
|
||||
|
||||
# NaT is not a "na" for int64 columns, so na_position must not
|
||||
# influence the result:
|
||||
df_sorted = df.sort_values(["int", "float"], na_position="last")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
df_sorted = df.sort_values(["int", "float"], na_position="first")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
# reverse sorting order
|
||||
df_sorted = df.sort_values(["int", "float"], ascending=False)
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
# and now check if NaT is still considered as "na" for datetime64
|
||||
# columns:
|
||||
df = DataFrame(
|
||||
dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values),
|
||||
columns=["datetime", "float"],
|
||||
)
|
||||
|
||||
df_reversed = DataFrame(
|
||||
dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]),
|
||||
columns=["datetime", "float"],
|
||||
index=[1, 0],
|
||||
)
|
||||
|
||||
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
|
||||
assert_frame_equal(df_sorted, df_reversed)
|
||||
|
||||
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
# Ascending should not affect the results.
|
||||
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
|
||||
assert_frame_equal(df_sorted, df)
|
||||
|
||||
def test_sort_nat(self):
|
||||
|
||||
# GH 16836
|
||||
|
||||
d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]]
|
||||
d2 = [
|
||||
Timestamp(x)
|
||||
for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"]
|
||||
]
|
||||
df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3])
|
||||
|
||||
d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]]
|
||||
d4 = [
|
||||
Timestamp(x)
|
||||
for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"]
|
||||
]
|
||||
expected = pd.DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2])
|
||||
sorted_df = df.sort_values(by=["a", "b"])
|
||||
tm.assert_frame_equal(sorted_df, expected)
|
||||
|
||||
|
||||
class TestDataFrameSortIndexKinds(TestData):
    """Tests for sort_index / sort_values kinds, levels and NA positions."""

    def test_sort_index_multicolumn(self):
        """Deprecated sort_index(by=...) matches sort_values(by=...)."""
        A = np.arange(5).repeat(20)
        B = np.tile(np.arange(5), 20)
        random.shuffle(A)
        random.shuffle(B)
        frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=["A", "B"])
        result = frame.sort_values(by=["A", "B"])
        indexer = np.lexsort((frame["B"], frame["A"]))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=["A", "B"], ascending=False)
        result = frame.sort_values(by=["A", "B"], ascending=False)
        indexer = np.lexsort(
            (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False))
        )
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=["B", "A"])
        result = frame.sort_values(by=["B", "A"])
        indexer = np.lexsort((frame["A"], frame["B"]))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

    def test_sort_index_inplace(self):
        """inplace=True sorts rows/columns and rebuilds the data."""
        frame = DataFrame(
            np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]
        )

        # axis=0
        unordered = frame.loc[[3, 2, 4, 1]]
        a_id = id(unordered["A"])
        df = unordered.copy()
        df.sort_index(inplace=True)
        expected = frame
        assert_frame_equal(df, expected)
        # the underlying column object must have been replaced
        assert a_id != id(df["A"])

        df = unordered.copy()
        df.sort_index(ascending=False, inplace=True)
        expected = frame[::-1]
        assert_frame_equal(df, expected)

        # axis=1
        unordered = frame.loc[:, ["D", "B", "C", "A"]]
        df = unordered.copy()
        df.sort_index(axis=1, inplace=True)
        expected = frame
        assert_frame_equal(df, expected)

        df = unordered.copy()
        df.sort_index(axis=1, ascending=False, inplace=True)
        expected = frame.iloc[:, ::-1]
        assert_frame_equal(df, expected)

    def test_sort_index_different_sortorder(self):
        """Per-column ascending flags work for frames, indexes and Series."""
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=["A", "B"], ascending=[1, 0])
        result = df.sort_values(by=["A", "B"], ascending=[1, 0])

        # negate B to emulate a descending secondary key in lexsort
        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
        expected = df.take(ex_indexer)
        assert_frame_equal(result, expected)

        # test with multiindex, too
        idf = df.set_index(["A", "B"])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        assert_frame_equal(result, expected)

        # also, Series!
        result = idf["C"].sort_index(ascending=[1, 0])
        assert_series_equal(result, expected["C"])

    def test_sort_index_duplicates(self):
        """Sorting by a duplicated column label raises a clear error."""
        # with 9816, these are all translated to .sort_values
        df = DataFrame([range(5, 9), range(4)], columns=["a", "a", "b", "b"])

        with pytest.raises(ValueError, match="not unique"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by="a")
        with pytest.raises(ValueError, match="not unique"):
            df.sort_values(by="a")

        with pytest.raises(ValueError, match="not unique"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by=["a"])
        with pytest.raises(ValueError, match="not unique"):
            df.sort_values(by=["a"])

        with pytest.raises(ValueError, match="not unique"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                # multi-column 'by' is separate codepath
                df.sort_index(by=["a", "b"])
        with pytest.raises(ValueError, match="not unique"):
            # multi-column 'by' is separate codepath
            df.sort_values(by=["a", "b"])

        # with multi-index
        # GH4370
        df = DataFrame(
            np.random.randn(4, 2), columns=MultiIndex.from_tuples([("a", 0), ("a", 1)])
        )
        with pytest.raises(ValueError, match="level"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by="a")
        with pytest.raises(ValueError, match="level"):
            df.sort_values(by="a")

        # convert tuples to a list of tuples
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=[("a", 1)])
        expected = df.sort_values(by=[("a", 1)])

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=("a", 1))
        result = df.sort_values(by=("a", 1))
        assert_frame_equal(result, expected)

    def test_sort_index_level(self):
        """sort_index handles arbitrary level orderings (GH 26053)."""
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
        df = DataFrame([[1, 2], [3, 4]], mi)

        result = df.sort_index(level="A", sort_remaining=False)
        expected = df
        assert_frame_equal(result, expected)

        result = df.sort_index(level=["A", "B"], sort_remaining=False)
        expected = df
        assert_frame_equal(result, expected)

        # Error thrown by sort_index when
        # first index is sorted last (#26053)
        result = df.sort_index(level=["C", "B", "A"])
        expected = df.iloc[[1, 0]]
        assert_frame_equal(result, expected)

        result = df.sort_index(level=["B", "C", "A"])
        expected = df.iloc[[1, 0]]
        assert_frame_equal(result, expected)

        result = df.sort_index(level=["C", "A"])
        expected = df.iloc[[1, 0]]
        assert_frame_equal(result, expected)

    def test_sort_index_categorical_index(self):
        """A CategoricalIndex sorts by category order, not lexically."""
        df = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
                "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))),
            }
        ).set_index("B")

        result = df.sort_index()
        expected = df.iloc[[4, 0, 1, 5, 2, 3]]
        assert_frame_equal(result, expected)

        result = df.sort_index(ascending=False)
        expected = df.iloc[[2, 3, 0, 1, 5, 4]]
        assert_frame_equal(result, expected)

    def test_sort_index(self):
        """Basic row (axis=0) and column (axis=1) label sorting (GH 13496)."""
        # GH13496
        frame = DataFrame(
            np.arange(16).reshape(4, 4),
            index=[1, 2, 3, 4],
            columns=["A", "B", "C", "D"],
        )

        # axis=0 : sort rows by index labels
        unordered = frame.loc[[3, 2, 4, 1]]
        result = unordered.sort_index(axis=0)
        expected = frame
        assert_frame_equal(result, expected)

        result = unordered.sort_index(ascending=False)
        expected = frame[::-1]
        assert_frame_equal(result, expected)

        # axis=1 : sort columns by column names
        unordered = frame.iloc[:, [2, 1, 3, 0]]
        result = unordered.sort_index(axis=1)
        assert_frame_equal(result, frame)

        result = unordered.sort_index(axis=1, ascending=False)
        expected = frame.iloc[:, ::-1]
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize("level", ["A", 0])  # GH 21052
    def test_sort_index_multiindex(self, level):
        """Sorting by one MultiIndex level, with and without sort_remaining."""
        # GH13496
        # sort rows by specified level of multi-index
        mi = MultiIndex.from_tuples(
            [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC")
        )
        df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)

        expected_mi = MultiIndex.from_tuples(
            [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC")
        )
        expected = pd.DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi)
        result = df.sort_index(level=level)
        assert_frame_equal(result, expected)

        # sort_remaining=False: ties on the chosen level keep original order
        expected_mi = MultiIndex.from_tuples(
            [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC")
        )
        expected = pd.DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi)
        result = df.sort_index(level=level, sort_remaining=False)
        assert_frame_equal(result, expected)

    def test_sort_index_intervalindex(self):
        """Unstacking sorts interval categories in bin order."""
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        y = Series(np.random.randn(100))
        x1 = Series(np.sign(np.random.randn(100)))
        x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"])

        result = model.groupby(["X1", "X2"], observed=True).mean().unstack()
        expected = IntervalIndex.from_tuples(
            [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right"
        )
        result = result.columns.levels[1].categories
        tm.assert_index_equal(result, expected)

    def test_sort_index_na_position_with_categories(self):
        """na_position places NaNs correctly around ordered categories."""
        # GH 22556
        # Positioning missing value properly when column is Categorical.
        categories = ["A", "B", "C"]
        category_indices = [0, 2, 4]
        list_of_nans = [np.nan, np.nan]
        na_indices = [1, 3]
        na_position_first = "first"
        na_position_last = "last"
        column_name = "c"

        reversed_categories = sorted(categories, reverse=True)
        reversed_category_indices = sorted(category_indices, reverse=True)
        reversed_na_indices = sorted(na_indices)

        df = pd.DataFrame(
            {
                column_name: pd.Categorical(
                    ["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True
                )
            }
        )
        # sort ascending with na first
        result = df.sort_values(
            by=column_name, ascending=True, na_position=na_position_first
        )
        expected = DataFrame(
            {
                column_name: Categorical(
                    list_of_nans + categories, categories=categories, ordered=True
                )
            },
            index=na_indices + category_indices,
        )

        assert_frame_equal(result, expected)

        # sort ascending with na last
        result = df.sort_values(
            by=column_name, ascending=True, na_position=na_position_last
        )
        expected = DataFrame(
            {
                column_name: Categorical(
                    categories + list_of_nans, categories=categories, ordered=True
                )
            },
            index=category_indices + na_indices,
        )

        assert_frame_equal(result, expected)

        # sort descending with na first
        result = df.sort_values(
            by=column_name, ascending=False, na_position=na_position_first
        )
        expected = DataFrame(
            {
                column_name: Categorical(
                    list_of_nans + reversed_categories,
                    categories=categories,
                    ordered=True,
                )
            },
            index=reversed_na_indices + reversed_category_indices,
        )

        assert_frame_equal(result, expected)

        # sort descending with na last
        result = df.sort_values(
            by=column_name, ascending=False, na_position=na_position_last
        )
        expected = DataFrame(
            {
                column_name: Categorical(
                    reversed_categories + list_of_nans,
                    categories=categories,
                    ordered=True,
                )
            },
            index=reversed_category_indices + reversed_na_indices,
        )

        assert_frame_equal(result, expected)

    def test_sort_index_na_position_with_categories_raises(self):
        """An invalid na_position value raises ValueError."""
        df = pd.DataFrame(
            {
                "c": pd.Categorical(
                    ["A", np.nan, "B", np.nan, "C"],
                    categories=["A", "B", "C"],
                    ordered=True,
                )
            }
        )

        with pytest.raises(ValueError):
            df.sort_values(by="c", ascending=False, na_position="bad_position")
@@ -0,0 +1,592 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestDataFrameSubclassing(TestData):
|
||||
def test_frame_subclassing_and_slicing(self):
|
||||
# Subclass frame and ensure it returns the right class on slicing it
|
||||
# In reference to PR 9632
|
||||
|
||||
class CustomSeries(Series):
|
||||
@property
|
||||
def _constructor(self):
|
||||
return CustomSeries
|
||||
|
||||
def custom_series_function(self):
|
||||
return "OK"
|
||||
|
||||
class CustomDataFrame(DataFrame):
|
||||
"""
|
||||
Subclasses pandas DF, fills DF with simulation results, adds some
|
||||
custom plotting functions.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kw):
|
||||
super().__init__(*args, **kw)
|
||||
|
||||
@property
|
||||
def _constructor(self):
|
||||
return CustomDataFrame
|
||||
|
||||
_constructor_sliced = CustomSeries
|
||||
|
||||
def custom_frame_function(self):
|
||||
return "OK"
|
||||
|
||||
data = {"col1": range(10), "col2": range(10)}
|
||||
cdf = CustomDataFrame(data)
|
||||
|
||||
# Did we get back our own DF class?
|
||||
assert isinstance(cdf, CustomDataFrame)
|
||||
|
||||
# Do we get back our own Series class after selecting a column?
|
||||
cdf_series = cdf.col1
|
||||
assert isinstance(cdf_series, CustomSeries)
|
||||
assert cdf_series.custom_series_function() == "OK"
|
||||
|
||||
# Do we get back our own DF class after slicing row-wise?
|
||||
cdf_rows = cdf[1:5]
|
||||
assert isinstance(cdf_rows, CustomDataFrame)
|
||||
assert cdf_rows.custom_frame_function() == "OK"
|
||||
|
||||
# Make sure sliced part of multi-index frame is custom class
|
||||
mcol = pd.MultiIndex.from_tuples([("A", "A"), ("A", "B")])
|
||||
cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
|
||||
assert isinstance(cdf_multi["A"], CustomDataFrame)
|
||||
|
||||
mcol = pd.MultiIndex.from_tuples([("A", ""), ("B", "")])
|
||||
cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
|
||||
assert isinstance(cdf_multi2["A"], CustomSeries)
|
||||
|
||||
def test_dataframe_metadata(self):
|
||||
df = tm.SubclassedDataFrame(
|
||||
{"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"]
|
||||
)
|
||||
df.testattr = "XXX"
|
||||
|
||||
assert df.testattr == "XXX"
|
||||
assert df[["X"]].testattr == "XXX"
|
||||
assert df.loc[["a", "b"], :].testattr == "XXX"
|
||||
assert df.iloc[[0, 1], :].testattr == "XXX"
|
||||
|
||||
# see gh-9776
|
||||
assert df.iloc[0:1, :].testattr == "XXX"
|
||||
|
||||
# see gh-10553
|
||||
unpickled = tm.round_trip_pickle(df)
|
||||
tm.assert_frame_equal(df, unpickled)
|
||||
assert df._metadata == unpickled._metadata
|
||||
assert df.testattr == unpickled.testattr
|
||||
|
||||
def test_indexing_sliced(self):
|
||||
# GH 11559
|
||||
df = tm.SubclassedDataFrame(
|
||||
{"X": [1, 2, 3], "Y": [4, 5, 6], "Z": [7, 8, 9]}, index=["a", "b", "c"]
|
||||
)
|
||||
res = df.loc[:, "X"]
|
||||
exp = tm.SubclassedSeries([1, 2, 3], index=list("abc"), name="X")
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.iloc[:, 1]
|
||||
exp = tm.SubclassedSeries([4, 5, 6], index=list("abc"), name="Y")
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.loc[:, "Z"]
|
||||
exp = tm.SubclassedSeries([7, 8, 9], index=list("abc"), name="Z")
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.loc["a", :]
|
||||
exp = tm.SubclassedSeries([1, 4, 7], index=list("XYZ"), name="a")
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.iloc[1, :]
|
||||
exp = tm.SubclassedSeries([2, 5, 8], index=list("XYZ"), name="b")
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
res = df.loc["c", :]
|
||||
exp = tm.SubclassedSeries([3, 6, 9], index=list("XYZ"), name="c")
|
||||
tm.assert_series_equal(res, exp)
|
||||
assert isinstance(res, tm.SubclassedSeries)
|
||||
|
||||
def test_subclass_attr_err_propagation(self):
|
||||
# GH 11808
|
||||
class A(DataFrame):
|
||||
@property
|
||||
def bar(self):
|
||||
return self.i_dont_exist
|
||||
|
||||
with pytest.raises(AttributeError, match=".*i_dont_exist.*"):
|
||||
A().bar
|
||||
|
||||
def test_subclass_align(self):
|
||||
# GH 12983
|
||||
df1 = tm.SubclassedDataFrame(
|
||||
{"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")
|
||||
)
|
||||
df2 = tm.SubclassedDataFrame(
|
||||
{"c": [1, 2, 4], "d": [1, 2, 4]}, index=list("ABD")
|
||||
)
|
||||
|
||||
res1, res2 = df1.align(df2, axis=0)
|
||||
exp1 = tm.SubclassedDataFrame(
|
||||
{"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
|
||||
index=list("ABCDE"),
|
||||
)
|
||||
exp2 = tm.SubclassedDataFrame(
|
||||
{"c": [1, 2, np.nan, 4, np.nan], "d": [1, 2, np.nan, 4, np.nan]},
|
||||
index=list("ABCDE"),
|
||||
)
|
||||
assert isinstance(res1, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(res1, exp1)
|
||||
assert isinstance(res2, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(res2, exp2)
|
||||
|
||||
res1, res2 = df1.a.align(df2.c)
|
||||
assert isinstance(res1, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(res1, exp1.a)
|
||||
assert isinstance(res2, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(res2, exp2.c)
|
||||
|
||||
def test_subclass_align_combinations(self):
|
||||
# GH 12983
|
||||
df = tm.SubclassedDataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
|
||||
s = tm.SubclassedSeries([1, 2, 4], index=list("ABD"), name="x")
|
||||
|
||||
# frame + series
|
||||
res1, res2 = df.align(s, axis=0)
|
||||
exp1 = pd.DataFrame(
|
||||
{"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
|
||||
index=list("ABCDE"),
|
||||
)
|
||||
# name is lost when
|
||||
exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")
|
||||
|
||||
assert isinstance(res1, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(res1, exp1)
|
||||
assert isinstance(res2, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(res2, exp2)
|
||||
|
||||
# series + frame
|
||||
res1, res2 = s.align(df)
|
||||
assert isinstance(res1, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(res1, exp2)
|
||||
assert isinstance(res2, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(res2, exp1)
|
||||
|
||||
def test_subclass_iterrows(self):
|
||||
# GH 13977
|
||||
df = tm.SubclassedDataFrame({"a": [1]})
|
||||
for i, row in df.iterrows():
|
||||
assert isinstance(row, tm.SubclassedSeries)
|
||||
tm.assert_series_equal(row, df.loc[i])
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
|
||||
def test_subclass_sparse_slice(self):
|
||||
rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
|
||||
ssdf = tm.SubclassedSparseDataFrame(rows)
|
||||
ssdf.testattr = "testattr"
|
||||
|
||||
tm.assert_sp_frame_equal(ssdf.loc[:2], tm.SubclassedSparseDataFrame(rows[:3]))
|
||||
tm.assert_sp_frame_equal(ssdf.iloc[:2], tm.SubclassedSparseDataFrame(rows[:2]))
|
||||
tm.assert_sp_frame_equal(ssdf[:2], tm.SubclassedSparseDataFrame(rows[:2]))
|
||||
assert ssdf.loc[:2].testattr == "testattr"
|
||||
assert ssdf.iloc[:2].testattr == "testattr"
|
||||
assert ssdf[:2].testattr == "testattr"
|
||||
|
||||
tm.assert_sp_series_equal(
|
||||
ssdf.loc[1],
|
||||
tm.SubclassedSparseSeries(rows[1]),
|
||||
check_names=False,
|
||||
check_kind=False,
|
||||
)
|
||||
tm.assert_sp_series_equal(
|
||||
ssdf.iloc[1],
|
||||
tm.SubclassedSparseSeries(rows[1]),
|
||||
check_names=False,
|
||||
check_kind=False,
|
||||
)
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
|
||||
def test_subclass_sparse_transpose(self):
|
||||
ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
essdf = tm.SubclassedSparseDataFrame([[1, 4], [2, 5], [3, 6]])
|
||||
tm.assert_sp_frame_equal(ossdf.T, essdf)
|
||||
|
||||
def test_subclass_stack(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["a", "b", "c"],
|
||||
columns=["X", "Y", "Z"],
|
||||
)
|
||||
|
||||
res = df.stack()
|
||||
exp = tm.SubclassedSeries(
|
||||
[1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")]
|
||||
)
|
||||
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_subclass_stack_multi(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame(
|
||||
[[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
|
||||
),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
|
||||
),
|
||||
)
|
||||
|
||||
exp = tm.SubclassedDataFrame(
|
||||
[
|
||||
[10, 12],
|
||||
[11, 13],
|
||||
[20, 22],
|
||||
[21, 23],
|
||||
[30, 32],
|
||||
[31, 33],
|
||||
[40, 42],
|
||||
[41, 43],
|
||||
],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))),
|
||||
names=["aaa", "ccc", "yyy"],
|
||||
),
|
||||
columns=Index(["W", "X"], name="www"),
|
||||
)
|
||||
|
||||
res = df.stack()
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.stack("yyy")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
exp = tm.SubclassedDataFrame(
|
||||
[
|
||||
[10, 11],
|
||||
[12, 13],
|
||||
[20, 21],
|
||||
[22, 23],
|
||||
[30, 31],
|
||||
[32, 33],
|
||||
[40, 41],
|
||||
[42, 43],
|
||||
],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))),
|
||||
names=["aaa", "ccc", "www"],
|
||||
),
|
||||
columns=Index(["y", "z"], name="yyy"),
|
||||
)
|
||||
|
||||
res = df.stack("www")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_stack_multi_mixed(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame(
|
||||
[
|
||||
[10, 11, 12.0, 13.0],
|
||||
[20, 21, 22.0, 23.0],
|
||||
[30, 31, 32.0, 33.0],
|
||||
[40, 41, 42.0, 43.0],
|
||||
],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
|
||||
),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
|
||||
),
|
||||
)
|
||||
|
||||
exp = tm.SubclassedDataFrame(
|
||||
[
|
||||
[10, 12.0],
|
||||
[11, 13.0],
|
||||
[20, 22.0],
|
||||
[21, 23.0],
|
||||
[30, 32.0],
|
||||
[31, 33.0],
|
||||
[40, 42.0],
|
||||
[41, 43.0],
|
||||
],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))),
|
||||
names=["aaa", "ccc", "yyy"],
|
||||
),
|
||||
columns=Index(["W", "X"], name="www"),
|
||||
)
|
||||
|
||||
res = df.stack()
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.stack("yyy")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
exp = tm.SubclassedDataFrame(
|
||||
[
|
||||
[10.0, 11.0],
|
||||
[12.0, 13.0],
|
||||
[20.0, 21.0],
|
||||
[22.0, 23.0],
|
||||
[30.0, 31.0],
|
||||
[32.0, 33.0],
|
||||
[40.0, 41.0],
|
||||
[42.0, 43.0],
|
||||
],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))),
|
||||
names=["aaa", "ccc", "www"],
|
||||
),
|
||||
columns=Index(["y", "z"], name="yyy"),
|
||||
)
|
||||
|
||||
res = df.stack("www")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["a", "b", "c"],
|
||||
columns=["X", "Y", "Z"],
|
||||
)
|
||||
|
||||
res = df.unstack()
|
||||
exp = tm.SubclassedSeries(
|
||||
[1, 4, 7, 2, 5, 8, 3, 6, 9], index=[list("XXXYYYZZZ"), list("abcabcabc")]
|
||||
)
|
||||
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack_multi(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame(
|
||||
[[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
|
||||
),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
|
||||
),
|
||||
)
|
||||
|
||||
exp = tm.SubclassedDataFrame(
|
||||
[[10, 20, 11, 21, 12, 22, 13, 23], [30, 40, 31, 41, 32, 42, 33, 43]],
|
||||
index=Index(["A", "B"], name="aaa"),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))),
|
||||
names=["www", "yyy", "ccc"],
|
||||
),
|
||||
)
|
||||
|
||||
res = df.unstack()
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.unstack("ccc")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
exp = tm.SubclassedDataFrame(
|
||||
[[10, 30, 11, 31, 12, 32, 13, 33], [20, 40, 21, 41, 22, 42, 23, 43]],
|
||||
index=Index(["c", "d"], name="ccc"),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))),
|
||||
names=["www", "yyy", "aaa"],
|
||||
),
|
||||
)
|
||||
|
||||
res = df.unstack("aaa")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_unstack_multi_mixed(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame(
|
||||
[
|
||||
[10, 11, 12.0, 13.0],
|
||||
[20, 21, 22.0, 23.0],
|
||||
[30, 31, 32.0, 33.0],
|
||||
[40, 41, 42.0, 43.0],
|
||||
],
|
||||
index=MultiIndex.from_tuples(
|
||||
list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
|
||||
),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
|
||||
),
|
||||
)
|
||||
|
||||
exp = tm.SubclassedDataFrame(
|
||||
[
|
||||
[10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
|
||||
[30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0],
|
||||
],
|
||||
index=Index(["A", "B"], name="aaa"),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))),
|
||||
names=["www", "yyy", "ccc"],
|
||||
),
|
||||
)
|
||||
|
||||
res = df.unstack()
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
res = df.unstack("ccc")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
exp = tm.SubclassedDataFrame(
|
||||
[
|
||||
[10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
|
||||
[20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0],
|
||||
],
|
||||
index=Index(["c", "d"], name="ccc"),
|
||||
columns=MultiIndex.from_tuples(
|
||||
list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))),
|
||||
names=["www", "yyy", "aaa"],
|
||||
),
|
||||
)
|
||||
|
||||
res = df.unstack("aaa")
|
||||
tm.assert_frame_equal(res, exp)
|
||||
|
||||
def test_subclass_pivot(self):
|
||||
# GH 15564
|
||||
df = tm.SubclassedDataFrame(
|
||||
{
|
||||
"index": ["A", "B", "C", "C", "B", "A"],
|
||||
"columns": ["One", "One", "One", "Two", "Two", "Two"],
|
||||
"values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
|
||||
}
|
||||
)
|
||||
|
||||
pivoted = df.pivot(index="index", columns="columns", values="values")
|
||||
|
||||
expected = tm.SubclassedDataFrame(
|
||||
{
|
||||
"One": {"A": 1.0, "B": 2.0, "C": 3.0},
|
||||
"Two": {"A": 1.0, "B": 2.0, "C": 3.0},
|
||||
}
|
||||
)
|
||||
|
||||
expected.index.name, expected.columns.name = "index", "columns"
|
||||
|
||||
tm.assert_frame_equal(pivoted, expected)
|
||||
|
||||
def test_subclassed_melt(self):
|
||||
# GH 15564
|
||||
cheese = tm.SubclassedDataFrame(
|
||||
{
|
||||
"first": ["John", "Mary"],
|
||||
"last": ["Doe", "Bo"],
|
||||
"height": [5.5, 6.0],
|
||||
"weight": [130, 150],
|
||||
}
|
||||
)
|
||||
|
||||
melted = pd.melt(cheese, id_vars=["first", "last"])
|
||||
|
||||
expected = tm.SubclassedDataFrame(
|
||||
[
|
||||
["John", "Doe", "height", 5.5],
|
||||
["Mary", "Bo", "height", 6.0],
|
||||
["John", "Doe", "weight", 130],
|
||||
["Mary", "Bo", "weight", 150],
|
||||
],
|
||||
columns=["first", "last", "variable", "value"],
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(melted, expected)
|
||||
|
||||
    def test_subclassed_wide_to_long(self):
        # GH 9762: wide_to_long on a DataFrame subclass must return the subclass.

        np.random.seed(123)
        x = np.random.randn(3)
        df = tm.SubclassedDataFrame(
            {
                "A1970": {0: "a", 1: "b", 2: "c"},
                "A1980": {0: "d", 1: "e", 2: "f"},
                "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
                "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
                "X": dict(zip(range(3), x)),
            }
        )

        df["id"] = df.index
        # Stacking the A*/B* year columns doubles the rows: the 1970 block
        # followed by the 1980 block, with "year" extracted from the suffix.
        exp_data = {
            "X": x.tolist() + x.tolist(),
            "A": ["a", "b", "c", "d", "e", "f"],
            "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
            "year": [1970, 1970, 1970, 1980, 1980, 1980],
            "id": [0, 1, 2, 0, 1, 2],
        }
        expected = tm.SubclassedDataFrame(exp_data)
        expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
        long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")

        tm.assert_frame_equal(long_frame, expected)
|
||||
|
||||
def test_subclassed_apply(self):
|
||||
# GH 19822
|
||||
|
||||
def check_row_subclass(row):
|
||||
assert isinstance(row, tm.SubclassedSeries)
|
||||
|
||||
def strech(row):
|
||||
if row["variable"] == "height":
|
||||
row["value"] += 0.5
|
||||
return row
|
||||
|
||||
df = tm.SubclassedDataFrame(
|
||||
[
|
||||
["John", "Doe", "height", 5.5],
|
||||
["Mary", "Bo", "height", 6.0],
|
||||
["John", "Doe", "weight", 130],
|
||||
["Mary", "Bo", "weight", 150],
|
||||
],
|
||||
columns=["first", "last", "variable", "value"],
|
||||
)
|
||||
|
||||
df.apply(lambda x: check_row_subclass(x))
|
||||
df.apply(lambda x: check_row_subclass(x), axis=1)
|
||||
|
||||
expected = tm.SubclassedDataFrame(
|
||||
[
|
||||
["John", "Doe", "height", 6.0],
|
||||
["Mary", "Bo", "height", 6.5],
|
||||
["John", "Doe", "weight", 130],
|
||||
["Mary", "Bo", "weight", 150],
|
||||
],
|
||||
columns=["first", "last", "variable", "value"],
|
||||
)
|
||||
|
||||
result = df.apply(lambda x: strech(x), axis=1)
|
||||
assert isinstance(result, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = tm.SubclassedDataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
|
||||
|
||||
result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1)
|
||||
assert isinstance(result, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
|
||||
assert isinstance(result, tm.SubclassedDataFrame)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = tm.SubclassedSeries([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
|
||||
|
||||
result = df.apply(lambda x: [1, 2, 3], axis=1)
|
||||
assert not isinstance(result, tm.SubclassedDataFrame)
|
||||
tm.assert_series_equal(result, expected)
|
||||
@@ -0,0 +1,969 @@
|
||||
from datetime import datetime, time
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
from pandas.tests.frame.common import TestData
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import (
|
||||
assert_frame_equal,
|
||||
assert_index_equal,
|
||||
assert_series_equal,
|
||||
)
|
||||
|
||||
import pandas.tseries.offsets as offsets
|
||||
|
||||
|
||||
@pytest.fixture(params=product([True, False], [True, False]))
def close_open_fixture(request):
    # All four (include_start, include_end) combinations for the
    # between_time boundary-inclusion tests.
    return request.param
|
||||
|
||||
|
||||
class TestDataFrameTimeSeriesMethods(TestData):
|
||||
    def test_diff(self):
        the_diff = self.tsframe.diff(1)

        # diff(1) must equal an explicit subtraction against shift(1).
        assert_series_equal(
            the_diff["A"], self.tsframe["A"] - self.tsframe["A"].shift(1)
        )

        # int dtype: large int64 values must not lose precision in diff
        a = 10000000000000000
        b = a + 1
        s = Series([a, b])

        rs = DataFrame({"s": s}).diff()
        assert rs.s[1] == 1

        # mixed numeric: same identity holds after a float32 cast
        tf = self.tsframe.astype("float32")
        the_diff = tf.diff(1)
        assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1))

        # issue 10907: axis=1 diff with a constant leading column
        df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])})
        df.insert(0, "x", 1)
        result = df.diff(axis=1)
        expected = pd.DataFrame(
            {"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}
        ).astype("float64")
        assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||||
def test_diff_datetime_axis0(self, tz):
|
||||
# GH 18578
|
||||
df = DataFrame(
|
||||
{
|
||||
0: date_range("2010", freq="D", periods=2, tz=tz),
|
||||
1: date_range("2010", freq="D", periods=2, tz=tz),
|
||||
}
|
||||
)
|
||||
|
||||
result = df.diff(axis=0)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: pd.TimedeltaIndex(["NaT", "1 days"]),
|
||||
1: pd.TimedeltaIndex(["NaT", "1 days"]),
|
||||
}
|
||||
)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_diff_datetime_axis1(self, tz):
        # GH 18578: column-wise diff of datetime columns.
        df = DataFrame(
            {
                0: date_range("2010", freq="D", periods=2, tz=tz),
                1: date_range("2010", freq="D", periods=2, tz=tz),
            }
        )
        if tz is None:
            # Identical columns: first column diffs to NaT, second to zero.
            result = df.diff(axis=1)
            expected = DataFrame(
                {
                    0: pd.TimedeltaIndex(["NaT", "NaT"]),
                    1: pd.TimedeltaIndex(["0 days", "0 days"]),
                }
            )
            assert_frame_equal(result, expected)
        else:
            # tz-aware axis=1 diff is unimplemented at this pandas version.
            with pytest.raises(NotImplementedError):
                result = df.diff(axis=1)
|
||||
|
||||
def test_diff_timedelta(self):
|
||||
# GH 4533
|
||||
df = DataFrame(
|
||||
dict(
|
||||
time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")],
|
||||
value=[1.0, 2.0],
|
||||
)
|
||||
)
|
||||
|
||||
res = df.diff()
|
||||
exp = DataFrame(
|
||||
[[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"]
|
||||
)
|
||||
assert_frame_equal(res, exp)
|
||||
|
||||
def test_diff_mixed_dtype(self):
|
||||
df = DataFrame(np.random.randn(5, 3))
|
||||
df["A"] = np.array([1, 2, 3, 4, 5], dtype=object)
|
||||
|
||||
result = df.diff()
|
||||
assert result[0].dtype == np.float64
|
||||
|
||||
def test_diff_neg_n(self):
|
||||
rs = self.tsframe.diff(-1)
|
||||
xp = self.tsframe - self.tsframe.shift(-1)
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_diff_float_n(self):
|
||||
rs = self.tsframe.diff(1.0)
|
||||
xp = self.tsframe.diff(1)
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_diff_axis(self):
|
||||
# GH 9727
|
||||
df = DataFrame([[1.0, 2.0], [3.0, 4.0]])
|
||||
assert_frame_equal(df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]]))
|
||||
assert_frame_equal(df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]))
|
||||
|
||||
    def test_pct_change(self):
        # fill_method=None: raw ratio against the previous row.
        rs = self.tsframe.pct_change(fill_method=None)
        assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1)

        # periods=2 with the default pad fill.
        rs = self.tsframe.pct_change(2)
        filled = self.tsframe.fillna(method="pad")
        assert_frame_equal(rs, filled / filled.shift(2) - 1)

        # Backfill with a limit is applied before the ratio is taken.
        rs = self.tsframe.pct_change(fill_method="bfill", limit=1)
        filled = self.tsframe.fillna(method="bfill", limit=1)
        assert_frame_equal(rs, filled / filled.shift(1) - 1)

        # Freq-based change: shift by 5 calendar days rather than 5 rows.
        rs = self.tsframe.pct_change(freq="5D")
        filled = self.tsframe.fillna(method="pad")
        assert_frame_equal(
            rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled)
        )
|
||||
|
||||
def test_pct_change_shift_over_nas(self):
|
||||
s = Series([1.0, 1.5, np.nan, 2.5, 3.0])
|
||||
|
||||
df = DataFrame({"a": s, "b": s})
|
||||
|
||||
chg = df.pct_change()
|
||||
expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2])
|
||||
edf = DataFrame({"a": expected, "b": expected})
|
||||
assert_frame_equal(chg, edf)
|
||||
|
||||
    @pytest.mark.parametrize(
        "freq, periods, fill_method, limit",
        [
            ("5B", 5, None, None),
            ("3B", 3, None, None),
            ("3B", 3, "bfill", None),
            ("7B", 7, "pad", 1),
            ("7B", 7, "bfill", 3),
            ("14B", 14, None, None),
        ],
    )
    def test_pct_change_periods_freq(self, freq, periods, fill_method, limit):
        # GH 7292: a business-day freq string and the equivalent integer
        # period count must produce identical results.
        rs_freq = self.tsframe.pct_change(
            freq=freq, fill_method=fill_method, limit=limit
        )
        rs_periods = self.tsframe.pct_change(
            periods, fill_method=fill_method, limit=limit
        )
        assert_frame_equal(rs_freq, rs_periods)

        # Same equivalence on an all-NaN frame of the same shape.
        empty_ts = DataFrame(index=self.tsframe.index, columns=self.tsframe.columns)
        rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit)
        rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit)
        assert_frame_equal(rs_freq, rs_periods)
|
||||
|
||||
def test_frame_ctor_datetime64_column(self):
|
||||
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
|
||||
dates = np.asarray(rng)
|
||||
|
||||
df = DataFrame({"A": np.random.randn(len(rng)), "B": dates})
|
||||
assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]"))
|
||||
|
||||
def test_frame_append_datetime64_column(self):
|
||||
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
|
||||
df = DataFrame(index=np.arange(len(rng)))
|
||||
|
||||
df["A"] = rng
|
||||
assert np.issubdtype(df["A"].dtype, np.dtype("M8[ns]"))
|
||||
|
||||
def test_frame_datetime64_pre1900_repr(self):
|
||||
df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")})
|
||||
# it works!
|
||||
repr(df)
|
||||
|
||||
def test_frame_append_datetime64_col_other_units(self):
|
||||
n = 100
|
||||
|
||||
units = ["h", "m", "s", "ms", "D", "M", "Y"]
|
||||
|
||||
ns_dtype = np.dtype("M8[ns]")
|
||||
|
||||
for unit in units:
|
||||
dtype = np.dtype("M8[{unit}]".format(unit=unit))
|
||||
vals = np.arange(n, dtype=np.int64).view(dtype)
|
||||
|
||||
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
|
||||
df[unit] = vals
|
||||
|
||||
ex_vals = to_datetime(vals.astype("O")).values
|
||||
|
||||
assert df[unit].dtype == ns_dtype
|
||||
assert (df[unit].values == ex_vals).all()
|
||||
|
||||
# Test insertion into existing datetime64 column
|
||||
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
|
||||
df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype)
|
||||
|
||||
for unit in units:
|
||||
dtype = np.dtype("M8[{unit}]".format(unit=unit))
|
||||
vals = np.arange(n, dtype=np.int64).view(dtype)
|
||||
|
||||
tmp = df.copy()
|
||||
|
||||
tmp["dates"] = vals
|
||||
ex_vals = to_datetime(vals.astype("O")).values
|
||||
|
||||
assert (tmp["dates"].values == ex_vals).all()
|
||||
|
||||
def test_shift(self):
|
||||
# naive shift
|
||||
shiftedFrame = self.tsframe.shift(5)
|
||||
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
|
||||
|
||||
shiftedSeries = self.tsframe["A"].shift(5)
|
||||
assert_series_equal(shiftedFrame["A"], shiftedSeries)
|
||||
|
||||
shiftedFrame = self.tsframe.shift(-5)
|
||||
tm.assert_index_equal(shiftedFrame.index, self.tsframe.index)
|
||||
|
||||
shiftedSeries = self.tsframe["A"].shift(-5)
|
||||
assert_series_equal(shiftedFrame["A"], shiftedSeries)
|
||||
|
||||
# shift by 0
|
||||
unshifted = self.tsframe.shift(0)
|
||||
assert_frame_equal(unshifted, self.tsframe)
|
||||
|
||||
# shift by DateOffset
|
||||
shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
|
||||
assert len(shiftedFrame) == len(self.tsframe)
|
||||
|
||||
shiftedFrame2 = self.tsframe.shift(5, freq="B")
|
||||
assert_frame_equal(shiftedFrame, shiftedFrame2)
|
||||
|
||||
d = self.tsframe.index[0]
|
||||
shifted_d = d + offsets.BDay(5)
|
||||
assert_series_equal(
|
||||
self.tsframe.xs(d), shiftedFrame.xs(shifted_d), check_names=False
|
||||
)
|
||||
|
||||
# shift int frame
|
||||
int_shifted = self.intframe.shift(1) # noqa
|
||||
|
||||
# Shifting with PeriodIndex
|
||||
ps = tm.makePeriodFrame()
|
||||
shifted = ps.shift(1)
|
||||
unshifted = shifted.shift(-1)
|
||||
tm.assert_index_equal(shifted.index, ps.index)
|
||||
tm.assert_index_equal(unshifted.index, ps.index)
|
||||
tm.assert_numpy_array_equal(
|
||||
unshifted.iloc[:, 0].dropna().values, ps.iloc[:-1, 0].values
|
||||
)
|
||||
|
||||
shifted2 = ps.shift(1, "B")
|
||||
shifted3 = ps.shift(1, offsets.BDay())
|
||||
assert_frame_equal(shifted2, shifted3)
|
||||
assert_frame_equal(ps, shifted2.shift(-1, "B"))
|
||||
|
||||
msg = "does not match PeriodIndex freq"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ps.shift(freq="D")
|
||||
|
||||
# shift other axis
|
||||
# GH 6371
|
||||
df = DataFrame(np.random.rand(10, 5))
|
||||
expected = pd.concat(
|
||||
[DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]],
|
||||
ignore_index=True,
|
||||
axis=1,
|
||||
)
|
||||
result = df.shift(1, axis=1)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# shift named axis
|
||||
df = DataFrame(np.random.rand(10, 5))
|
||||
expected = pd.concat(
|
||||
[DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]],
|
||||
ignore_index=True,
|
||||
axis=1,
|
||||
)
|
||||
result = df.shift(1, axis="columns")
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_shift_bool(self):
|
||||
df = DataFrame({"high": [True, False], "low": [False, False]})
|
||||
rs = df.shift(1)
|
||||
xp = DataFrame(
|
||||
np.array([[np.nan, np.nan], [True, False]], dtype=object),
|
||||
columns=["high", "low"],
|
||||
)
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_shift_categorical(self):
|
||||
# GH 9416
|
||||
s1 = pd.Series(["a", "b", "c"], dtype="category")
|
||||
s2 = pd.Series(["A", "B", "C"], dtype="category")
|
||||
df = DataFrame({"one": s1, "two": s2})
|
||||
rs = df.shift(1)
|
||||
xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)})
|
||||
assert_frame_equal(rs, xp)
|
||||
|
||||
def test_shift_fill_value(self):
|
||||
# GH #24128
|
||||
df = DataFrame(
|
||||
[1, 2, 3, 4, 5], index=date_range("1/1/2000", periods=5, freq="H")
|
||||
)
|
||||
exp = DataFrame(
|
||||
[0, 1, 2, 3, 4], index=date_range("1/1/2000", periods=5, freq="H")
|
||||
)
|
||||
result = df.shift(1, fill_value=0)
|
||||
assert_frame_equal(result, exp)
|
||||
|
||||
exp = DataFrame(
|
||||
[0, 0, 1, 2, 3], index=date_range("1/1/2000", periods=5, freq="H")
|
||||
)
|
||||
result = df.shift(2, fill_value=0)
|
||||
assert_frame_equal(result, exp)
|
||||
|
||||
def test_shift_empty(self):
|
||||
# Regression test for #8019
|
||||
df = DataFrame({"foo": []})
|
||||
rs = df.shift(-1)
|
||||
|
||||
assert_frame_equal(df, rs)
|
||||
|
||||
def test_shift_duplicate_columns(self):
|
||||
# GH 9092; verify that position-based shifting works
|
||||
# in the presence of duplicate columns
|
||||
column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]]
|
||||
data = np.random.randn(20, 5)
|
||||
|
||||
shifted = []
|
||||
for columns in column_lists:
|
||||
df = pd.DataFrame(data.copy(), columns=columns)
|
||||
for s in range(5):
|
||||
df.iloc[:, s] = df.iloc[:, s].shift(s + 1)
|
||||
df.columns = range(5)
|
||||
shifted.append(df)
|
||||
|
||||
# sanity check the base case
|
||||
nulls = shifted[0].isna().sum()
|
||||
assert_series_equal(nulls, Series(range(1, 6), dtype="int64"))
|
||||
|
||||
# check all answers are the same
|
||||
assert_frame_equal(shifted[0], shifted[1])
|
||||
assert_frame_equal(shifted[0], shifted[2])
|
||||
|
||||
    def test_tshift(self):
        # PeriodIndex: tshift moves the index, so +1 then -1 round-trips.
        ps = tm.makePeriodFrame()
        shifted = ps.tshift(1)
        unshifted = shifted.tshift(-1)

        assert_frame_equal(unshifted, ps)

        # Passing the index's own freq (string or offset) is equivalent.
        shifted2 = ps.tshift(freq="B")
        assert_frame_equal(shifted, shifted2)

        shifted3 = ps.tshift(freq=offsets.BDay())
        assert_frame_equal(shifted, shifted3)

        # A freq that disagrees with the PeriodIndex freq is rejected.
        with pytest.raises(ValueError, match="does not match"):
            ps.tshift(freq="M")

        # DatetimeIndex: same round-trip property.
        shifted = self.tsframe.tshift(1)
        unshifted = shifted.tshift(-1)

        assert_frame_equal(self.tsframe, unshifted)

        shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq)
        assert_frame_equal(shifted, shifted2)

        # Index rebuilt without an explicit freq: tshift must fall back to
        # the inferred frequency and match the original frame's result.
        inferred_ts = DataFrame(
            self.tsframe.values,
            Index(np.asarray(self.tsframe.index)),
            columns=self.tsframe.columns,
        )
        shifted = inferred_ts.tshift(1)
        unshifted = shifted.tshift(-1)
        assert_frame_equal(shifted, self.tsframe.tshift(1))
        assert_frame_equal(unshifted, inferred_ts)

        # Neither given nor inferable freq is an error.
        no_freq = self.tsframe.iloc[[0, 5, 7], :]
        msg = "Freq was not given and was not set in the index"
        with pytest.raises(ValueError, match=msg):
            no_freq.tshift()
|
||||
|
||||
def test_truncate(self):
|
||||
ts = self.tsframe[::3]
|
||||
|
||||
start, end = self.tsframe.index[3], self.tsframe.index[6]
|
||||
|
||||
start_missing = self.tsframe.index[2]
|
||||
end_missing = self.tsframe.index[7]
|
||||
|
||||
# neither specified
|
||||
truncated = ts.truncate()
|
||||
assert_frame_equal(truncated, ts)
|
||||
|
||||
# both specified
|
||||
expected = ts[1:3]
|
||||
|
||||
truncated = ts.truncate(start, end)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(start_missing, end_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
# start specified
|
||||
expected = ts[1:]
|
||||
|
||||
truncated = ts.truncate(before=start)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(before=start_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
# end specified
|
||||
expected = ts[:3]
|
||||
|
||||
truncated = ts.truncate(after=end)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
truncated = ts.truncate(after=end_missing)
|
||||
assert_frame_equal(truncated, expected)
|
||||
|
||||
msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ts.truncate(
|
||||
before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq
|
||||
)
|
||||
|
||||
    def test_truncate_copy(self):
        # truncate must return a copy: mutating the result in place must not
        # write through to the source frame's underlying data.
        index = self.tsframe.index
        truncated = self.tsframe.truncate(index[5], index[10])
        truncated.values[:] = 5.0
        assert not (self.tsframe.values[5:11] == 5).any()
|
||||
|
||||
def test_truncate_nonsortedindex(self):
|
||||
# GH 17935
|
||||
|
||||
df = pd.DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0])
|
||||
msg = "truncate requires a sorted index"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.truncate(before=3, after=9)
|
||||
|
||||
rng = pd.date_range("2011-01-01", "2012-01-01", freq="W")
|
||||
ts = pd.DataFrame(
|
||||
{"A": np.random.randn(len(rng)), "B": np.random.randn(len(rng))}, index=rng
|
||||
)
|
||||
msg = "truncate requires a sorted index"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ts.sort_values("A", ascending=False).truncate(
|
||||
before="2011-11", after="2011-12"
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
3: np.random.randn(5),
|
||||
20: np.random.randn(5),
|
||||
2: np.random.randn(5),
|
||||
0: np.random.randn(5),
|
||||
},
|
||||
columns=[3, 20, 2, 0],
|
||||
)
|
||||
msg = "truncate requires a sorted index"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.truncate(before=2, after=20, axis=1)
|
||||
|
||||
def test_asfreq(self):
|
||||
offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd())
|
||||
rule_monthly = self.tsframe.asfreq("BM")
|
||||
|
||||
tm.assert_almost_equal(offset_monthly["A"], rule_monthly["A"])
|
||||
|
||||
filled = rule_monthly.asfreq("B", method="pad") # noqa
|
||||
# TODO: actually check that this worked.
|
||||
|
||||
# don't forget!
|
||||
filled_dep = rule_monthly.asfreq("B", method="pad") # noqa
|
||||
|
||||
# test does not blow up on length-0 DataFrame
|
||||
zero_length = self.tsframe.reindex([])
|
||||
result = zero_length.asfreq("BM")
|
||||
assert result is not zero_length
|
||||
|
||||
def test_asfreq_datetimeindex(self):
|
||||
df = DataFrame(
|
||||
{"A": [1, 2, 3]},
|
||||
index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)],
|
||||
)
|
||||
df = df.asfreq("B")
|
||||
assert isinstance(df.index, DatetimeIndex)
|
||||
|
||||
ts = df["A"].asfreq("B")
|
||||
assert isinstance(ts.index, DatetimeIndex)
|
||||
|
||||
def test_asfreq_fillvalue(self):
|
||||
# test for fill value during upsampling, related to issue 3715
|
||||
|
||||
# setup
|
||||
rng = pd.date_range("1/1/2016", periods=10, freq="2S")
|
||||
ts = pd.Series(np.arange(len(rng)), index=rng)
|
||||
df = pd.DataFrame({"one": ts})
|
||||
|
||||
# insert pre-existing missing value
|
||||
df.loc["2016-01-01 00:00:08", "one"] = None
|
||||
|
||||
actual_df = df.asfreq(freq="1S", fill_value=9.0)
|
||||
expected_df = df.asfreq(freq="1S").fillna(9.0)
|
||||
expected_df.loc["2016-01-01 00:00:08", "one"] = None
|
||||
assert_frame_equal(expected_df, actual_df)
|
||||
|
||||
expected_series = ts.asfreq(freq="1S").fillna(9.0)
|
||||
actual_series = ts.asfreq(freq="1S", fill_value=9.0)
|
||||
assert_series_equal(expected_series, actual_series)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,idx,expected_first,expected_last",
|
||||
[
|
||||
({"A": [1, 2, 3]}, [1, 1, 2], 1, 2),
|
||||
({"A": [1, 2, 3]}, [1, 2, 2], 1, 2),
|
||||
({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"),
|
||||
({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2),
|
||||
({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
|
||||
({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2),
|
||||
],
|
||||
)
|
||||
def test_first_last_valid(self, data, idx, expected_first, expected_last):
|
||||
N = len(self.frame.index)
|
||||
mat = np.random.randn(N)
|
||||
mat[:5] = np.nan
|
||||
mat[-5:] = np.nan
|
||||
|
||||
frame = DataFrame({"foo": mat}, index=self.frame.index)
|
||||
index = frame.first_valid_index()
|
||||
|
||||
assert index == frame.index[5]
|
||||
|
||||
index = frame.last_valid_index()
|
||||
assert index == frame.index[-6]
|
||||
|
||||
# GH12800
|
||||
empty = DataFrame()
|
||||
assert empty.last_valid_index() is None
|
||||
assert empty.first_valid_index() is None
|
||||
|
||||
# GH17400: no valid entries
|
||||
frame[:] = np.nan
|
||||
assert frame.last_valid_index() is None
|
||||
assert frame.first_valid_index() is None
|
||||
|
||||
# GH20499: its preserves freq with holes
|
||||
frame.index = date_range("20110101", periods=N, freq="B")
|
||||
frame.iloc[1] = 1
|
||||
frame.iloc[-2] = 1
|
||||
assert frame.first_valid_index() == frame.index[1]
|
||||
assert frame.last_valid_index() == frame.index[-2]
|
||||
assert frame.first_valid_index().freq == frame.index.freq
|
||||
assert frame.last_valid_index().freq == frame.index.freq
|
||||
|
||||
# GH 21441
|
||||
df = DataFrame(data, index=idx)
|
||||
assert expected_first == df.first_valid_index()
|
||||
assert expected_last == df.last_valid_index()
|
||||
|
||||
def test_first_subset(self):
|
||||
ts = tm.makeTimeDataFrame(freq="12h")
|
||||
result = ts.first("10d")
|
||||
assert len(result) == 20
|
||||
|
||||
ts = tm.makeTimeDataFrame(freq="D")
|
||||
result = ts.first("10d")
|
||||
assert len(result) == 10
|
||||
|
||||
result = ts.first("3M")
|
||||
expected = ts[:"3/31/2000"]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.first("21D")
|
||||
expected = ts[:21]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts[:0].first("3M")
|
||||
assert_frame_equal(result, ts[:0])
|
||||
|
||||
def test_first_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.first("1D")
|
||||
|
||||
def test_last_subset(self):
|
||||
ts = tm.makeTimeDataFrame(freq="12h")
|
||||
result = ts.last("10d")
|
||||
assert len(result) == 20
|
||||
|
||||
ts = tm.makeTimeDataFrame(nper=30, freq="D")
|
||||
result = ts.last("10d")
|
||||
assert len(result) == 10
|
||||
|
||||
result = ts.last("21D")
|
||||
expected = ts["2000-01-10":]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.last("21D")
|
||||
expected = ts[-21:]
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts[:0].last("3M")
|
||||
assert_frame_equal(result, ts[:0])
|
||||
|
||||
def test_last_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.last("1D")
|
||||
|
||||
def test_at_time(self):
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
rs = ts.at_time(rng[1])
|
||||
assert (rs.index.hour == rng[1].hour).all()
|
||||
assert (rs.index.minute == rng[1].minute).all()
|
||||
assert (rs.index.second == rng[1].second).all()
|
||||
|
||||
result = ts.at_time("9:30")
|
||||
expected = ts.at_time(time(9, 30))
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
result = ts.loc[time(9, 30)]
|
||||
expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)]
|
||||
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# midnight, everything
|
||||
rng = date_range("1/1/2000", "1/31/2000")
|
||||
ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
|
||||
|
||||
result = ts.at_time(time(0, 0))
|
||||
assert_frame_equal(result, ts)
|
||||
|
||||
# time doesn't exist
|
||||
rng = date_range("1/1/2012", freq="23Min", periods=384)
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), rng)
|
||||
rs = ts.at_time("16:00")
|
||||
assert len(rs) == 0
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)]
|
||||
)
|
||||
def test_at_time_errors(self, hour):
|
||||
# GH 24043
|
||||
dti = pd.date_range("2018", periods=3, freq="H")
|
||||
df = pd.DataFrame(list(range(len(dti))), index=dti)
|
||||
if getattr(hour, "tzinfo", None) is None:
|
||||
result = df.at_time(hour)
|
||||
expected = df.iloc[1:2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(ValueError, match="Index must be timezone"):
|
||||
df.at_time(hour)
|
||||
|
||||
def test_at_time_tz(self):
|
||||
# GH 24043
|
||||
dti = pd.date_range("2018", periods=3, freq="H", tz="US/Pacific")
|
||||
df = pd.DataFrame(list(range(len(dti))), index=dti)
|
||||
result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern")))
|
||||
expected = df.iloc[1:2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_at_time_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.at_time("00:00")
|
||||
|
||||
@pytest.mark.parametrize("axis", ["index", "columns", 0, 1])
|
||||
def test_at_time_axis(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(np.random.randn(len(rng), len(rng)))
|
||||
ts.index, ts.columns = rng, rng
|
||||
|
||||
indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)]
|
||||
|
||||
if axis in ["index", 0]:
|
||||
expected = ts.loc[indices, :]
|
||||
elif axis in ["columns", 1]:
|
||||
expected = ts.loc[:, indices]
|
||||
|
||||
result = ts.at_time("9:30", axis=axis)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
def test_between_time(self, close_open_fixture):
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
stime = time(0, 0)
|
||||
etime = time(1, 0)
|
||||
inc_start, inc_end = close_open_fixture
|
||||
|
||||
filtered = ts.between_time(stime, etime, inc_start, inc_end)
|
||||
exp_len = 13 * 4 + 1
|
||||
if not inc_start:
|
||||
exp_len -= 5
|
||||
if not inc_end:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inc_start:
|
||||
assert t >= stime
|
||||
else:
|
||||
assert t > stime
|
||||
|
||||
if inc_end:
|
||||
assert t <= etime
|
||||
else:
|
||||
assert t < etime
|
||||
|
||||
result = ts.between_time("00:00", "01:00")
|
||||
expected = ts.between_time(stime, etime)
|
||||
assert_frame_equal(result, expected)
|
||||
|
||||
# across midnight
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
|
||||
stime = time(22, 0)
|
||||
etime = time(9, 0)
|
||||
|
||||
filtered = ts.between_time(stime, etime, inc_start, inc_end)
|
||||
exp_len = (12 * 11 + 1) * 4 + 1
|
||||
if not inc_start:
|
||||
exp_len -= 4
|
||||
if not inc_end:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inc_start:
|
||||
assert (t >= stime) or (t <= etime)
|
||||
else:
|
||||
assert (t > stime) or (t <= etime)
|
||||
|
||||
if inc_end:
|
||||
assert (t <= etime) or (t >= stime)
|
||||
else:
|
||||
assert (t < etime) or (t >= stime)
|
||||
|
||||
def test_between_time_raises(self):
|
||||
# GH20725
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
with pytest.raises(TypeError): # index is not a DatetimeIndex
|
||||
df.between_time(start_time="00:00", end_time="12:00")
|
||||
|
||||
def test_between_time_axis(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
ts = DataFrame(np.random.randn(len(rng), len(rng)))
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
exp_len = 7
|
||||
|
||||
if axis in ["index", 0]:
|
||||
ts.index = rng
|
||||
assert len(ts.between_time(stime, etime)) == exp_len
|
||||
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
|
||||
|
||||
if axis in ["columns", 1]:
|
||||
ts.columns = rng
|
||||
selected = ts.between_time(stime, etime, axis=1).columns
|
||||
assert len(selected) == exp_len
|
||||
|
||||
def test_between_time_axis_raises(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
mask = np.arange(0, len(rng))
|
||||
rand_data = np.random.randn(len(rng), len(rng))
|
||||
ts = DataFrame(rand_data, index=rng, columns=rng)
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
|
||||
msg = "Index must be DatetimeIndex"
|
||||
if axis in ["columns", 1]:
|
||||
ts.index = mask
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime, axis=0)
|
||||
|
||||
if axis in ["index", 0]:
|
||||
ts.columns = mask
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime, axis=1)
|
||||
|
||||
def test_operation_on_NaT(self):
|
||||
# Both NaT and Timestamp are in DataFrame.
|
||||
df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]})
|
||||
|
||||
res = df.min()
|
||||
exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.max()
|
||||
exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
# GH12941, only NaTs are in DataFrame.
|
||||
df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]})
|
||||
|
||||
res = df.min()
|
||||
exp = pd.Series([pd.NaT], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
res = df.max()
|
||||
exp = pd.Series([pd.NaT], index=["foo"])
|
||||
tm.assert_series_equal(res, exp)
|
||||
|
||||
def test_datetime_assignment_with_NaT_and_diff_time_units(self):
|
||||
# GH 7492
|
||||
data_ns = np.array([1, "nat"], dtype="datetime64[ns]")
|
||||
result = pd.Series(data_ns).to_frame()
|
||||
result["new"] = data_ns
|
||||
expected = pd.DataFrame(
|
||||
{0: [1, None], "new": [1, None]}, dtype="datetime64[ns]"
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# OutOfBoundsDatetime error shouldn't occur
|
||||
data_s = np.array([1, "nat"], dtype="datetime64[s]")
|
||||
result["new"] = data_s
|
||||
expected = pd.DataFrame(
|
||||
{0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]"
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_frame_to_period(self):
    """to_period converts a DatetimeIndex (rows or columns) to a PeriodIndex."""
    ncols = 5
    dt_idx = date_range("1/1/2000", "1/1/2001")
    per_idx = period_range("1/1/2000", "1/1/2001")
    df = DataFrame(np.random.randn(len(dt_idx), ncols), index=dt_idx)
    df["mix"] = "a"  # mixed dtypes must survive the conversion

    expected = df.copy()
    expected.index = per_idx
    assert_frame_equal(df.to_period(), expected)

    # An explicit frequency re-buckets the resulting index.
    tm.assert_index_equal(df.to_period("M").index, expected.index.asfreq("M"))

    # Same round-trip along the column axis.
    df = df.T
    expected = df.copy()
    expected.columns = per_idx
    assert_frame_equal(df.to_period(axis=1), expected)
    tm.assert_index_equal(df.to_period("M", axis=1).columns, expected.columns.asfreq("M"))

    # An out-of-range axis is rejected up front.
    msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>"
    with pytest.raises(ValueError, match=msg):
        df.to_period(axis=2)
||||
@pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"])
def test_tz_convert_and_localize(self, fn):
    """Exercise tz_localize/tz_convert on plain and MultiIndex axes.

    ``fn`` selects which method is applied via ``getattr``; for
    ``tz_convert`` the source indexes are localized to UTC first so the
    conversion has something to convert.
    """
    # l0/l1 are identical daily ranges; int_idx is a non-datetime index
    # used for the error cases at the end.
    l0 = date_range("20140701", periods=5, freq="D")
    l1 = date_range("20140701", periods=5, freq="D")

    int_idx = Index(range(5))

    if fn == "tz_convert":
        # tz_convert requires an already-aware index.
        l0 = l0.tz_localize("UTC")
        l1 = l1.tz_localize("UTC")

    for idx in [l0, l1]:

        # NOTE(review): both expected values are derived from the same loop
        # variable, so they are identical within an iteration (l0 and l1 are
        # equal anyway) — confirm this is the intended coverage.
        l0_expected = getattr(idx, fn)("US/Pacific")
        l1_expected = getattr(idx, fn)("US/Pacific")

        df1 = DataFrame(np.ones(5), index=l0)
        df1 = getattr(df1, fn)("US/Pacific")
        assert_index_equal(df1.index, l0_expected)

        # MultiIndex
        # GH7846: level= applies the operation to a single level only.
        df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1]))

        # level=0 transforms only the first level; level 1 is untouched.
        df3 = getattr(df2, fn)("US/Pacific", level=0)
        assert not df3.index.levels[0].equals(l0)
        assert_index_equal(df3.index.levels[0], l0_expected)
        assert_index_equal(df3.index.levels[1], l1)
        assert not df3.index.levels[1].equals(l1_expected)

        # level=1: the mirror case.
        df3 = getattr(df2, fn)("US/Pacific", level=1)
        assert_index_equal(df3.index.levels[0], l0)
        assert not df3.index.levels[0].equals(l0_expected)
        assert_index_equal(df3.index.levels[1], l1_expected)
        assert not df3.index.levels[1].equals(l1)

        df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))

        # TODO: untested
        df5 = getattr(df4, fn)("US/Pacific", level=1)  # noqa

        # NOTE(review): these four assertions re-check df3, not df5 — df5
        # itself is never asserted on (see the TODO above); confirm intent.
        assert_index_equal(df3.index.levels[0], l0)
        assert not df3.index.levels[0].equals(l0_expected)
        assert_index_equal(df3.index.levels[1], l1_expected)
        assert not df3.index.levels[1].equals(l1)

    # Bad Inputs

    # Not DatetimeIndex / PeriodIndex
    with pytest.raises(TypeError, match="DatetimeIndex"):
        df = DataFrame(index=int_idx)
        df = getattr(df, fn)("US/Pacific")

    # Not DatetimeIndex / PeriodIndex
    with pytest.raises(TypeError, match="DatetimeIndex"):
        df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0]))
        df = getattr(df, fn)("US/Pacific", level=0)

    # Invalid level
    with pytest.raises(ValueError, match="not valid"):
        df = DataFrame(index=l0)
        df = getattr(df, fn)("US/Pacific", level=1)
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Tests for DataFrame timezone-related methods
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
from pandas.core.dtypes.dtypes import DatetimeTZDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Series
|
||||
from pandas.core.indexes.datetimes import date_range
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
class TestDataFrameTimezones:
|
||||
def test_frame_values_with_tz(self):
|
||||
tz = "US/Central"
|
||||
df = DataFrame({"A": date_range("2000", periods=4, tz=tz)})
|
||||
result = df.values
|
||||
expected = np.array(
|
||||
[
|
||||
[pd.Timestamp("2000-01-01", tz=tz)],
|
||||
[pd.Timestamp("2000-01-02", tz=tz)],
|
||||
[pd.Timestamp("2000-01-03", tz=tz)],
|
||||
[pd.Timestamp("2000-01-04", tz=tz)],
|
||||
]
|
||||
)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# two columns, homogenous
|
||||
|
||||
df = df.assign(B=df.A)
|
||||
result = df.values
|
||||
expected = np.concatenate([expected, expected], axis=1)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
# three columns, heterogenous
|
||||
est = "US/Eastern"
|
||||
df = df.assign(C=df.A.dt.tz_convert(est))
|
||||
|
||||
new = np.array(
|
||||
[
|
||||
[pd.Timestamp("2000-01-01T01:00:00", tz=est)],
|
||||
[pd.Timestamp("2000-01-02T01:00:00", tz=est)],
|
||||
[pd.Timestamp("2000-01-03T01:00:00", tz=est)],
|
||||
[pd.Timestamp("2000-01-04T01:00:00", tz=est)],
|
||||
]
|
||||
)
|
||||
expected = np.concatenate([expected, new], axis=1)
|
||||
result = df.values
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
def test_frame_from_records_utc(self):
    """from_records accepts a tz-aware datetime as the index column."""
    record = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)}

    # smoke test: construction with an aware index column must not raise
    DataFrame.from_records([record], index="begin_time")
||||
def test_frame_tz_localize(self):
|
||||
rng = date_range("1/1/2011", periods=100, freq="H")
|
||||
|
||||
df = DataFrame({"a": 1}, index=rng)
|
||||
result = df.tz_localize("utc")
|
||||
expected = DataFrame({"a": 1}, rng.tz_localize("UTC"))
|
||||
assert result.index.tz.zone == "UTC"
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = df.T
|
||||
result = df.tz_localize("utc", axis=1)
|
||||
assert result.columns.tz.zone == "UTC"
|
||||
tm.assert_frame_equal(result, expected.T)
|
||||
|
||||
def test_frame_tz_convert(self):
|
||||
rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")
|
||||
|
||||
df = DataFrame({"a": 1}, index=rng)
|
||||
result = df.tz_convert("Europe/Berlin")
|
||||
expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
|
||||
assert result.index.tz.zone == "Europe/Berlin"
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = df.T
|
||||
result = df.tz_convert("Europe/Berlin", axis=1)
|
||||
assert result.columns.tz.zone == "Europe/Berlin"
|
||||
tm.assert_frame_equal(result, expected.T)
|
||||
|
||||
def test_frame_join_tzaware(self):
|
||||
test1 = DataFrame(
|
||||
np.zeros((6, 3)),
|
||||
index=date_range(
|
||||
"2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central"
|
||||
),
|
||||
)
|
||||
test2 = DataFrame(
|
||||
np.zeros((3, 3)),
|
||||
index=date_range(
|
||||
"2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central"
|
||||
),
|
||||
columns=range(3, 6),
|
||||
)
|
||||
|
||||
result = test1.join(test2, how="outer")
|
||||
ex_index = test1.index.union(test2.index)
|
||||
|
||||
tm.assert_index_equal(result.index, ex_index)
|
||||
assert result.index.tz.zone == "US/Central"
|
||||
|
||||
def test_frame_add_tz_mismatch_converts_to_utc(self):
    """Arithmetic across mismatched timezones aligns the result index to UTC."""
    rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern")
    df = DataFrame(np.random.randn(len(rng)), index=rng, columns=["a"])
    df_moscow = df.tz_convert("Europe/Moscow")

    # addition commutes: either operand order ends up in UTC
    assert (df + df_moscow).index.tz is pytz.utc
    assert (df_moscow + df).index.tz is pytz.utc
||||
def test_frame_align_aware(self):
    """align() preserves matching timezones and converts mismatched ones to UTC.

    Covers frame/frame, frame/Series and Series/frame alignment.
    """
    idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern")
    idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern")
    df1 = DataFrame(np.random.randn(len(idx1), 3), idx1)
    df2 = DataFrame(np.random.randn(len(idx2), 3), idx2)
    new1, new2 = df1.align(df2)
    # same tz on both sides: alignment keeps it
    assert df1.index.tz == new1.index.tz
    assert df2.index.tz == new2.index.tz

    # different timezones convert to UTC

    # frame with frame
    df1_central = df1.tz_convert("US/Central")
    new1, new2 = df1.align(df1_central)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # frame with Series
    new1, new2 = df1.align(df1_central[0], axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC

    # Series with frame
    # BUG FIX: the return value was previously discarded, so the two
    # assertions below re-checked the stale new1/new2 from the case above
    # and this code path was never actually verified.
    new1, new2 = df1[0].align(df1_central, axis=0)
    assert new1.index.tz == pytz.UTC
    assert new2.index.tz == pytz.UTC
||||
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
def test_frame_no_datetime64_dtype(self, tz):
    """Aware datetime data keeps a DatetimeTZDtype, never plain datetime64."""
    # after GH#7822
    # these retain the timezones on dict construction
    dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
    dr_tz = dr.tz_localize(tz)
    df = DataFrame({"A": "foo", "B": dr_tz}, index=dr)
    tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo)
    assert df["B"].dtype == tz_expected

    # GH#2810 (with timezones)
    # column assignment from lists of datetime objects, naive and aware
    datetimes_naive = [ts.to_pydatetime() for ts in dr]
    datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
    df = DataFrame({"dr": dr})
    df["dr_tz"] = dr_tz
    df["datetimes_naive"] = datetimes_naive
    df["datetimes_with_tz"] = datetimes_with_tz
    result = df.dtypes
    # naive inputs land as datetime64[ns]; aware inputs keep their tz dtype
    expected = Series(
        [
            np.dtype("datetime64[ns]"),
            DatetimeTZDtype(tz=tz),
            np.dtype("datetime64[ns]"),
            DatetimeTZDtype(tz=tz),
        ],
        index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"],
    )
    tm.assert_series_equal(result, expected)
||||
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
|
||||
def test_frame_reset_index(self, tz):
|
||||
dr = date_range("2012-06-02", periods=10, tz=tz)
|
||||
df = DataFrame(np.random.randn(len(dr)), dr)
|
||||
roundtripped = df.reset_index().set_index("index")
|
||||
xp = df.index.tz
|
||||
rs = roundtripped.index.tz
|
||||
assert xp == rs
|
||||
|
||||
@pytest.mark.parametrize("tz", [None, "America/New_York"])
|
||||
def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
|
||||
# GH 19970
|
||||
idx = date_range("20161101", "20161130", freq="4H", tz=tz)
|
||||
df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx)
|
||||
result = df.T == df.T
|
||||
expected = DataFrame(True, index=list("ab"), columns=idx)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("copy", [True, False])
|
||||
@pytest.mark.parametrize(
|
||||
"method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]]
|
||||
)
|
||||
def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz):
|
||||
# GH 6326
|
||||
result = DataFrame(
|
||||
np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz)
|
||||
)
|
||||
getattr(result, method)("UTC", copy=copy)
|
||||
expected = DataFrame(
|
||||
np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz)
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture):
|
||||
# GH 25843
|
||||
tz = tz_aware_fixture
|
||||
result = DataFrame({"d": [pd.Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]")
|
||||
expected = DataFrame({"d": [pd.Timestamp("2019")]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
1333
venv/lib/python3.6/site-packages/pandas/tests/frame/test_to_csv.py
Normal file
1333
venv/lib/python3.6/site-packages/pandas/tests/frame/test_to_csv.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,41 @@
|
||||
import pytest
|
||||
|
||||
from pandas.core.frame import DataFrame
|
||||
|
||||
|
||||
@pytest.fixture
def dataframe():
    """Minimal two-column integer DataFrame shared by the validation tests."""
    return DataFrame({"a": [1, 2], "b": [3, 4]})
|
||||
|
||||
|
||||
class TestDataFrameValidate:
    """Tests for error handling related to data types of method arguments."""

    @pytest.mark.parametrize(
        "func",
        [
            "query",
            "eval",
            "set_index",
            "reset_index",
            "dropna",
            "drop_duplicates",
            "sort_values",
        ],
    )
    @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
    def test_validate_bool_args(self, dataframe, func, inplace):
        """Every listed method must reject a non-bool ``inplace`` with ValueError."""
        # Extra required arguments per method; the rest take only inplace.
        required = {
            "query": {"expr": "a > b"},
            "eval": {"expr": "a + b"},
            "set_index": {"keys": ["a"]},
            "sort_values": {"by": ["a"]},
        }
        kwargs = dict(inplace=inplace, **required.get(func, {}))

        msg = 'For argument "inplace" expected type bool'
        with pytest.raises(ValueError, match=msg):
            getattr(dataframe, func)(**kwargs)
|
||||
Reference in New Issue
Block a user