from datetime import timedelta
import operator
from string import ascii_lowercase
import warnings

import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
    isna,
    notna,
    to_datetime,
    to_timedelta,
)
import pandas.core.algorithms as algorithms
import pandas.core.nanops as nanops
import pandas.util.testing as tm


def assert_stat_op_calc(
    opname,
    alternative,
    frame,
    has_skipna=True,
    check_dtype=True,
    check_dates=False,
    check_less_precise=False,
    skipna_alternative=None,
):
    """
    Check that operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    check_dtype : bool, default True
        Whether the dtypes of the result of "frame.opname()" and
        "alternative(frame)" should be checked.
    check_dates : bool, default False
        Whether opname should be tested on a Datetime Series
    check_less_precise : bool, default False
        Whether results should only be compared approximately;
        passed on to tm.assert_series_equal
    skipna_alternative : function, default None
        NaN-safe version of alternative
    """

    f = getattr(frame, opname)

    if check_dates:
        df = DataFrame({"b": date_range("1/1/2001", periods=2)})
        result = getattr(df, opname)()
        assert isinstance(result, Series)

        df["a"] = range(len(df))
        result = getattr(df, opname)()
        assert isinstance(result, Series)
        assert len(result)

    if has_skipna:

        def wrapper(x):
            return alternative(x.values)

        skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative)
        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        tm.assert_series_equal(
            result0,
            frame.apply(wrapper),
            check_dtype=check_dtype,
            check_less_precise=check_less_precise,
        )
        # HACK: win32
        tm.assert_series_equal(
            result1,
            frame.apply(wrapper, axis=1),
            check_dtype=False,
            check_less_precise=check_less_precise,
        )
    else:
        skipna_wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)
    tm.assert_series_equal(
        result0,
        frame.apply(skipna_wrapper),
        check_dtype=check_dtype,
        check_less_precise=check_less_precise,
    )

    if opname in ["sum", "prod"]:
        expected = frame.apply(skipna_wrapper, axis=1)
        tm.assert_series_equal(
            result1, expected, check_dtype=False, check_less_precise=check_less_precise
        )

    # check dtypes
    if check_dtype:
        lcd_dtype = frame.values.dtype
        assert lcd_dtype == result0.dtype
        assert lcd_dtype == result1.dtype

    # bad axis
    with pytest.raises(ValueError, match="No axis named 2"):
        f(axis=2)

    # all NA case
    if has_skipna:
        all_na = frame * np.NaN
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname in ["sum", "prod"]:
            unit = 1 if opname == "prod" else 0  # result for empty sum/prod
            expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
            tm.assert_series_equal(r0, expected)
            expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
            tm.assert_series_equal(r1, expected)


def assert_stat_op_api(opname, float_frame, float_string_frame, has_numeric_only=False):
    """
    Check that API for operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    float_frame : DataFrame
        DataFrame with columns of type float
    float_string_frame : DataFrame
        DataFrame with both float and string columns
    has_numeric_only : bool, default False
        Whether the method "opname" has the kwarg "numeric_only"
    """

    # make sure works on mixed-type frame
    getattr(float_string_frame, opname)(axis=0)
    getattr(float_string_frame, opname)(axis=1)

    if has_numeric_only:
        getattr(float_string_frame, opname)(axis=0, numeric_only=True)
        getattr(float_string_frame, opname)(axis=1, numeric_only=True)
        getattr(float_frame, opname)(axis=0, numeric_only=False)
        getattr(float_frame, opname)(axis=1, numeric_only=False)


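# A minimal usage sketch of the two helpers above (hypothetical, not part of
# the original suite): the reduction tests further down drive them roughly
# like this, assuming the standard float_frame/float_string_frame fixtures.
#
#     def test_sum_sketch(float_frame, float_string_frame):
#         assert_stat_op_calc("sum", np.sum, float_frame,
#                             skipna_alternative=np.nansum)
#         assert_stat_op_api("sum", float_frame, float_string_frame,
#                            has_numeric_only=True)

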
def assert_bool_op_calc(opname, alternative, frame, has_skipna=True):
    """
    Check that bool operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    """

    f = getattr(frame, opname)

    if has_skipna:

        def skipna_wrapper(x):
            nona = x.dropna().values
            return alternative(nona)

        def wrapper(x):
            return alternative(x.values)

        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)

        tm.assert_series_equal(result0, frame.apply(wrapper))
        tm.assert_series_equal(
            result1, frame.apply(wrapper, axis=1), check_dtype=False
        )  # HACK: win32
    else:
        skipna_wrapper = alternative
        wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)

    tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
    tm.assert_series_equal(
        result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False
    )

    # bad axis
    with pytest.raises(ValueError, match="No axis named 2"):
        f(axis=2)

    # all NA case
    if has_skipna:
        all_na = frame * np.NaN
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname == "any":
            assert not r0.any()
            assert not r1.any()
        else:
            assert r0.all()
            assert r1.all()


def assert_bool_op_api(
    opname, bool_frame_with_na, float_string_frame, has_bool_only=False
):
    """
    Check that API for boolean operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    bool_frame_with_na : DataFrame
        DataFrame with boolean columns and some missing values
    float_string_frame : DataFrame
        DataFrame with both float and string columns
    has_bool_only : bool, default False
        Whether the method "opname" has the kwarg "bool_only"
    """
    # make sure op works on mixed-type frame
    mixed = float_string_frame
    mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5
    getattr(mixed, opname)(axis=0)
    getattr(mixed, opname)(axis=1)

    if has_bool_only:
        getattr(mixed, opname)(axis=0, bool_only=True)
        getattr(mixed, opname)(axis=1, bool_only=True)
        getattr(bool_frame_with_na, opname)(axis=0, bool_only=False)
        getattr(bool_frame_with_na, opname)(axis=1, bool_only=False)


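# Usage sketch for the boolean helpers above (hypothetical, mirroring how
# test_any_all below invokes them with the bool_frame_with_na fixture):
#
#     def test_any_sketch(bool_frame_with_na, float_string_frame):
#         assert_bool_op_calc("any", np.any, bool_frame_with_na, has_skipna=True)
#         assert_bool_op_api("any", bool_frame_with_na, float_string_frame,
#                            has_bool_only=True)

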
class TestDataFrameAnalytics:
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Correlation and covariance
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_pearson(self, float_frame):
|
|
float_frame["A"][:5] = np.nan
|
|
float_frame["B"][5:10] = np.nan
|
|
|
|
self._check_method(float_frame, "pearson")
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_kendall(self, float_frame):
|
|
float_frame["A"][:5] = np.nan
|
|
float_frame["B"][5:10] = np.nan
|
|
|
|
self._check_method(float_frame, "kendall")
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_spearman(self, float_frame):
|
|
float_frame["A"][:5] = np.nan
|
|
float_frame["B"][5:10] = np.nan
|
|
|
|
self._check_method(float_frame, "spearman")
|
|
|
|
def _check_method(self, frame, method="pearson"):
|
|
correls = frame.corr(method=method)
|
|
expected = frame["A"].corr(frame["C"], method=method)
|
|
tm.assert_almost_equal(correls["A"]["C"], expected)
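# frame.corr(method=...) returns the full symmetric correlation matrix, so
# checking a single off-diagonal entry against Series.corr is sufficient here.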
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_non_numeric(self, float_frame, float_string_frame):
|
|
float_frame["A"][:5] = np.nan
|
|
float_frame["B"][5:10] = np.nan
|
|
|
|
# exclude non-numeric types
|
|
result = float_string_frame.corr()
|
|
expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@td.skip_if_no_scipy
|
|
@pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
|
|
def test_corr_nooverlap(self, meth):
|
|
# nothing in common
|
|
df = DataFrame(
|
|
{
|
|
"A": [1, 1.5, 1, np.nan, np.nan, np.nan],
|
|
"B": [np.nan, np.nan, np.nan, 1, 1.5, 1],
|
|
"C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
|
|
}
|
|
)
|
|
rs = df.corr(meth)
|
|
assert isna(rs.loc["A", "B"])
|
|
assert isna(rs.loc["B", "A"])
|
|
assert rs.loc["A", "A"] == 1
|
|
assert rs.loc["B", "B"] == 1
|
|
assert isna(rs.loc["C", "C"])
|
|
|
|
@td.skip_if_no_scipy
|
|
@pytest.mark.parametrize("meth", ["pearson", "spearman"])
|
|
def test_corr_constant(self, meth):
|
|
# constant --> all NA
|
|
|
|
df = DataFrame(
|
|
{
|
|
"A": [1, 1, 1, np.nan, np.nan, np.nan],
|
|
"B": [np.nan, np.nan, np.nan, 1, 1, 1],
|
|
}
|
|
)
|
|
rs = df.corr(meth)
|
|
assert isna(rs.values).all()
|
|
|
|
def test_corr_int(self):
|
|
# dtypes other than float64 #1761
|
|
df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})
|
|
|
|
df3.cov()
|
|
df3.corr()
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_int_and_boolean(self):
|
|
# when dtypes of pandas series are different
|
|
# then ndarray will have dtype=object,
|
|
# so it needs to be properly handled
|
|
df = DataFrame({"a": [True, False], "b": [1, 0]})
|
|
|
|
expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
|
|
for meth in ["pearson", "kendall", "spearman"]:
|
|
|
|
with warnings.catch_warnings(record=True):
|
|
warnings.simplefilter("ignore", RuntimeWarning)
|
|
result = df.corr(meth)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_corr_cov_independent_index_column(self):
|
|
# GH 14617
|
|
df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd"))
|
|
for method in ["cov", "corr"]:
|
|
result = getattr(df, method)()
|
|
assert result.index is not result.columns
|
|
assert result.index.equals(result.columns)
|
|
|
|
def test_corr_invalid_method(self):
|
|
# GH 22298
|
|
df = pd.DataFrame(np.random.normal(size=(10, 2)))
|
|
msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
|
|
with pytest.raises(ValueError, match=msg):
|
|
df.corr(method="____")
|
|
|
|
def test_cov(self, float_frame, float_string_frame):
|
|
# min_periods no NAs (corner case)
|
|
expected = float_frame.cov()
|
|
result = float_frame.cov(min_periods=len(float_frame))
|
|
|
|
tm.assert_frame_equal(expected, result)
|
|
|
|
result = float_frame.cov(min_periods=len(float_frame) + 1)
|
|
assert isna(result.values).all()
|
|
|
|
# with NAs
|
|
frame = float_frame.copy()
|
|
frame["A"][:5] = np.nan
|
|
frame["B"][5:10] = np.nan
|
|
result = float_frame.cov(min_periods=len(float_frame) - 8)
|
|
expected = float_frame.cov()
|
|
expected.loc["A", "B"] = np.nan
|
|
expected.loc["B", "A"] = np.nan
|
|
|
|
# regular
|
|
float_frame["A"][:5] = np.nan
|
|
float_frame["B"][:10] = np.nan
|
|
cov = float_frame.cov()
|
|
|
|
tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"]))
|
|
|
|
# exclude non-numeric types
|
|
result = float_string_frame.cov()
|
|
expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Single column frame
|
|
df = DataFrame(np.linspace(0.0, 1.0, 10))
|
|
result = df.cov()
|
|
expected = DataFrame(
|
|
np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
df.loc[0] = np.nan
|
|
result = df.cov()
|
|
expected = DataFrame(
|
|
np.cov(df.values[1:].T).reshape((1, 1)),
|
|
index=df.columns,
|
|
columns=df.columns,
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_corrwith(self, datetime_frame):
|
|
a = datetime_frame
|
|
noise = Series(np.random.randn(len(a)), index=a.index)
|
|
|
|
b = datetime_frame.add(noise, axis=0)
|
|
|
|
# make sure order does not matter
|
|
b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
|
|
del b["B"]
|
|
|
|
colcorr = a.corrwith(b, axis=0)
|
|
tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"]))
|
|
|
|
rowcorr = a.corrwith(b, axis=1)
|
|
tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
|
|
|
|
dropped = a.corrwith(b, axis=0, drop=True)
|
|
tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"]))
|
|
assert "B" not in dropped
|
|
|
|
dropped = a.corrwith(b, axis=1, drop=True)
|
|
assert a.index[-1] not in dropped.index
|
|
|
|
# non time-series data
|
|
index = ["a", "b", "c", "d", "e"]
|
|
columns = ["one", "two", "three", "four"]
|
|
df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns)
|
|
df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns)
|
|
correls = df1.corrwith(df2, axis=1)
|
|
for row in index[:4]:
|
|
tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))
|
|
|
|
def test_corrwith_with_objects(self):
|
|
df1 = tm.makeTimeDataFrame()
|
|
df2 = tm.makeTimeDataFrame()
|
|
cols = ["A", "B", "C", "D"]
|
|
|
|
df1["obj"] = "foo"
|
|
df2["obj"] = "bar"
|
|
|
|
result = df1.corrwith(df2)
|
|
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df1.corrwith(df2, axis=1)
|
|
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_corrwith_series(self, datetime_frame):
|
|
result = datetime_frame.corrwith(datetime_frame["A"])
|
|
expected = datetime_frame.apply(datetime_frame["A"].corr)
|
|
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_corrwith_matches_corrcoef(self):
|
|
df1 = DataFrame(np.arange(10000), columns=["a"])
|
|
df2 = DataFrame(np.arange(10000) ** 2, columns=["a"])
|
|
c1 = df1.corrwith(df2)["a"]
|
|
c2 = np.corrcoef(df1["a"], df2["a"])[0][1]
|
|
|
|
tm.assert_almost_equal(c1, c2)
|
|
assert c1 < 1
|
|
|
|
def test_corrwith_mixed_dtypes(self):
|
|
# GH 18570
|
|
df = pd.DataFrame(
|
|
{"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
|
|
)
|
|
s = pd.Series([0, 6, 7, 3])
|
|
result = df.corrwith(s)
|
|
corrs = [df["a"].corr(s), df["b"].corr(s)]
|
|
expected = pd.Series(data=corrs, index=["a", "b"])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_corrwith_index_intersection(self):
|
|
df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])
|
|
df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"])
|
|
|
|
result = df1.corrwith(df2, drop=True).index.sort_values()
|
|
expected = df1.columns.intersection(df2.columns).sort_values()
|
|
tm.assert_index_equal(result, expected)
|
|
|
|
def test_corrwith_index_union(self):
|
|
df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])
|
|
df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"])
|
|
|
|
result = df1.corrwith(df2, drop=False).index.sort_values()
|
|
expected = df1.columns.union(df2.columns).sort_values()
|
|
tm.assert_index_equal(result, expected)
|
|
|
|
def test_corrwith_dup_cols(self):
|
|
# GH 21925
|
|
df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T)
|
|
df2 = df1.copy()
|
|
df2 = pd.concat((df2, df2[0]), axis=1)
|
|
|
|
result = df1.corrwith(df2)
|
|
expected = pd.Series(np.ones(4), index=[0, 0, 1, 2])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corrwith_spearman(self):
|
|
# GH 21925
|
|
df = pd.DataFrame(np.random.random(size=(100, 3)))
|
|
result = df.corrwith(df ** 2, method="spearman")
|
|
expected = Series(np.ones(len(result)))
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corrwith_kendall(self):
|
|
# GH 21925
|
|
df = pd.DataFrame(np.random.random(size=(100, 3)))
|
|
result = df.corrwith(df ** 2, method="kendall")
|
|
expected = Series(np.ones(len(result)))
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Describe
|
|
|
|
def test_bool_describe_in_mixed_frame(self):
|
|
df = DataFrame(
|
|
{
|
|
"string_data": ["a", "b", "c", "d", "e"],
|
|
"bool_data": [True, True, False, False, False],
|
|
"int_data": [10, 20, 30, 40, 50],
|
|
}
|
|
)
|
|
|
|
# Integer data are included in .describe() output,
|
|
# Boolean and string data are not.
|
|
result = df.describe()
|
|
expected = DataFrame(
|
|
{"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
|
|
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Top value is a boolean value that is False
|
|
result = df.describe(include=["bool"])
|
|
|
|
expected = DataFrame(
|
|
{"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_describe_empty_object(self):
|
|
# https://github.com/pandas-dev/pandas/issues/27183
|
|
df = pd.DataFrame({"A": [None, None]}, dtype=object)
|
|
result = df.describe()
|
|
expected = pd.DataFrame(
|
|
{"A": [0, 0, np.nan, np.nan]},
|
|
dtype=object,
|
|
index=["count", "unique", "top", "freq"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.iloc[:0].describe()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_describe_bool_frame(self):
|
|
# GH 13891
|
|
df = pd.DataFrame(
|
|
{
|
|
"bool_data_1": [False, False, True, True],
|
|
"bool_data_2": [False, True, True, True],
|
|
}
|
|
)
|
|
result = df.describe()
|
|
expected = DataFrame(
|
|
{"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]},
|
|
index=["count", "unique", "top", "freq"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = pd.DataFrame(
|
|
{
|
|
"bool_data": [False, False, True, True, False],
|
|
"int_data": [0, 1, 2, 3, 4],
|
|
}
|
|
)
|
|
result = df.describe()
|
|
expected = DataFrame(
|
|
{"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
|
|
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = pd.DataFrame(
|
|
{"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
|
|
)
|
|
result = df.describe()
|
|
expected = DataFrame(
|
|
{"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]},
|
|
index=["count", "unique", "top", "freq"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_describe_categorical(self):
|
|
df = DataFrame({"value": np.random.randint(0, 10000, 100)})
|
|
labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
|
|
cat_labels = Categorical(labels, labels)
|
|
|
|
df = df.sort_values(by=["value"], ascending=True)
|
|
df["value_group"] = pd.cut(
|
|
df.value, range(0, 10500, 500), right=False, labels=cat_labels
|
|
)
|
|
cat = df
|
|
|
|
# Categoricals should not show up together with numerical columns
|
|
result = cat.describe()
|
|
assert len(result.columns) == 1
|
|
|
|
# In a frame, describe() for the cat should be the same as for string
|
|
# arrays (count, unique, top, freq)
|
|
|
|
cat = Categorical(
|
|
["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
|
|
)
|
|
s = Series(cat)
|
|
result = s.describe()
|
|
expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
cat = Series(Categorical(["a", "b", "c", "c"]))
|
|
df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
|
|
result = df3.describe()
|
|
tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
|
|
|
|
def test_describe_empty_categorical_column(self):
|
|
# GH 26397
|
|
# Ensure the index of an empty categorical DataFrame column
|
|
# also contains (count, unique, top, freq)
|
|
df = pd.DataFrame({"empty_col": Categorical([])})
|
|
result = df.describe()
|
|
expected = DataFrame(
|
|
{"empty_col": [0, 0, np.nan, np.nan]},
|
|
index=["count", "unique", "top", "freq"],
|
|
dtype="object",
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
# ensure NaN, not None
|
|
assert np.isnan(result.iloc[2, 0])
|
|
assert np.isnan(result.iloc[3, 0])
|
|
|
|
def test_describe_categorical_columns(self):
|
|
# GH 11558
|
|
columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
|
|
df = DataFrame(
|
|
{
|
|
"int1": [10, 20, 30, 40, 50],
|
|
"int2": [10, 20, 30, 40, 50],
|
|
"obj": ["A", 0, None, "X", 1],
|
|
},
|
|
columns=columns,
|
|
)
|
|
result = df.describe()
|
|
|
|
exp_columns = pd.CategoricalIndex(
|
|
["int1", "int2"],
|
|
categories=["int1", "int2", "obj"],
|
|
ordered=True,
|
|
name="XXX",
|
|
)
|
|
expected = DataFrame(
|
|
{
|
|
"int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
|
|
"int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
|
|
},
|
|
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
|
columns=exp_columns,
|
|
)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
tm.assert_categorical_equal(result.columns.values, expected.columns.values)
|
|
|
|
def test_describe_datetime_columns(self):
|
|
columns = pd.DatetimeIndex(
|
|
["2011-01-01", "2011-02-01", "2011-03-01"],
|
|
freq="MS",
|
|
tz="US/Eastern",
|
|
name="XXX",
|
|
)
|
|
df = DataFrame(
|
|
{
|
|
0: [10, 20, 30, 40, 50],
|
|
1: [10, 20, 30, 40, 50],
|
|
2: ["A", 0, None, "X", 1],
|
|
}
|
|
)
|
|
df.columns = columns
|
|
result = df.describe()
|
|
|
|
exp_columns = pd.DatetimeIndex(
|
|
["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX"
|
|
)
|
|
expected = DataFrame(
|
|
{
|
|
0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50],
|
|
1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50],
|
|
},
|
|
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
|
)
|
|
expected.columns = exp_columns
|
|
tm.assert_frame_equal(result, expected)
|
|
assert result.columns.freq == "MS"
|
|
assert result.columns.tz == expected.columns.tz
|
|
|
|
def test_describe_timedelta_values(self):
|
|
# GH 6145
|
|
t1 = pd.timedelta_range("1 days", freq="D", periods=5)
|
|
t2 = pd.timedelta_range("1 hours", freq="H", periods=5)
|
|
df = pd.DataFrame({"t1": t1, "t2": t2})
|
|
|
|
expected = DataFrame(
|
|
{
|
|
"t1": [
|
|
5,
|
|
pd.Timedelta("3 days"),
|
|
df.iloc[:, 0].std(),
|
|
pd.Timedelta("1 days"),
|
|
pd.Timedelta("2 days"),
|
|
pd.Timedelta("3 days"),
|
|
pd.Timedelta("4 days"),
|
|
pd.Timedelta("5 days"),
|
|
],
|
|
"t2": [
|
|
5,
|
|
pd.Timedelta("3 hours"),
|
|
df.iloc[:, 1].std(),
|
|
pd.Timedelta("1 hours"),
|
|
pd.Timedelta("2 hours"),
|
|
pd.Timedelta("3 hours"),
|
|
pd.Timedelta("4 hours"),
|
|
pd.Timedelta("5 hours"),
|
|
],
|
|
},
|
|
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
|
)
|
|
|
|
result = df.describe()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
exp_repr = (
|
|
" t1 t2\n"
|
|
"count 5 5\n"
|
|
"mean 3 days 00:00:00 0 days 03:00:00\n"
|
|
"std 1 days 13:56:50.394919 0 days 01:34:52.099788\n"
|
|
"min 1 days 00:00:00 0 days 01:00:00\n"
|
|
"25% 2 days 00:00:00 0 days 02:00:00\n"
|
|
"50% 3 days 00:00:00 0 days 03:00:00\n"
|
|
"75% 4 days 00:00:00 0 days 04:00:00\n"
|
|
"max 5 days 00:00:00 0 days 05:00:00"
|
|
)
|
|
assert repr(result) == exp_repr
|
|
|
|
def test_describe_tz_values(self, tz_naive_fixture):
|
|
# GH 21332
|
|
tz = tz_naive_fixture
|
|
s1 = Series(range(5))
|
|
start = Timestamp(2018, 1, 1)
|
|
end = Timestamp(2018, 1, 5)
|
|
s2 = Series(date_range(start, end, tz=tz))
|
|
df = pd.DataFrame({"s1": s1, "s2": s2})
|
|
|
|
expected = DataFrame(
|
|
{
|
|
"s1": [
|
|
5,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
2,
|
|
1.581139,
|
|
0,
|
|
1,
|
|
2,
|
|
3,
|
|
4,
|
|
],
|
|
"s2": [
|
|
5,
|
|
5,
|
|
s2.value_counts().index[0],
|
|
1,
|
|
start.tz_localize(tz),
|
|
end.tz_localize(tz),
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
],
|
|
},
|
|
index=[
|
|
"count",
|
|
"unique",
|
|
"top",
|
|
"freq",
|
|
"first",
|
|
"last",
|
|
"mean",
|
|
"std",
|
|
"min",
|
|
"25%",
|
|
"50%",
|
|
"75%",
|
|
"max",
|
|
],
|
|
)
|
|
result = df.describe(include="all")
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_describe_percentiles_integer_idx(self):
|
|
# Issue 26660
|
|
df = pd.DataFrame({"x": [1]})
|
|
pct = np.linspace(0, 1, 10 + 1)
|
|
result = df.describe(percentiles=pct)
|
|
|
|
expected = DataFrame(
|
|
{"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]},
|
|
index=[
|
|
"count",
|
|
"mean",
|
|
"std",
|
|
"min",
|
|
"0%",
|
|
"10%",
|
|
"20%",
|
|
"30%",
|
|
"40%",
|
|
"50%",
|
|
"60%",
|
|
"70%",
|
|
"80%",
|
|
"90%",
|
|
"100%",
|
|
"max",
|
|
],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Reductions
|
|
|
|
def test_stat_op_api(self, float_frame, float_string_frame):
|
|
assert_stat_op_api(
|
|
"count", float_frame, float_string_frame, has_numeric_only=True
|
|
)
|
|
assert_stat_op_api(
|
|
"sum", float_frame, float_string_frame, has_numeric_only=True
|
|
)
|
|
|
|
assert_stat_op_api("nunique", float_frame, float_string_frame)
|
|
assert_stat_op_api("mean", float_frame, float_string_frame)
|
|
assert_stat_op_api("product", float_frame, float_string_frame)
|
|
assert_stat_op_api("median", float_frame, float_string_frame)
|
|
assert_stat_op_api("min", float_frame, float_string_frame)
|
|
assert_stat_op_api("max", float_frame, float_string_frame)
|
|
assert_stat_op_api("mad", float_frame, float_string_frame)
|
|
assert_stat_op_api("var", float_frame, float_string_frame)
|
|
assert_stat_op_api("std", float_frame, float_string_frame)
|
|
assert_stat_op_api("sem", float_frame, float_string_frame)
|
|
assert_stat_op_api("median", float_frame, float_string_frame)
|
|
|
|
try:
|
|
from scipy.stats import skew, kurtosis # noqa:F401
|
|
|
|
assert_stat_op_api("skew", float_frame, float_string_frame)
|
|
assert_stat_op_api("kurt", float_frame, float_string_frame)
|
|
except ImportError:
|
|
pass
|
|
|
|
def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame):
|
|
def count(s):
|
|
return notna(s).sum()
|
|
|
|
def nunique(s):
|
|
return len(algorithms.unique1d(s.dropna()))
|
|
|
|
def mad(x):
|
|
return np.abs(x - x.mean()).mean()
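# mad above is the mean absolute deviation about the mean, which is what
# DataFrame.mad computes column-wise.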
|
|
|
|
def var(x):
|
|
return np.var(x, ddof=1)
|
|
|
|
def std(x):
|
|
return np.std(x, ddof=1)
|
|
|
|
def sem(x):
|
|
return np.std(x, ddof=1) / np.sqrt(len(x))
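# sem above is the standard error of the mean: sample std (ddof=1) divided
# by sqrt(n), matching DataFrame.sem.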
|
|
|
|
def skewness(x):
|
|
from scipy.stats import skew # noqa:F811
|
|
|
|
if len(x) < 3:
|
|
return np.nan
|
|
return skew(x, bias=False)
|
|
|
|
def kurt(x):
|
|
from scipy.stats import kurtosis # noqa:F811
|
|
|
|
if len(x) < 4:
|
|
return np.nan
|
|
return kurtosis(x, bias=False)
|
|
|
|
assert_stat_op_calc(
|
|
"nunique",
|
|
nunique,
|
|
float_frame_with_na,
|
|
has_skipna=False,
|
|
check_dtype=False,
|
|
check_dates=True,
|
|
)
|
|
|
|
# mixed types (with upcasting happening)
|
|
assert_stat_op_calc(
|
|
"sum",
|
|
np.sum,
|
|
mixed_float_frame.astype("float32"),
|
|
check_dtype=False,
|
|
check_less_precise=True,
|
|
)
|
|
|
|
assert_stat_op_calc(
|
|
"sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum
|
|
)
|
|
assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True)
|
|
assert_stat_op_calc("product", np.prod, float_frame_with_na)
|
|
|
|
assert_stat_op_calc("mad", mad, float_frame_with_na)
|
|
assert_stat_op_calc("var", var, float_frame_with_na)
|
|
assert_stat_op_calc("std", std, float_frame_with_na)
|
|
assert_stat_op_calc("sem", sem, float_frame_with_na)
|
|
|
|
assert_stat_op_calc(
|
|
"count",
|
|
count,
|
|
float_frame_with_na,
|
|
has_skipna=False,
|
|
check_dtype=False,
|
|
check_dates=True,
|
|
)
|
|
|
|
try:
|
|
from scipy.stats import skew, kurtosis  # noqa:F401
|
|
|
|
assert_stat_op_calc("skew", skewness, float_frame_with_na)
|
|
assert_stat_op_calc("kurt", kurt, float_frame_with_na)
|
|
except ImportError:
|
|
pass
|
|
|
|
# TODO: Ensure warning isn't emitted in the first place
|
|
@pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
|
|
def test_median(self, float_frame_with_na, int_frame):
|
|
def wrapper(x):
|
|
if isna(x).any():
|
|
return np.nan
|
|
return np.median(x)
|
|
|
|
assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True)
|
|
assert_stat_op_calc(
|
|
"median", wrapper, int_frame, check_dtype=False, check_dates=True
|
|
)
|
|
|
|
@pytest.mark.parametrize(
|
|
"method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"]
|
|
)
|
|
def test_stat_operators_attempt_obj_array(self, method):
|
|
# GH#676
|
|
data = {
|
|
"a": [
|
|
-0.00049987540199591344,
|
|
-0.0016467257772919831,
|
|
0.00067695870775883013,
|
|
],
|
|
"b": [-0, -0, 0.0],
|
|
"c": [
|
|
0.00031111847529610595,
|
|
0.0014902627951905339,
|
|
-0.00094099200035979691,
|
|
],
|
|
}
|
|
df1 = DataFrame(data, index=["foo", "bar", "baz"], dtype="O")
|
|
|
|
df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object)
|
|
|
|
for df in [df1, df2]:
|
|
assert df.values.dtype == np.object_
|
|
result = getattr(df, method)(1)
|
|
expected = getattr(df.astype("f8"), method)(1)
|
|
|
|
if method in ["sum", "prod"]:
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
|
|
def test_mixed_ops(self, op):
|
|
# GH#16116
|
|
df = DataFrame(
|
|
{
|
|
"int": [1, 2, 3, 4],
|
|
"float": [1.0, 2.0, 3.0, 4.0],
|
|
"str": ["a", "b", "c", "d"],
|
|
}
|
|
)
|
|
|
|
result = getattr(df, op)()
|
|
assert len(result) == 2
|
|
|
|
with pd.option_context("use_bottleneck", False):
|
|
result = getattr(df, op)()
|
|
assert len(result) == 2
|
|
|
|
def test_reduce_mixed_frame(self):
|
|
# GH 6806
|
|
df = DataFrame(
|
|
{
|
|
"bool_data": [True, True, False, False, False],
|
|
"int_data": [10, 20, 30, 40, 50],
|
|
"string_data": ["a", "b", "c", "d", "e"],
|
|
}
|
|
)
|
|
df.reindex(columns=["bool_data", "int_data", "string_data"])
|
|
test = df.sum(axis=0)
|
|
tm.assert_numpy_array_equal(
|
|
test.values, np.array([2, 150, "abcde"], dtype=object)
|
|
)
|
|
tm.assert_series_equal(test, df.T.sum(axis=1))
|
|
|
|
def test_nunique(self):
|
|
df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]})
|
|
tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2}))
|
|
tm.assert_series_equal(
|
|
df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3})
|
|
)
|
|
tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
|
|
tm.assert_series_equal(
|
|
df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2})
|
|
)
|
|
|
|
@pytest.mark.parametrize("tz", [None, "UTC"])
|
|
def test_mean_mixed_datetime_numeric(self, tz):
|
|
# https://github.com/pandas-dev/pandas/issues/24752
|
|
df = pd.DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2})
|
|
result = df.mean()
|
|
expected = pd.Series([1.0], index=["A"])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize("tz", [None, "UTC"])
|
|
def test_mean_excludes_datetimes(self, tz):
|
|
# https://github.com/pandas-dev/pandas/issues/24752
|
|
# Our long-term desired behavior is unclear, but the behavior in
|
|
# 0.24.0rc1 was buggy.
|
|
df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2})
|
|
result = df.mean()
|
|
expected = pd.Series()
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_var_std(self, datetime_frame):
|
|
result = datetime_frame.std(ddof=4)
|
|
expected = datetime_frame.apply(lambda x: x.std(ddof=4))
|
|
tm.assert_almost_equal(result, expected)
|
|
|
|
result = datetime_frame.var(ddof=4)
|
|
expected = datetime_frame.apply(lambda x: x.var(ddof=4))
|
|
tm.assert_almost_equal(result, expected)
|
|
|
|
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
|
|
result = nanops.nanvar(arr, axis=0)
|
|
assert not (result < 0).any()
|
|
|
|
with pd.option_context("use_bottleneck", False):
|
|
result = nanops.nanvar(arr, axis=0)
|
|
assert not (result < 0).any()
|
|
|
|
@pytest.mark.parametrize("meth", ["sem", "var", "std"])
|
|
def test_numeric_only_flag(self, meth):
|
|
# GH 9201
|
|
df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"])
|
|
# set one entry to a number in str format
|
|
df1.loc[0, "foo"] = "100"
|
|
|
|
df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"])
|
|
# set one entry to a non-number str
|
|
df2.loc[0, "foo"] = "a"
|
|
|
|
result = getattr(df1, meth)(axis=1, numeric_only=True)
|
|
expected = getattr(df1[["bar", "baz"]], meth)(axis=1)
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
result = getattr(df2, meth)(axis=1, numeric_only=True)
|
|
expected = getattr(df2[["bar", "baz"]], meth)(axis=1)
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
# df1 has all numbers, df2 has a letter inside
|
|
msg = r"unsupported operand type\(s\) for -: 'float' and 'str'"
|
|
with pytest.raises(TypeError, match=msg):
|
|
getattr(df1, meth)(axis=1, numeric_only=False)
|
|
msg = "could not convert string to float: 'a'"
|
|
with pytest.raises(TypeError, match=msg):
|
|
getattr(df2, meth)(axis=1, numeric_only=False)
|
|
|
|
def test_sem(self, datetime_frame):
|
|
result = datetime_frame.sem(ddof=4)
|
|
expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x)))
|
|
tm.assert_almost_equal(result, expected)
|
|
|
|
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
|
|
result = nanops.nansem(arr, axis=0)
|
|
assert not (result < 0).any()
|
|
|
|
with pd.option_context("use_bottleneck", False):
|
|
result = nanops.nansem(arr, axis=0)
|
|
assert not (result < 0).any()
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_kurt(self):
|
|
index = MultiIndex(
|
|
levels=[["bar"], ["one", "two", "three"], [0, 1]],
|
|
codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
|
|
)
|
|
df = DataFrame(np.random.randn(6, 3), index=index)
|
|
|
|
kurt = df.kurt()
|
|
kurt2 = df.kurt(level=0).xs("bar")
|
|
tm.assert_series_equal(kurt, kurt2, check_names=False)
|
|
assert kurt.name is None
|
|
assert kurt2.name == "bar"
|
|
|
|
@pytest.mark.parametrize(
|
|
"dropna, expected",
|
|
[
|
|
(
|
|
True,
|
|
{
|
|
"A": [12],
|
|
"B": [10.0],
|
|
"C": [1.0],
|
|
"D": ["a"],
|
|
"E": Categorical(["a"], categories=["a"]),
|
|
"F": to_datetime(["2000-1-2"]),
|
|
"G": to_timedelta(["1 days"]),
|
|
},
|
|
),
|
|
(
|
|
False,
|
|
{
|
|
"A": [12],
|
|
"B": [10.0],
|
|
"C": [np.nan],
|
|
"D": np.array([np.nan], dtype=object),
|
|
"E": Categorical([np.nan], categories=["a"]),
|
|
"F": [pd.NaT],
|
|
"G": to_timedelta([pd.NaT]),
|
|
},
|
|
),
|
|
(
|
|
True,
|
|
{
|
|
"H": [8, 9, np.nan, np.nan],
|
|
"I": [8, 9, np.nan, np.nan],
|
|
"J": [1, np.nan, np.nan, np.nan],
|
|
"K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]),
|
|
"L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]),
|
|
"M": to_timedelta(["1 days", "nan", "nan", "nan"]),
|
|
"N": [0, 1, 2, 3],
|
|
},
|
|
),
|
|
(
|
|
False,
|
|
{
|
|
"H": [8, 9, np.nan, np.nan],
|
|
"I": [8, 9, np.nan, np.nan],
|
|
"J": [1, np.nan, np.nan, np.nan],
|
|
"K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]),
|
|
"L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
|
|
"M": to_timedelta(["nan", "1 days", "nan", "nan"]),
|
|
"N": [0, 1, 2, 3],
|
|
},
|
|
),
|
|
],
|
|
)
|
|
def test_mode_dropna(self, dropna, expected):
|
|
|
|
df = DataFrame(
|
|
{
|
|
"A": [12, 12, 19, 11],
|
|
"B": [10, 10, np.nan, 3],
|
|
"C": [1, np.nan, np.nan, np.nan],
|
|
"D": [np.nan, np.nan, "a", np.nan],
|
|
"E": Categorical([np.nan, np.nan, "a", np.nan]),
|
|
"F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
|
|
"G": to_timedelta(["1 days", "nan", "nan", "nan"]),
|
|
"H": [8, 8, 9, 9],
|
|
"I": [9, 9, 8, 8],
|
|
"J": [1, 1, np.nan, np.nan],
|
|
"K": Categorical(["a", np.nan, "a", np.nan]),
|
|
"L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]),
|
|
"M": to_timedelta(["1 days", "nan", "1 days", "nan"]),
|
|
"N": np.arange(4, dtype="int64"),
|
|
}
|
|
)
|
|
|
|
result = df[sorted(list(expected.keys()))].mode(dropna=dropna)
|
|
expected = DataFrame(expected)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_mode_sortwarning(self):
|
|
# Check for the warning that is raised when the mode
|
|
# results cannot be sorted
|
|
|
|
df = DataFrame({"A": [np.nan, np.nan, "a", "a"]})
|
|
expected = DataFrame({"A": ["a", np.nan]})
|
|
|
|
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
|
result = df.mode(dropna=False)
|
|
result = result.sort_values(by="A").reset_index(drop=True)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_operators_timedelta64(self):
|
|
df = DataFrame(
|
|
dict(
|
|
A=date_range("2012-1-1", periods=3, freq="D"),
|
|
B=date_range("2012-1-2", periods=3, freq="D"),
|
|
C=Timestamp("20120101") - timedelta(minutes=5, seconds=5),
|
|
)
|
|
)
|
|
|
|
diffs = DataFrame(dict(A=df["A"] - df["C"], B=df["A"] - df["B"]))
|
|
|
|
# min
|
|
result = diffs.min()
|
|
assert result[0] == diffs.loc[0, "A"]
|
|
assert result[1] == diffs.loc[0, "B"]
|
|
|
|
result = diffs.min(axis=1)
|
|
assert (result == diffs.loc[0, "B"]).all()
|
|
|
|
# max
|
|
result = diffs.max()
|
|
assert result[0] == diffs.loc[2, "A"]
|
|
assert result[1] == diffs.loc[2, "B"]
|
|
|
|
result = diffs.max(axis=1)
|
|
assert (result == diffs["A"]).all()
|
|
|
|
# abs
|
|
result = diffs.abs()
|
|
result2 = abs(diffs)
|
|
expected = DataFrame(dict(A=df["A"] - df["C"], B=df["B"] - df["A"]))
|
|
tm.assert_frame_equal(result, expected)
|
|
tm.assert_frame_equal(result2, expected)
|
|
|
|
# mixed frame
|
|
mixed = diffs.copy()
|
|
mixed["C"] = "foo"
|
|
mixed["D"] = 1
|
|
mixed["E"] = 1.0
|
|
mixed["F"] = Timestamp("20130101")
|
|
|
|
# results in an object array
|
|
result = mixed.min()
|
|
expected = Series(
|
|
[
|
|
pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
|
|
pd.Timedelta(timedelta(days=-1)),
|
|
"foo",
|
|
1,
|
|
1.0,
|
|
Timestamp("20130101"),
|
|
],
|
|
index=mixed.columns,
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# excludes numeric
|
|
result = mixed.min(axis=1)
|
|
expected = Series([1, 1, 1.0], index=[0, 1, 2])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# works when only those columns are selected
|
|
result = mixed[["A", "B"]].min(1)
|
|
expected = Series([timedelta(days=-1)] * 3)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = mixed[["A", "B"]].min()
|
|
expected = Series(
|
|
[timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"]
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# GH 3106
|
|
df = DataFrame(
|
|
{
|
|
"time": date_range("20130102", periods=5),
|
|
"time2": date_range("20130105", periods=5),
|
|
}
|
|
)
|
|
df["off1"] = df["time2"] - df["time"]
|
|
assert df["off1"].dtype == "timedelta64[ns]"
|
|
|
|
df["off2"] = df["time"] - df["time2"]
|
|
df._consolidate_inplace()
|
|
assert df["off1"].dtype == "timedelta64[ns]"
|
|
assert df["off2"].dtype == "timedelta64[ns]"
|
|
|
|
def test_sum_corner(self):
|
|
empty_frame = DataFrame()
|
|
|
|
axis0 = empty_frame.sum(0)
|
|
axis1 = empty_frame.sum(1)
|
|
assert isinstance(axis0, Series)
|
|
assert isinstance(axis1, Series)
|
|
assert len(axis0) == 0
|
|
assert len(axis1) == 0
|
|
|
|
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
|
|
def test_sum_prod_nanops(self, method, unit):
|
|
idx = ["a", "b", "c"]
|
|
df = pd.DataFrame(
|
|
{"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}
|
|
)
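# min_count semantics: the reduction returns NaN for any column with fewer
# than min_count non-NA values, so the all-NaN column "c" becomes NaN as
# soon as min_count=1.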
|
|
# The default
|
|
result = getattr(df, method)
|
|
expected = pd.Series([unit, unit, unit], index=idx, dtype="float64")
|
|
|
|
# min_count=1
|
|
result = getattr(df, method)(min_count=1)
|
|
expected = pd.Series([unit, unit, np.nan], index=idx)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=0
|
|
result = getattr(df, method)(min_count=0)
|
|
expected = pd.Series([unit, unit, unit], index=idx, dtype="float64")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = getattr(df.iloc[1:], method)(min_count=1)
|
|
expected = pd.Series([unit, np.nan, np.nan], index=idx)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count > 1
|
|
df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
|
|
result = getattr(df, method)(min_count=5)
|
|
expected = pd.Series(result, index=["A", "B"])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = getattr(df, method)(min_count=6)
|
|
expected = pd.Series(result, index=["A", "B"])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_sum_nanops_timedelta(self):
|
|
# prod isn't defined on timedeltas
|
|
idx = ["a", "b", "c"]
|
|
df = pd.DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]})
|
|
|
|
df2 = df.apply(pd.to_timedelta)
|
|
|
|
# 0 by default
|
|
result = df2.sum()
|
|
expected = pd.Series([0, 0, 0], dtype="m8[ns]", index=idx)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=0
|
|
result = df2.sum(min_count=0)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=1
|
|
result = df2.sum(min_count=1)
|
|
expected = pd.Series([0, 0, np.nan], dtype="m8[ns]", index=idx)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_sum_object(self, float_frame):
|
|
values = float_frame.values.astype(int)
|
|
frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns)
|
|
deltas = frame * timedelta(1)
|
|
deltas.sum()
|
|
|
|
def test_sum_bool(self, float_frame):
|
|
# ensure this works, bug report
|
|
bools = np.isnan(float_frame)
|
|
bools.sum(1)
|
|
bools.sum(0)
|
|
|
|
def test_mean_corner(self, float_frame, float_string_frame):
|
|
# unit test for the case where we have object data
|
|
the_mean = float_string_frame.mean(axis=0)
|
|
the_sum = float_string_frame.sum(axis=0, numeric_only=True)
|
|
tm.assert_index_equal(the_sum.index, the_mean.index)
|
|
assert len(the_mean.index) < len(float_string_frame.columns)
|
|
|
|
# xs sum mixed type, just want to know it works...
|
|
the_mean = float_string_frame.mean(axis=1)
|
|
the_sum = float_string_frame.sum(axis=1, numeric_only=True)
|
|
tm.assert_index_equal(the_sum.index, the_mean.index)
|
|
|
|
# take mean of boolean column
|
|
float_frame["bool"] = float_frame["A"] > 0
|
|
means = float_frame.mean(0)
|
|
assert means["bool"] == float_frame["bool"].values.mean()
|
|
|
|
def test_mean_datetimelike(self):
|
|
# GH#24757 check that datetimelike are excluded by default, handled
|
|
# correctly with numeric_only=True
|
|
|
|
df = pd.DataFrame(
|
|
{
|
|
"A": np.arange(3),
|
|
"B": pd.date_range("2016-01-01", periods=3),
|
|
"C": pd.timedelta_range("1D", periods=3),
|
|
"D": pd.period_range("2016", periods=3, freq="A"),
|
|
}
|
|
)
|
|
result = df.mean(numeric_only=True)
|
|
expected = pd.Series({"A": 1.0})
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.mean()
|
|
expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]})
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@pytest.mark.xfail(
|
|
reason="casts to object-dtype and then tries to add timestamps",
|
|
raises=TypeError,
|
|
strict=True,
|
|
)
|
|
def test_mean_datetimelike_numeric_only_false(self):
|
|
df = pd.DataFrame(
|
|
{
|
|
"A": np.arange(3),
|
|
"B": pd.date_range("2016-01-01", periods=3),
|
|
"C": pd.timedelta_range("1D", periods=3),
|
|
"D": pd.period_range("2016", periods=3, freq="A"),
|
|
}
|
|
)
|
|
|
|
result = df.mean(numeric_only=False)
|
|
expected = pd.Series(
|
|
{"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"], "D": df.loc[1, "D"]}
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_stats_mixed_type(self, float_string_frame):
|
|
# don't blow up
|
|
float_string_frame.std(1)
|
|
float_string_frame.var(1)
|
|
float_string_frame.mean(1)
|
|
float_string_frame.skew(1)
|
|
|
|
def test_sum_bools(self):
|
|
df = DataFrame(index=range(1), columns=range(10))
|
|
bools = isna(df)
|
|
assert bools.sum(axis=1)[0] == 10
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Cumulative Reductions - cumsum, cummax, ...
|
|
|
|
def test_cumsum_corner(self):
|
|
dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5))
|
|
# ?(wesm)
|
|
result = dm.cumsum() # noqa
|
|
|
|
def test_cumsum(self, datetime_frame):
|
|
datetime_frame.loc[5:10, 0] = np.nan
|
|
datetime_frame.loc[10:15, 1] = np.nan
|
|
datetime_frame.loc[15:, 2] = np.nan
|
|
|
|
# axis = 0
|
|
cumsum = datetime_frame.cumsum()
|
|
expected = datetime_frame.apply(Series.cumsum)
|
|
tm.assert_frame_equal(cumsum, expected)
|
|
|
|
# axis = 1
|
|
cumsum = datetime_frame.cumsum(axis=1)
|
|
expected = datetime_frame.apply(Series.cumsum, axis=1)
|
|
tm.assert_frame_equal(cumsum, expected)
|
|
|
|
# works
|
|
df = DataFrame({"A": np.arange(20)}, index=np.arange(20))
|
|
result = df.cumsum() # noqa
|
|
|
|
# fix issue
|
|
cumsum_xs = datetime_frame.cumsum(axis=1)
|
|
assert np.shape(cumsum_xs) == np.shape(datetime_frame)
|
|
|
|
def test_cumprod(self, datetime_frame):
|
|
datetime_frame.loc[5:10, 0] = np.nan
|
|
datetime_frame.loc[10:15, 1] = np.nan
|
|
datetime_frame.loc[15:, 2] = np.nan
|
|
|
|
# axis = 0
|
|
cumprod = datetime_frame.cumprod()
|
|
expected = datetime_frame.apply(Series.cumprod)
|
|
tm.assert_frame_equal(cumprod, expected)
|
|
|
|
# axis = 1
|
|
cumprod = datetime_frame.cumprod(axis=1)
|
|
expected = datetime_frame.apply(Series.cumprod, axis=1)
|
|
tm.assert_frame_equal(cumprod, expected)
|
|
|
|
# fix issue
|
|
cumprod_xs = datetime_frame.cumprod(axis=1)
|
|
assert np.shape(cumprod_xs) == np.shape(datetime_frame)
|
|
|
|
# ints
|
|
df = datetime_frame.fillna(0).astype(int)
|
|
df.cumprod(0)
|
|
df.cumprod(1)
|
|
|
|
# ints32
|
|
df = datetime_frame.fillna(0).astype(np.int32)
|
|
df.cumprod(0)
|
|
df.cumprod(1)
|
|
|
|
def test_cummin(self, datetime_frame):
|
|
datetime_frame.loc[5:10, 0] = np.nan
|
|
datetime_frame.loc[10:15, 1] = np.nan
|
|
datetime_frame.loc[15:, 2] = np.nan
|
|
|
|
# axis = 0
|
|
cummin = datetime_frame.cummin()
|
|
expected = datetime_frame.apply(Series.cummin)
|
|
tm.assert_frame_equal(cummin, expected)
|
|
|
|
# axis = 1
|
|
cummin = datetime_frame.cummin(axis=1)
|
|
expected = datetime_frame.apply(Series.cummin, axis=1)
|
|
tm.assert_frame_equal(cummin, expected)
|
|
|
|
# it works
|
|
df = DataFrame({"A": np.arange(20)}, index=np.arange(20))
|
|
result = df.cummin() # noqa
|
|
|
|
# fix issue
|
|
cummin_xs = datetime_frame.cummin(axis=1)
|
|
assert np.shape(cummin_xs) == np.shape(datetime_frame)
|
|
|
|
def test_cummax(self, datetime_frame):
|
|
datetime_frame.loc[5:10, 0] = np.nan
|
|
datetime_frame.loc[10:15, 1] = np.nan
|
|
datetime_frame.loc[15:, 2] = np.nan
|
|
|
|
# axis = 0
|
|
cummax = datetime_frame.cummax()
|
|
expected = datetime_frame.apply(Series.cummax)
|
|
tm.assert_frame_equal(cummax, expected)
|
|
|
|
# axis = 1
|
|
cummax = datetime_frame.cummax(axis=1)
|
|
expected = datetime_frame.apply(Series.cummax, axis=1)
|
|
tm.assert_frame_equal(cummax, expected)
|
|
|
|
# it works
|
|
df = DataFrame({"A": np.arange(20)}, index=np.arange(20))
|
|
result = df.cummax() # noqa
|
|
|
|
# fix issue
|
|
cummax_xs = datetime_frame.cummax(axis=1)
|
|
assert np.shape(cummax_xs) == np.shape(datetime_frame)
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Miscellanea
|
|
|
|
def test_count(self):
|
|
# corner case
|
|
frame = DataFrame()
|
|
ct1 = frame.count(1)
|
|
assert isinstance(ct1, Series)
|
|
|
|
ct2 = frame.count(0)
|
|
assert isinstance(ct2, Series)
|
|
|
|
# GH#423
|
|
df = DataFrame(index=range(10))
|
|
result = df.count(1)
|
|
expected = Series(0, index=df.index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
df = DataFrame(columns=range(10))
|
|
result = df.count(0)
|
|
expected = Series(0, index=df.columns)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
df = DataFrame()
|
|
result = df.count()
|
|
expected = Series(0, index=[])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_count_objects(self, float_string_frame):
|
|
dm = DataFrame(float_string_frame._series)
|
|
df = DataFrame(float_string_frame._series)
|
|
|
|
tm.assert_series_equal(dm.count(), df.count())
|
|
tm.assert_series_equal(dm.count(1), df.count(1))
|
|
|
|
def test_pct_change(self):
|
|
# GH#11150
|
|
pnl = DataFrame(
|
|
[np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)]
|
|
).astype(np.float64)
|
|
pnl.iat[1, 0] = np.nan
|
|
pnl.iat[1, 1] = np.nan
|
|
pnl.iat[2, 3] = 60
|
|
|
|
for axis in range(2):
|
|
expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1
|
|
result = pnl.pct_change(axis=axis, fill_method="pad")
|
|
|
|
tm.assert_frame_equal(result, expected)
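# pct_change(fill_method="pad") is equivalent to forward-filling first and
# then dividing by the shifted (also forward-filled) values minus 1, which
# is exactly how ``expected`` is built above.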
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Index of max / min
|
|
|
|
def test_idxmin(self, float_frame, int_frame):
|
|
frame = float_frame
|
|
frame.loc[5:10] = np.nan
|
|
frame.loc[15:20, -2:] = np.nan
|
|
for skipna in [True, False]:
|
|
for axis in [0, 1]:
|
|
for df in [frame, int_frame]:
|
|
result = df.idxmin(axis=axis, skipna=skipna)
|
|
expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>"
|
|
with pytest.raises(ValueError, match=msg):
|
|
frame.idxmin(axis=2)
|
|
|
|
def test_idxmax(self, float_frame, int_frame):
|
|
frame = float_frame
|
|
frame.loc[5:10] = np.nan
|
|
frame.loc[15:20, -2:] = np.nan
|
|
for skipna in [True, False]:
|
|
for axis in [0, 1]:
|
|
for df in [frame, int_frame]:
|
|
result = df.idxmax(axis=axis, skipna=skipna)
|
|
expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
msg = "No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>"
|
|
with pytest.raises(ValueError, match=msg):
|
|
frame.idxmax(axis=2)
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Logical reductions
|
|
|
|
@pytest.mark.parametrize("opname", ["any", "all"])
|
|
def test_any_all(self, opname, bool_frame_with_na, float_string_frame):
|
|
assert_bool_op_calc(
|
|
opname, getattr(np, opname), bool_frame_with_na, has_skipna=True
|
|
)
|
|
assert_bool_op_api(
|
|
opname, bool_frame_with_na, float_string_frame, has_bool_only=True
|
|
)
|
|
|
|
def test_any_all_extra(self):
|
|
df = DataFrame(
|
|
{
|
|
"A": [True, False, False],
|
|
"B": [True, True, False],
|
|
"C": [True, True, True],
|
|
},
|
|
index=["a", "b", "c"],
|
|
)
|
|
result = df[["A", "B"]].any(1)
|
|
expected = Series([True, True, False], index=["a", "b", "c"])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df[["A", "B"]].any(1, bool_only=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.all(1)
|
|
expected = Series([True, False, False], index=["a", "b", "c"])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.all(1, bool_only=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# Axis is None
|
|
result = df.all(axis=None).item()
|
|
assert result is False
|
|
|
|
result = df.any(axis=None).item()
|
|
assert result is True
|
|
|
|
result = df[["C"]].all(axis=None).item()
|
|
assert result is True
|
|
|
|
def test_any_datetime(self):
|
|
|
|
# GH 23070
|
|
float_data = [1, np.nan, 3, np.nan]
|
|
datetime_data = [
|
|
pd.Timestamp("1960-02-15"),
|
|
pd.Timestamp("1960-02-16"),
|
|
pd.NaT,
|
|
pd.NaT,
|
|
]
|
|
df = DataFrame({"A": float_data, "B": datetime_data})
|
|
|
|
result = df.any(1)
|
|
expected = Series([True, True, True, False])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_any_all_bool_only(self):
|
|
|
|
# GH 25101
|
|
df = DataFrame(
|
|
{"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}
|
|
)
|
|
|
|
result = df.all(bool_only=True)
|
|
expected = Series(dtype=np.bool)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
df = DataFrame(
|
|
{
|
|
"col1": [1, 2, 3],
|
|
"col2": [4, 5, 6],
|
|
"col3": [None, None, None],
|
|
"col4": [False, False, True],
|
|
}
|
|
)
|
|
|
|
result = df.all(bool_only=True)
|
|
expected = Series({"col4": False})
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize(
|
|
"func, data, expected",
|
|
[
|
|
(np.any, {}, False),
|
|
(np.all, {}, True),
|
|
(np.any, {"A": []}, False),
|
|
(np.all, {"A": []}, True),
|
|
(np.any, {"A": [False, False]}, False),
|
|
(np.all, {"A": [False, False]}, False),
|
|
(np.any, {"A": [True, False]}, True),
|
|
(np.all, {"A": [True, False]}, False),
|
|
(np.any, {"A": [True, True]}, True),
|
|
(np.all, {"A": [True, True]}, True),
|
|
(np.any, {"A": [False], "B": [False]}, False),
|
|
(np.all, {"A": [False], "B": [False]}, False),
|
|
(np.any, {"A": [False, False], "B": [False, True]}, True),
|
|
(np.all, {"A": [False, False], "B": [False, True]}, False),
|
|
# other types
|
|
(np.all, {"A": pd.Series([0.0, 1.0], dtype="float")}, False),
|
|
(np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True),
|
|
(np.all, {"A": pd.Series([0, 1], dtype=int)}, False),
|
|
(np.any, {"A": pd.Series([0, 1], dtype=int)}, True),
|
|
pytest.param(
|
|
np.all,
|
|
{"A": pd.Series([0, 1], dtype="M8[ns]")},
|
|
False,
|
|
marks=[td.skip_if_np_lt("1.15")],
|
|
),
|
|
pytest.param(
|
|
np.any,
|
|
{"A": pd.Series([0, 1], dtype="M8[ns]")},
|
|
True,
|
|
marks=[td.skip_if_np_lt("1.15")],
|
|
),
|
|
pytest.param(
|
|
np.all,
|
|
{"A": pd.Series([1, 2], dtype="M8[ns]")},
|
|
True,
|
|
marks=[td.skip_if_np_lt("1.15")],
|
|
),
|
|
pytest.param(
|
|
np.any,
|
|
{"A": pd.Series([1, 2], dtype="M8[ns]")},
|
|
True,
|
|
marks=[td.skip_if_np_lt("1.15")],
|
|
),
|
|
pytest.param(
|
|
np.all,
|
|
{"A": pd.Series([0, 1], dtype="m8[ns]")},
|
|
False,
|
|
marks=[td.skip_if_np_lt("1.15")],
|
|
),
|
|
pytest.param(
|
|
np.any,
|
|
{"A": pd.Series([0, 1], dtype="m8[ns]")},
|
|
True,
|
|
marks=[td.skip_if_np_lt("1.15")],
|
|
),
|
|
pytest.param(
|
|
np.all,
|
|
{"A": pd.Series([1, 2], dtype="m8[ns]")},
|
|
True,
|
|
marks=[td.skip_if_np_lt("1.15")],
|
|
),
|
|
pytest.param(
|
|
np.any,
|
|
{"A": pd.Series([1, 2], dtype="m8[ns]")},
|
|
True,
|
|
marks=[td.skip_if_np_lt("1.15")],
|
|
),
|
|
(np.all, {"A": pd.Series([0, 1], dtype="category")}, False),
|
|
(np.any, {"A": pd.Series([0, 1], dtype="category")}, True),
|
|
(np.all, {"A": pd.Series([1, 2], dtype="category")}, True),
|
|
(np.any, {"A": pd.Series([1, 2], dtype="category")}, True),
|
|
# # Mix
|
|
# GH 21484
|
|
# (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'),
|
|
# 'B': pd.Series([10, 20], dtype='m8[ns]')}, True),
|
|
],
|
|
)
|
|
def test_any_all_np_func(self, func, data, expected):
|
|
# GH 19976
|
|
data = DataFrame(data)
|
|
result = func(data)
|
|
assert isinstance(result, np.bool_)
|
|
assert result.item() is expected
|
|
|
|
# method version
|
|
result = getattr(DataFrame(data), func.__name__)(axis=None)
|
|
assert isinstance(result, np.bool_)
|
|
assert result.item() is expected
|
|
|
|
def test_any_all_object(self):
|
|
# GH 19976
|
|
result = np.all(DataFrame(columns=["a", "b"])).item()
|
|
assert result is True
|
|
|
|
result = np.any(DataFrame(columns=["a", "b"])).item()
|
|
assert result is False
|
|
|
|
@pytest.mark.parametrize("method", ["any", "all"])
|
|
def test_any_all_level_axis_none_raises(self, method):
|
|
df = DataFrame(
|
|
{"A": 1},
|
|
index=MultiIndex.from_product(
|
|
[["A", "B"], ["a", "b"]], names=["out", "in"]
|
|
),
|
|
)
|
|
xpr = "Must specify 'axis' when aggregating by level."
|
|
with pytest.raises(ValueError, match=xpr):
|
|
getattr(df, method)(axis=None, level="out")
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Isin
|
|
|
|
def test_isin(self):
|
|
# GH 4211
|
|
df = DataFrame(
|
|
{
|
|
"vals": [1, 2, 3, 4],
|
|
"ids": ["a", "b", "f", "n"],
|
|
"ids2": ["a", "n", "c", "n"],
|
|
},
|
|
index=["foo", "bar", "baz", "qux"],
|
|
)
|
|
other = ["a", "b", "c"]
|
|
|
|
result = df.isin(other)
|
|
expected = DataFrame([df.loc[s].isin(other) for s in df.index])
|
|
tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
    def test_isin_empty(self, empty):
        # GH 16991
        df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
        expected = DataFrame(False, df.index, df.columns)

        result = df.isin(empty)
        tm.assert_frame_equal(result, expected)

    def test_isin_dict(self):
        df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
        d = {"A": ["a"]}

        expected = DataFrame(False, df.index, df.columns)
        expected.loc[0, "A"] = True

        result = df.isin(d)
        tm.assert_frame_equal(result, expected)

        # non unique columns
        df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
        df.columns = ["A", "A"]
        expected = DataFrame(False, df.index, df.columns)
        expected.loc[0, "A"] = True
        result = df.isin(d)
        tm.assert_frame_equal(result, expected)
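
        # With a dict, each column is compared only against the values listed
        # under its own name; columns without a key (here "B") stay all-False.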

    def test_isin_with_string_scalar(self):
        # GH 4763
        df = DataFrame(
            {
                "vals": [1, 2, 3, 4],
                "ids": ["a", "b", "f", "n"],
                "ids2": ["a", "n", "c", "n"],
            },
            index=["foo", "bar", "baz", "qux"],
        )
        with pytest.raises(TypeError):
            df.isin("a")

        with pytest.raises(TypeError):
            df.isin("aaa")

    def test_isin_df(self):
        df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
        df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]})
        expected = DataFrame(False, df1.index, df1.columns)
        result = df1.isin(df2)
        expected["A"].loc[[1, 3]] = True
        expected["B"].loc[[0, 2]] = True
        tm.assert_frame_equal(result, expected)

        # partial overlapping columns
        df2.columns = ["A", "C"]
        result = df1.isin(df2)
        expected["B"] = False
        tm.assert_frame_equal(result, expected)
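
        # Frame-vs-frame isin only compares aligned cells: labels present in
        # just one operand give False, and NaN never matches NaN (row 1 of "B"
        # above stays False).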

    def test_isin_tuples(self):
        # GH 16394
        df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]})
        df["C"] = list(zip(df["A"], df["B"]))
        result = df["C"].isin([(1, "a")])
        tm.assert_series_equal(result, Series([True, False, False], name="C"))

    def test_isin_df_dupe_values(self):
        df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
        # just cols duped
        df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"])
        with pytest.raises(ValueError):
            df1.isin(df2)

        # just index duped
        df2 = DataFrame(
            [[0, 2], [12, 4], [2, np.nan], [4, 5]],
            columns=["A", "B"],
            index=[0, 0, 1, 1],
        )
        with pytest.raises(ValueError):
            df1.isin(df2)

        # cols and index:
        df2.columns = ["B", "B"]
        with pytest.raises(ValueError):
            df1.isin(df2)

    def test_isin_dupe_self(self):
        other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]})
        df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"])
        result = df.isin(other)
        expected = DataFrame(False, index=df.index, columns=df.columns)
        expected.loc[0] = True
        expected.iloc[1, 1] = True
        tm.assert_frame_equal(result, expected)

    def test_isin_against_series(self):
        df = pd.DataFrame(
            {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"]
        )
        s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"])
        expected = DataFrame(False, index=df.index, columns=df.columns)
        expected["A"].loc["a"] = True
        expected.loc["d"] = True
        result = df.isin(s)
        tm.assert_frame_equal(result, expected)

    def test_isin_multiIndex(self):
        idx = MultiIndex.from_tuples(
            [
                (0, "a", "foo"),
                (0, "a", "bar"),
                (0, "b", "bar"),
                (0, "b", "baz"),
                (2, "a", "foo"),
                (2, "a", "bar"),
                (2, "c", "bar"),
                (2, "c", "baz"),
                (1, "b", "foo"),
                (1, "b", "bar"),
                (1, "c", "bar"),
                (1, "c", "baz"),
            ]
        )
        df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx)
        df2 = DataFrame(
            {
                "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
                "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1],
            }
        )
        # against regular index
        expected = DataFrame(False, index=df1.index, columns=df1.columns)
        result = df1.isin(df2)
        tm.assert_frame_equal(result, expected)

        df2.index = idx
        expected = df2.values.astype(np.bool)
        expected[:, 1] = ~expected[:, 1]
        expected = DataFrame(expected, columns=["A", "B"], index=idx)

        result = df1.isin(df2)
        tm.assert_frame_equal(result, expected)

    def test_isin_empty_datetimelike(self):
        # GH 15473
        df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])})
        df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]})
        df2 = DataFrame({"date": []})
        df3 = DataFrame()

        expected = DataFrame({"date": [False, False]})

        result = df1_ts.isin(df2)
        tm.assert_frame_equal(result, expected)
        result = df1_ts.isin(df3)
        tm.assert_frame_equal(result, expected)

        result = df1_td.isin(df2)
        tm.assert_frame_equal(result, expected)
        result = df1_td.isin(df3)
        tm.assert_frame_equal(result, expected)

    # ----------------------------------------------------------------------
    # Rounding

    def test_round(self):
        # GH 2665

        # Test that rounding an empty DataFrame does nothing
        df = DataFrame()
        tm.assert_frame_equal(df, df.round())

        # Here's the test frame we'll be working with
        df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(df.round(), expected_rounded)

        # Round with an integer
        decimals = 2
        expected_rounded = DataFrame(
            {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]}
        )
        tm.assert_frame_equal(df.round(decimals), expected_rounded)

        # This should also work with np.round (since np.round dispatches to
        # df.round)
        tm.assert_frame_equal(np.round(df, decimals), expected_rounded)

        # Round with a list
        round_list = [1, 2]
        with pytest.raises(TypeError):
            df.round(round_list)

        # Round with a dictionary
        expected_rounded = DataFrame(
            {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]}
        )
        round_dict = {"col1": 1, "col2": 2}
        tm.assert_frame_equal(df.round(round_dict), expected_rounded)
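        # Rough reading of the dict form: values are decimal places per column,
        # e.g. {"col1": 1, "col2": 2}; columns that are not named are left
        # untouched (see the partial dict below).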

        # Incomplete dict
        expected_partially_rounded = DataFrame(
            {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]}
        )
        partial_round_dict = {"col2": 1}
        tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded)

        # Dict with unknown elements
        wrong_round_dict = {"col3": 2, "col2": 1}
        tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded)

        # float input to `decimals`
        non_int_round_dict = {"col1": 1, "col2": 0.5}
        with pytest.raises(TypeError):
            df.round(non_int_round_dict)

        # String input
        non_int_round_dict = {"col1": 1, "col2": "foo"}
        with pytest.raises(TypeError):
            df.round(non_int_round_dict)

        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError):
            df.round(non_int_round_Series)

        # List input
        non_int_round_dict = {"col1": 1, "col2": [1, 2]}
        with pytest.raises(TypeError):
            df.round(non_int_round_dict)

        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError):
            df.round(non_int_round_Series)

        # Non integer Series inputs
        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError):
            df.round(non_int_round_Series)

        non_int_round_Series = Series(non_int_round_dict)
        with pytest.raises(TypeError):
            df.round(non_int_round_Series)

        # Negative numbers
        negative_round_dict = {"col1": -1, "col2": -2}
        big_df = df * 100
        expected_neg_rounded = DataFrame(
            {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]}
        )
        tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded)

        # nan in Series round
        nan_round_Series = Series({"col1": np.nan, "col2": 1})

        # TODO(wesm): unused?
        expected_nan_round = DataFrame(  # noqa
            {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]}
        )

        with pytest.raises(TypeError):
            df.round(nan_round_Series)

        # Make sure this doesn't break existing Series.round
        tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"])

        # named columns
        # GH 11986
        decimals = 2
        expected_rounded = DataFrame(
            {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]}
        )
        df.columns.name = "cols"
        expected_rounded.columns.name = "cols"
        tm.assert_frame_equal(df.round(decimals), expected_rounded)

        # interaction of named columns & series
        tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"])
        tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"])

    def test_numpy_round(self):
        # GH 12600
        df = DataFrame([[1.53, 1.36], [0.06, 7.01]])
        out = np.round(df, decimals=0)
        expected = DataFrame([[2.0, 1.0], [0.0, 7.0]])
        tm.assert_frame_equal(out, expected)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.round(df, decimals=0, out=df)
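
        # np.round defers to DataFrame.round; numpy-only keywords such as
        # `out` are not accepted and raise ValueError, as checked above.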

    def test_numpy_round_nan(self):
        # See gh-14197
        df = Series([1.53, np.nan, 0.06]).to_frame()
        with tm.assert_produces_warning(None):
            result = df.round()
        expected = Series([2.0, np.nan, 0.0]).to_frame()
        tm.assert_frame_equal(result, expected)

    def test_round_mixed_type(self):
        # GH 11885
        df = DataFrame(
            {
                "col1": [1.1, 2.2, 3.3, 4.4],
                "col2": ["1", "a", "c", "f"],
                "col3": date_range("20111111", periods=4),
            }
        )
        round_0 = DataFrame(
            {
                "col1": [1.0, 2.0, 3.0, 4.0],
                "col2": ["1", "a", "c", "f"],
                "col3": date_range("20111111", periods=4),
            }
        )
        tm.assert_frame_equal(df.round(), round_0)
        tm.assert_frame_equal(df.round(1), df)
        tm.assert_frame_equal(df.round({"col1": 1}), df)
        tm.assert_frame_equal(df.round({"col1": 0}), round_0)
        tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0)
        tm.assert_frame_equal(df.round({"col3": 1}), df)

    def test_round_issue(self):
        # GH 11611

        df = pd.DataFrame(
            np.random.random([3, 3]),
            columns=["A", "B", "C"],
            index=["first", "second", "third"],
        )

        dfs = pd.concat((df, df), axis=1)
        rounded = dfs.round()
        tm.assert_index_equal(rounded.index, dfs.index)

        decimals = pd.Series([1, 0, 2], index=["A", "B", "A"])
        msg = "Index of decimals must be unique"
        with pytest.raises(ValueError, match=msg):
            df.round(decimals)

    def test_built_in_round(self):
        # GH 11763
        # Here's the test frame we'll be working with
        df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(round(df), expected_rounded)

    def test_round_nonunique_categorical(self):
        # See GH21809
        idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3)
        df = pd.DataFrame(np.random.rand(6, 3), columns=list("abc"))

        expected = df.round(3)
        expected.index = idx

        df_categorical = df.copy().set_index(idx)
        assert df_categorical.shape == (6, 3)
        result = df_categorical.round(3)
        assert result.shape == (6, 3)

        tm.assert_frame_equal(result, expected)

    # ----------------------------------------------------------------------
    # Clip

    def test_clip(self, float_frame):
        median = float_frame.median().median()
        original = float_frame.copy()

        with tm.assert_produces_warning(FutureWarning):
            capped = float_frame.clip_upper(median)
        assert not (capped.values > median).any()

        with tm.assert_produces_warning(FutureWarning):
            floored = float_frame.clip_lower(median)
        assert not (floored.values < median).any()

        double = float_frame.clip(upper=median, lower=median)
        assert not (double.values != median).any()

        # Verify that float_frame was not changed inplace
        assert (float_frame.values == original.values).all()

    def test_inplace_clip(self, float_frame):
        # GH 15388
        median = float_frame.median().median()
        frame_copy = float_frame.copy()

        with tm.assert_produces_warning(FutureWarning):
            frame_copy.clip_upper(median, inplace=True)
        assert not (frame_copy.values > median).any()
        frame_copy = float_frame.copy()

        with tm.assert_produces_warning(FutureWarning):
            frame_copy.clip_lower(median, inplace=True)
        assert not (frame_copy.values < median).any()
        frame_copy = float_frame.copy()

        frame_copy.clip(upper=median, lower=median, inplace=True)
        assert not (frame_copy.values != median).any()

    def test_dataframe_clip(self):
        # GH 2747
        df = DataFrame(np.random.randn(1000, 2))

        for lb, ub in [(-1, 1), (1, -1)]:
            clipped_df = df.clip(lb, ub)

            lb, ub = min(lb, ub), max(ub, lb)
            lb_mask = df.values <= lb
            ub_mask = df.values >= ub
            mask = ~lb_mask & ~ub_mask
            assert (clipped_df.values[lb_mask] == lb).all()
            assert (clipped_df.values[ub_mask] == ub).all()
            assert (clipped_df.values[mask] == df.values[mask]).all()
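
        # Note: scalar bounds given in reversed order are swapped by clip
        # (GH 2747), which is why (1, -1) above behaves like (-1, 1).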

    def test_clip_mixed_numeric(self):
        # TODO(jreback)
        # clip on mixed integer or floats
        # with integer clippers coerces to float
        df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]})
        result = df.clip(1, 2)
        expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]})
        tm.assert_frame_equal(result, expected, check_like=True)

        # GH 24162, clipping now preserves numeric types per column
        df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"])
        expected = df.dtypes
        result = df.clip(upper=3).dtypes
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("inplace", [True, False])
    def test_clip_against_series(self, inplace):
        # GH 6966

        df = DataFrame(np.random.randn(1000, 2))
        lb = Series(np.random.randn(1000))
        ub = lb + 1

        original = df.copy()
        clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)

        if inplace:
            clipped_df = df

        for i in range(2):
            lb_mask = original.iloc[:, i] <= lb
            ub_mask = original.iloc[:, i] >= ub
            mask = ~lb_mask & ~ub_mask

            result = clipped_df.loc[lb_mask, i]
            tm.assert_series_equal(result, lb[lb_mask], check_names=False)
            assert result.name == i

            result = clipped_df.loc[ub_mask, i]
            tm.assert_series_equal(result, ub[ub_mask], check_names=False)
            assert result.name == i

            tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
    @pytest.mark.parametrize(
        "axis,res",
        [
            (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]),
            (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]),
        ],
    )
    def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res):
        # GH 15390
        original = simple_frame.copy(deep=True)

        result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace)

        expected = pd.DataFrame(res, columns=original.columns, index=original.index)
        if inplace:
            result = original
        tm.assert_frame_equal(result, expected, check_exact=True)

    @pytest.mark.parametrize("axis", [0, 1, None])
    def test_clip_against_frame(self, axis):
        df = DataFrame(np.random.randn(1000, 2))
        lb = DataFrame(np.random.randn(1000, 2))
        ub = lb + 1

        clipped_df = df.clip(lb, ub, axis=axis)

        lb_mask = df <= lb
        ub_mask = df >= ub
        mask = ~lb_mask & ~ub_mask

        tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
        tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
        tm.assert_frame_equal(clipped_df[mask], df[mask])

    def test_clip_against_unordered_columns(self):
        # GH 20911
        df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"])
        df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"])
        df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"])
        result_upper = df1.clip(lower=0, upper=df2)
        expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
        result_lower = df1.clip(lower=df3, upper=3)
        expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
        result_lower_upper = df1.clip(lower=df3, upper=df2)
        expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns])
        tm.assert_frame_equal(result_upper, expected_upper)
        tm.assert_frame_equal(result_lower, expected_lower)
        tm.assert_frame_equal(result_lower_upper, expected_lower_upper)

    def test_clip_with_na_args(self, float_frame):
        """Should process np.nan argument as None"""
        # GH 17276
        tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
        tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)

        # GH 19992
        df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})

        result = df.clip(lower=[4, 5, np.nan], axis=0)
        expected = DataFrame(
            {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]}
        )
        tm.assert_frame_equal(result, expected)

        result = df.clip(lower=[4, 5, np.nan], axis=1)
        expected = DataFrame(
            {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]}
        )
        tm.assert_frame_equal(result, expected)
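
        # Loosely: a scalar NaN bound means "no bound", while NaN inside a
        # list-like threshold propagates into the result, as the expectations
        # above show.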

    # ----------------------------------------------------------------------
    # Matrix-like

    def test_dot(self):
        a = DataFrame(
            np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"]
        )
        b = DataFrame(
            np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"]
        )

        result = a.dot(b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        # Check alignment
        b1 = b.reindex(index=reversed(b.index))
        result = a.dot(b1)
        tm.assert_frame_equal(result, expected)

        # Check series argument
        result = a.dot(b["one"])
        tm.assert_series_equal(result, expected["one"], check_names=False)
        assert result.name is None

        result = a.dot(b1["one"])
        tm.assert_series_equal(result, expected["one"], check_names=False)
        assert result.name is None

        # can pass correct-length arrays
        row = a.iloc[0].values

        result = a.dot(row)
        expected = a.dot(a.iloc[0])
        tm.assert_series_equal(result, expected)

        with pytest.raises(ValueError, match="Dot product shape mismatch"):
            a.dot(row[:-1])

        a = np.random.rand(1, 5)
        b = np.random.rand(5, 1)
        A = DataFrame(a)

        # TODO(wesm): unused
        B = DataFrame(b)  # noqa

        # it works
        result = A.dot(b)

        # unaligned
        df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4))
        df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3])

        with pytest.raises(ValueError, match="aligned"):
            df.dot(df2)
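
        # Roughly: DataFrame.dot aligns the caller's columns with the other
        # operand's index, accepts a plain ndarray of matching length, and
        # raises ValueError on shape or label mismatches.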

    def test_matmul(self):
        # matmul test is for GH 10259
        a = DataFrame(
            np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"]
        )
        b = DataFrame(
            np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"]
        )

        # DataFrame @ DataFrame
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # DataFrame @ Series
        result = operator.matmul(a, b.one)
        expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"])
        tm.assert_series_equal(result, expected)

        # np.array @ DataFrame
        result = operator.matmul(a.values, b)
        assert isinstance(result, DataFrame)
        assert result.columns.equals(b.columns)
        assert result.index.equals(pd.Index(range(3)))
        expected = np.dot(a.values, b.values)
        tm.assert_almost_equal(result.values, expected)

        # nested list @ DataFrame (__rmatmul__)
        result = operator.matmul(a.values.tolist(), b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_almost_equal(result.values, expected.values)

        # mixed dtype DataFrame @ DataFrame
        a["q"] = a.q.round().astype(int)
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # different dtypes DataFrame @ DataFrame
        a = a.astype(int)
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # unaligned
        df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4))
        df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3])

        with pytest.raises(ValueError, match="aligned"):
            operator.matmul(df, df2)
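
        # The @ operator shares DataFrame.dot's alignment rules; ndarray and
        # nested-list left operands are handled via __rmatmul__.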


@pytest.fixture
def df_duplicates():
    return pd.DataFrame(
        {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]},
        index=[0, 0, 1, 1, 1],
    )


@pytest.fixture
def df_strings():
    return pd.DataFrame(
        {
            "a": np.random.permutation(10),
            "b": list(ascii_lowercase[:10]),
            "c": np.random.permutation(10).astype("float64"),
        }
    )


@pytest.fixture
def df_main_dtypes():
    return pd.DataFrame(
        {
            "group": [1, 1, 2],
            "int": [1, 2, 3],
            "float": [4.0, 5.0, 6.0],
            "string": list("abc"),
            "category_string": pd.Series(list("abc")).astype("category"),
            "category_int": [7, 8, 9],
            "datetime": pd.date_range("20130101", periods=3),
            "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
        },
        columns=[
            "group",
            "int",
            "float",
            "string",
            "category_string",
            "category_int",
            "datetime",
            "datetimetz",
            "timedelta",
        ],
    )


class TestNLargestNSmallest:

    dtype_error_msg_template = (
        "Column {column!r} has dtype {dtype}, cannot "
        "use method {method!r} with this dtype"
    )
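    # e.g. "Column 'b' has dtype object, cannot use method 'nlargest' with
    # this dtype"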

    # ----------------------------------------------------------------------
    # Top / bottom
    @pytest.mark.parametrize(
        "order",
        [
            ["a"],
            ["c"],
            ["a", "b"],
            ["a", "c"],
            ["b", "a"],
            ["b", "c"],
            ["a", "b", "c"],
            ["c", "a", "b"],
            ["c", "b", "a"],
            ["b", "c", "a"],
            ["b", "a", "c"],
            # dups!
            ["b", "c", "c"],
        ],
    )
    @pytest.mark.parametrize("n", range(1, 11))
    def test_n(self, df_strings, nselect_method, n, order):
        # GH 10393
        df = df_strings
        if "b" in order:

            error_msg = self.dtype_error_msg_template.format(
                column="b", method=nselect_method, dtype="object"
            )
            with pytest.raises(TypeError, match=error_msg):
                getattr(df, nselect_method)(n, order)
        else:
            ascending = nselect_method == "nsmallest"
            result = getattr(df, nselect_method)(n, order)
            expected = df.sort_values(order, ascending=ascending).head(n)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "columns", [["group", "category_string"], ["group", "string"]]
    )
    def test_n_error(self, df_main_dtypes, nselect_method, columns):
        df = df_main_dtypes
        col = columns[1]
        error_msg = self.dtype_error_msg_template.format(
            column=col, method=nselect_method, dtype=df[col].dtype
        )
        # escape some characters that may be in the repr
        error_msg = (
            error_msg.replace("(", "\\(")
            .replace(")", "\\)")
            .replace("[", "\\[")
            .replace("]", "\\]")
        )
        with pytest.raises(TypeError, match=error_msg):
            getattr(df, nselect_method)(2, columns)

    def test_n_all_dtypes(self, df_main_dtypes):
        df = df_main_dtypes
        df.nsmallest(2, list(set(df) - {"category_string", "string"}))
        df.nlargest(2, list(set(df) - {"category_string", "string"}))

    @pytest.mark.parametrize(
        "method,expected",
        [
            (
                "nlargest",
                pd.DataFrame(
                    {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3]
                ),
            ),
            (
                "nsmallest",
                pd.DataFrame(
                    {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0]
                ),
            ),
        ],
    )
    def test_duplicates_on_starter_columns(self, method, expected):
        # regression test for #22752

        df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]})

        result = getattr(df, method)(4, columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

    def test_n_identical_values(self):
        # GH 15297
        df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]})

        result = df.nlargest(3, "a")
        expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2])
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(3, "a")
        expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "order",
        [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]],
    )
    @pytest.mark.parametrize("n", range(1, 6))
    def test_n_duplicate_index(self, df_duplicates, n, order):
        # GH 13412

        df = df_duplicates
        result = df.nsmallest(n, order)
        expected = df.sort_values(order).head(n)
        tm.assert_frame_equal(result, expected)

        result = df.nlargest(n, order)
        expected = df.sort_values(order, ascending=False).head(n)
        tm.assert_frame_equal(result, expected)

    def test_duplicate_keep_all_ties(self):
        # GH 16818
        df = pd.DataFrame(
            {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]}
        )
        result = df.nlargest(4, "a", keep="all")
        expected = pd.DataFrame(
            {
                "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3},
                "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20},
            }
        )
        tm.assert_frame_equal(result, expected)

        result = df.nsmallest(2, "a", keep="all")
        expected = pd.DataFrame(
            {
                "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3},
                "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20},
            }
        )
        tm.assert_frame_equal(result, expected)
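
        # keep="all" retains every row tied with the last selected value,
        # which is why more than n rows can come back (7 rows for n=4 above).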

    def test_series_broadcasting(self):
        # smoke test for numpy warnings
        # GH 16378, GH 16306
        df = DataFrame([1.0, 1.0, 1.0])
        df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]})
        s = Series([1, 1, 1])
        s_nan = Series([np.nan, np.nan, 1])

        with tm.assert_produces_warning(None):
            with tm.assert_produces_warning(FutureWarning):
                df_nan.clip_lower(s, axis=0)
            for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
                getattr(df, op)(s_nan, axis=0)

    def test_series_nat_conversion(self):
        # GH 18521
        # Check rank does not mutate DataFrame
        df = DataFrame(np.random.randn(10, 3), dtype="float64")
        expected = df.copy()
        df.rank()
        result = df
        tm.assert_frame_equal(result, expected)

    def test_multiindex_column_lookup(self):
        # Check whether tuples are correctly treated as multi-level lookups.
        # GH 23033
        df = pd.DataFrame(
            columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]),
            data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]],
        )

        # nsmallest
        result = df.nsmallest(3, ("x", "a"))
        expected = df.iloc[[2, 0, 3]]
        tm.assert_frame_equal(result, expected)

        # nlargest
        result = df.nlargest(3, ("x", "b"))
        expected = df.iloc[[3, 2, 1]]
        tm.assert_frame_equal(result, expected)