1578 lines
55 KiB
Python
1578 lines
55 KiB
Python
from itertools import product
|
|
import operator
|
|
|
|
import numpy as np
|
|
from numpy import nan
|
|
import pytest
|
|
|
|
import pandas.util._test_decorators as td
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
Categorical,
|
|
CategoricalIndex,
|
|
DataFrame,
|
|
Series,
|
|
date_range,
|
|
isna,
|
|
notna,
|
|
)
|
|
from pandas.api.types import is_scalar
|
|
from pandas.core.index import MultiIndex
|
|
from pandas.core.indexes.datetimes import Timestamp
|
|
import pandas.util.testing as tm
|
|
from pandas.util.testing import (
|
|
assert_almost_equal,
|
|
assert_frame_equal,
|
|
assert_index_equal,
|
|
assert_series_equal,
|
|
)
|
|
|
|
|
|
class TestSeriesAnalytics:
|
|
def test_describe(self):
|
|
s = Series([0, 1, 2, 3, 4], name="int_data")
|
|
result = s.describe()
|
|
expected = Series(
|
|
[5, 2, s.std(), 0, 1, 2, 3, 4],
|
|
name="int_data",
|
|
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
s = Series([True, True, False, False, False], name="bool_data")
|
|
result = s.describe()
|
|
expected = Series(
|
|
[5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"]
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
s = Series(["a", "a", "b", "c", "d"], name="str_data")
|
|
result = s.describe()
|
|
expected = Series(
|
|
[5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"]
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_describe_empty_object(self):
|
|
# https://github.com/pandas-dev/pandas/issues/27183
|
|
s = pd.Series([None, None], dtype=object)
|
|
result = s.describe()
|
|
expected = pd.Series(
|
|
[0, 0, np.nan, np.nan],
|
|
dtype=object,
|
|
index=["count", "unique", "top", "freq"],
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = s[:0].describe()
|
|
tm.assert_series_equal(result, expected)
|
|
# ensure NaN, not None
|
|
assert np.isnan(result.iloc[2])
|
|
assert np.isnan(result.iloc[3])
|
|
|
|
def test_describe_with_tz(self, tz_naive_fixture):
    """Describe a five-day datetime Series built with the fixture's timezone."""
    # GH 21332
    tz = tz_naive_fixture
    label = str(tz_naive_fixture)
    first_day = Timestamp(2018, 1, 1)
    last_day = Timestamp(2018, 1, 5)
    ser = Series(date_range(first_day, last_day, tz=tz), name=label)
    described = ser.describe()

    wanted_values = [
        5,
        5,
        ser.value_counts().index[0],
        1,
        first_day.tz_localize(tz),
        last_day.tz_localize(tz),
    ]
    wanted = Series(
        wanted_values,
        name=label,
        index=["count", "unique", "top", "freq", "first", "last"],
    )
    tm.assert_series_equal(described, wanted)
|
|
|
|
def test_argsort(self, datetime_series):
    """Check ``argsort`` on the fixture and on datetime data with a trailing NaT."""
    self._check_accum_op("argsort", datetime_series, check_dtype=False)
    order = datetime_series.argsort()
    assert issubclass(order.dtype.type, np.integer)

    # GH 2967 (introduced bug in 0.11-dev I think)
    stamps = Series([Timestamp("201301{i:02d}".format(i=i)) for i in range(1, 6)])
    assert stamps.dtype == "datetime64[ns]"
    moved = stamps.shift(-1)
    assert moved.dtype == "datetime64[ns]"
    assert isna(moved[4])

    # Already-sorted input is expected to argsort to 0..4.
    assert_series_equal(stamps.argsort(), Series(range(5), dtype="int64"))

    # With the missing last value, the final entry is expected to be -1.
    assert_series_equal(
        moved.argsort(), Series(list(range(4)) + [-1], dtype="int64")
    )
|
|
|
|
def test_argsort_stable(self):
    """Compare ``Series.argsort`` with ``np.argsort`` for mergesort and quicksort."""
    ser = Series(np.random.randint(0, 100, size=10000))
    via_mergesort = ser.argsort(kind="mergesort")
    via_default = ser.argsort()

    raw = ser.values
    merge_reference = np.argsort(raw, kind="mergesort")
    quick_reference = np.argsort(raw, kind="quicksort")

    tm.assert_series_equal(
        via_mergesort, Series(merge_reference), check_dtype=False
    )
    tm.assert_series_equal(via_default, Series(quick_reference), check_dtype=False)

    # Passing Series to the ndarray comparison helper must be rejected.
    msg = (
        r"ndarray Expected type <class 'numpy\.ndarray'>,"
        r" found <class 'pandas\.core\.series\.Series'> instead"
    )
    with pytest.raises(AssertionError, match=msg):
        tm.assert_numpy_array_equal(via_default, via_mergesort)
|
|
|
|
def test_cumsum(self, datetime_series):
    """Run the shared accumulation check for ``cumsum``."""
    op_name = "cumsum"
    self._check_accum_op(op_name, datetime_series)
|
|
|
|
def test_cumprod(self, datetime_series):
    """Run the shared accumulation check for ``cumprod``."""
    op_name = "cumprod"
    self._check_accum_op(op_name, datetime_series)
|
|
|
|
def test_cummin(self, datetime_series):
|
|
tm.assert_numpy_array_equal(
|
|
datetime_series.cummin().values,
|
|
np.minimum.accumulate(np.array(datetime_series)),
|
|
)
|
|
ts = datetime_series.copy()
|
|
ts[::2] = np.NaN
|
|
result = ts.cummin()[1::2]
|
|
expected = np.minimum.accumulate(ts.dropna())
|
|
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_cummax(self, datetime_series):
|
|
tm.assert_numpy_array_equal(
|
|
datetime_series.cummax().values,
|
|
np.maximum.accumulate(np.array(datetime_series)),
|
|
)
|
|
ts = datetime_series.copy()
|
|
ts[::2] = np.NaN
|
|
result = ts.cummax()[1::2]
|
|
expected = np.maximum.accumulate(ts.dropna())
|
|
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_cummin_datetime64(self):
    """Check ``cummin`` on datetime64 data containing NaT for both ``skipna`` values."""
    ser = pd.Series(
        pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"])
    )

    # (skipna flag, expected values) pairs, checked in this order.
    cases = [
        (True, ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-1"]),
        (
            False,
            ["NaT", "2000-1-2", "2000-1-2", "2000-1-1", "2000-1-1", "2000-1-1"],
        ),
    ]
    for skipna, stamps in cases:
        expected = pd.Series(pd.to_datetime(stamps))
        result = ser.cummin(skipna=skipna)
        tm.assert_series_equal(expected, result)
|
|
|
|
def test_cummax_datetime64(self):
    """Check ``cummax`` on datetime64 data containing NaT for both ``skipna`` values."""
    ser = pd.Series(
        pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"])
    )

    # (skipna flag, expected values) pairs, checked in this order.
    cases = [
        (True, ["NaT", "2000-1-2", "NaT", "2000-1-2", "NaT", "2000-1-3"]),
        (
            False,
            ["NaT", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-3"],
        ),
    ]
    for skipna, stamps in cases:
        expected = pd.Series(pd.to_datetime(stamps))
        result = ser.cummax(skipna=skipna)
        tm.assert_series_equal(expected, result)
|
|
|
|
def test_cummin_timedelta64(self):
    """Check ``cummin`` on timedelta64 data containing NaT for both ``skipna`` values."""
    ser = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"]))

    # (skipna flag, expected values) pairs, checked in this order.
    cases = [
        (True, ["NaT", "2 min", "NaT", "1 min", "NaT", "1 min"]),
        (False, ["NaT", "2 min", "2 min", "1 min", "1 min", "1 min"]),
    ]
    for skipna, deltas in cases:
        expected = pd.Series(pd.to_timedelta(deltas))
        result = ser.cummin(skipna=skipna)
        tm.assert_series_equal(expected, result)
|
|
|
|
def test_cummax_timedelta64(self):
    """Check ``cummax`` on timedelta64 data containing NaT for both ``skipna`` values."""
    ser = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"]))

    # (skipna flag, expected values) pairs, checked in this order.
    cases = [
        (True, ["NaT", "2 min", "NaT", "2 min", "NaT", "3 min"]),
        (False, ["NaT", "2 min", "2 min", "2 min", "2 min", "3 min"]),
    ]
    for skipna, deltas in cases:
        expected = pd.Series(pd.to_timedelta(deltas))
        result = ser.cummax(skipna=skipna)
        tm.assert_series_equal(expected, result)
|
|
|
|
def test_npdiff(self):
    """Skipped unconditionally; the statements after the skip call do not run."""
    pytest.skip("skipping due to Series no longer being an ndarray")

    # no longer works as the return type of np.diff is now nd.array
    ser = Series(np.arange(5))

    diffed = np.diff(ser)
    assert_series_equal(Series([nan, 0, 0, 0, nan]), diffed)
|
|
|
|
def _check_accum_op(self, name, datetime_series_, check_dtype=True):
|
|
func = getattr(np, name)
|
|
tm.assert_numpy_array_equal(
|
|
func(datetime_series_).values,
|
|
func(np.array(datetime_series_)),
|
|
check_dtype=check_dtype,
|
|
)
|
|
|
|
# with missing values
|
|
ts = datetime_series_.copy()
|
|
ts[::2] = np.NaN
|
|
|
|
result = func(ts)[1::2]
|
|
expected = func(np.array(ts.dropna()))
|
|
|
|
tm.assert_numpy_array_equal(result.values, expected, check_dtype=False)
|
|
|
|
def test_compress(self):
    """``Series.compress`` should keep the masked-in rows and emit a FutureWarning."""
    mask = [True, False, True, False, False]
    ser = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo")
    wanted = Series(ser.values.compress(mask), index=list("ac"), name="foo")
    with tm.assert_produces_warning(FutureWarning):
        compressed = ser.compress(mask)
    tm.assert_series_equal(compressed, wanted)
|
|
|
|
def test_numpy_compress(self):
    """Check ``np.compress`` on a Series, including the unsupported keyword arguments."""
    mask = [True, False, True, False, False]
    ser = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo")
    wanted = Series(ser.values.compress(mask), index=list("ac"), name="foo")
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        tm.assert_series_equal(np.compress(mask, ser), wanted)

    # NOTE(review): source indentation was lost; both error checks are
    # assumed to sit inside this warning context - confirm against upstream.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        msg = "the 'axis' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.compress(mask, ser, axis=1)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.compress(mask, ser, out=ser)
|
|
|
|
def test_round(self, datetime_series):
|
|
datetime_series.index.name = "index_name"
|
|
result = datetime_series.round(2)
|
|
expected = Series(
|
|
np.round(datetime_series.values, 2), index=datetime_series.index, name="ts"
|
|
)
|
|
assert_series_equal(result, expected)
|
|
assert result.name == datetime_series.name
|
|
|
|
def test_numpy_round(self):
    """``np.round`` on a Series returns a Series and rejects the ``out`` argument."""
    # See gh-12600
    ser = Series([1.53, 1.36, 0.06])
    rounded = np.round(ser, decimals=0)
    assert_series_equal(rounded, Series([2.0, 1.0, 0.0]))

    msg = "the 'out' parameter is not supported"
    with pytest.raises(ValueError, match=msg):
        np.round(ser, decimals=0, out=ser)
|
|
|
|
def test_numpy_round_nan(self):
|
|
# See gh-14197
|
|
s = Series([1.53, np.nan, 0.06])
|
|
with tm.assert_produces_warning(None):
|
|
result = s.round()
|
|
expected = Series([2.0, np.nan, 0.0])
|
|
assert_series_equal(result, expected)
|
|
|
|
def test_built_in_round(self):
|
|
s = Series([1.123, 2.123, 3.123], index=range(3))
|
|
result = round(s)
|
|
expected_rounded0 = Series([1.0, 2.0, 3.0], index=range(3))
|
|
tm.assert_series_equal(result, expected_rounded0)
|
|
|
|
decimals = 2
|
|
expected_rounded = Series([1.12, 2.12, 3.12], index=range(3))
|
|
result = round(s, decimals)
|
|
tm.assert_series_equal(result, expected_rounded)
|
|
|
|
def test_prod_numpy16_bug(self):
|
|
s = Series([1.0, 1.0, 1.0], index=range(3))
|
|
result = s.prod()
|
|
|
|
assert not isinstance(result, Series)
|
|
|
|
@td.skip_if_no_scipy
def test_corr(self, datetime_series):
    """Check Pearson ``Series.corr`` for overlap, ``min_periods`` and all-NA cases."""
    import scipy.stats as stats

    head = datetime_series[:15]
    tail = datetime_series[5:]

    # full overlap
    tm.assert_almost_equal(datetime_series.corr(datetime_series), 1)

    # partial overlap
    tm.assert_almost_equal(head.corr(tail), 1)

    # The two slices share fewer than 12 observations.
    assert isna(head.corr(tail, min_periods=12))

    full_index = datetime_series.index
    ts1 = head.reindex(full_index)
    ts2 = tail.reindex(full_index)
    assert isna(ts1.corr(ts2, min_periods=12))

    # No overlap
    assert np.isnan(datetime_series[::2].corr(datetime_series[1::2]))

    # all NA
    all_na = datetime_series[:10].copy()
    all_na[:] = np.nan
    assert isna(all_na.corr(all_na))

    # Two independently generated series, compared against scipy.
    A = tm.makeTimeSeries()
    B = tm.makeTimeSeries()
    result = A.corr(B)
    expected, _ = stats.pearsonr(A, B)
    tm.assert_almost_equal(result, expected)
|
|
|
|
@td.skip_if_no_scipy
def test_corr_rank(self):
    """Check Kendall and Spearman correlation against scipy and fixed reference values."""
    import scipy.stats as stats

    # kendall and spearman
    A = tm.makeTimeSeries()
    B = tm.makeTimeSeries()
    A[-5:] = A[:5]

    kendall = A.corr(B, method="kendall")
    tm.assert_almost_equal(kendall, stats.kendalltau(A, B)[0])

    spearman = A.corr(B, method="spearman")
    tm.assert_almost_equal(spearman, stats.spearmanr(A, B)[0])

    # results from R
    a_values = [
        -0.89926396,
        0.94209606,
        -1.03289164,
        -0.95445587,
        0.76910310,
        -0.06430576,
        -2.09704447,
        0.40660407,
        -0.89926396,
        0.94209606,
    ]
    b_values = [
        -1.01270225,
        -0.62210117,
        -1.56895827,
        0.59592943,
        -0.01680292,
        1.17258718,
        -1.06009347,
        -0.10222060,
        -0.89076239,
        0.89372375,
    ]
    A = Series(a_values)
    B = Series(b_values)
    kexp = 0.4319297
    sexp = 0.5853767
    tm.assert_almost_equal(A.corr(B, method="kendall"), kexp)
    tm.assert_almost_equal(A.corr(B, method="spearman"), sexp)
|
|
|
|
def test_corr_invalid_method(self):
    """An unknown ``method`` string must raise ``ValueError``."""
    # GH PR #22298
    left = pd.Series(np.random.randn(10))
    right = pd.Series(np.random.randn(10))
    msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
    with pytest.raises(ValueError, match=msg):
        left.corr(right, method="____")
|
|
|
|
def test_corr_callable_method(self, datetime_series):
|
|
# simple correlation example
|
|
# returns 1 if exact equality, 0 otherwise
|
|
my_corr = lambda a, b: 1.0 if (a == b).all() else 0.0
|
|
|
|
# simple example
|
|
s1 = Series([1, 2, 3, 4, 5])
|
|
s2 = Series([5, 4, 3, 2, 1])
|
|
expected = 0
|
|
tm.assert_almost_equal(s1.corr(s2, method=my_corr), expected)
|
|
|
|
# full overlap
|
|
tm.assert_almost_equal(
|
|
datetime_series.corr(datetime_series, method=my_corr), 1.0
|
|
)
|
|
|
|
# partial overlap
|
|
tm.assert_almost_equal(
|
|
datetime_series[:15].corr(datetime_series[5:], method=my_corr), 1.0
|
|
)
|
|
|
|
# No overlap
|
|
assert np.isnan(
|
|
datetime_series[::2].corr(datetime_series[1::2], method=my_corr)
|
|
)
|
|
|
|
# dataframe example
|
|
df = pd.DataFrame([s1, s2])
|
|
expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
|
|
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)
|
|
|
|
def test_cov(self, datetime_series):
|
|
# full overlap
|
|
tm.assert_almost_equal(
|
|
datetime_series.cov(datetime_series), datetime_series.std() ** 2
|
|
)
|
|
|
|
# partial overlap
|
|
tm.assert_almost_equal(
|
|
datetime_series[:15].cov(datetime_series[5:]),
|
|
datetime_series[5:15].std() ** 2,
|
|
)
|
|
|
|
# No overlap
|
|
assert np.isnan(datetime_series[::2].cov(datetime_series[1::2]))
|
|
|
|
# all NA
|
|
cp = datetime_series[:10].copy()
|
|
cp[:] = np.nan
|
|
assert isna(cp.cov(cp))
|
|
|
|
# min_periods
|
|
assert isna(datetime_series[:15].cov(datetime_series[5:], min_periods=12))
|
|
|
|
ts1 = datetime_series[:15].reindex(datetime_series.index)
|
|
ts2 = datetime_series[5:].reindex(datetime_series.index)
|
|
assert isna(ts1.cov(ts2, min_periods=12))
|
|
|
|
def test_count(self, datetime_series):
    """
    Check ``Series.count`` with and without missing values, and per index level.

    Note that the ``datetime_series`` fixture is modified in place.
    """
    assert datetime_series.count() == len(datetime_series)

    # Use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0.
    datetime_series[::2] = np.nan

    assert datetime_series.count() == np.isfinite(datetime_series).sum()

    # Counting by level 1 of a MultiIndex whose second level contains NaN.
    mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, nan, 1, 2]])
    ts = Series(np.arange(len(mi)), index=mi)

    left = ts.count(level=1)
    right = Series([2, 3, 1], index=[1, 2, nan])
    assert_series_equal(left, right)

    # Blanking one value per level-1 key should lower every count by one.
    ts.iloc[[0, 3, 5]] = nan
    assert_series_equal(ts.count(level=1), right - 1)
|
|
|
|
def test_dot(self):
    """
    Check ``Series.dot`` against DataFrame, ndarray and Series arguments.

    Also covers index alignment with a row-reordered frame and the errors
    raised for mismatched shapes and non-aligned labels.
    """
    a = Series(np.random.randn(4), index=["p", "q", "r", "s"])
    b = DataFrame(
        np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"]
    ).T

    result = a.dot(b)
    expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"])
    assert_series_equal(result, expected)

    # Check index alignment
    b2 = b.reindex(index=reversed(b.index))
    # Use the reordered frame here; calling ``a.dot(b)`` again would only
    # repeat the check above and never exercise alignment.
    result = a.dot(b2)
    assert_series_equal(result, expected)

    # Check ndarray argument
    result = a.dot(b.values)
    assert np.all(result == expected.values)
    assert_almost_equal(a.dot(b["2"].values), expected["2"])

    # Check series argument
    assert_almost_equal(a.dot(b["1"]), expected["1"])
    assert_almost_equal(a.dot(b2["1"]), expected["1"])

    msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)"
    # exception raised is of type Exception
    with pytest.raises(Exception, match=msg):
        a.dot(a.values[:3])
    msg = "matrices are not aligned"
    with pytest.raises(ValueError, match=msg):
        a.dot(b.T)
|
|
|
|
def test_matmul(self):
    """
    Check the matrix-multiplication operator for Series operands.

    Covers Series/DataFrame/ndarray/list operands on either side, operands
    of differing dtypes, and the errors raised for mismatched shapes and
    non-aligned labels.
    """
    # matmul test is for GH #10259
    a = Series(np.random.randn(4), index=["p", "q", "r", "s"])
    b = DataFrame(
        np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"]
    ).T

    # Series @ DataFrame -> Series
    result = operator.matmul(a, b)
    expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"])
    assert_series_equal(result, expected)

    # DataFrame @ Series -> Series
    result = operator.matmul(b.T, a)
    expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"])
    assert_series_equal(result, expected)

    # Series @ Series -> scalar
    result = operator.matmul(a, a)
    expected = np.dot(a.values, a.values)
    assert_almost_equal(result, expected)

    # GH 21530
    # vector (1D np.array) @ Series (__rmatmul__)
    result = operator.matmul(a.values, a)
    expected = np.dot(a.values, a.values)
    assert_almost_equal(result, expected)

    # GH 21530
    # vector (1D list) @ Series (__rmatmul__)
    result = operator.matmul(a.values.tolist(), a)
    expected = np.dot(a.values, a.values)
    assert_almost_equal(result, expected)

    # GH 21530
    # matrix (2D np.array) @ Series (__rmatmul__)
    result = operator.matmul(b.T.values, a)
    expected = np.dot(b.T.values, a.values)
    assert_almost_equal(result, expected)

    # GH 21530
    # matrix (2D nested lists) @ Series (__rmatmul__)
    result = operator.matmul(b.T.values.tolist(), a)
    expected = np.dot(b.T.values, a.values)
    assert_almost_equal(result, expected)

    # mixed dtype DataFrame @ Series
    a["p"] = int(a.p)
    result = operator.matmul(b.T, a)
    expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"])
    assert_series_equal(result, expected)

    # different dtypes DataFrame @ Series
    a = a.astype(int)
    result = operator.matmul(b.T, a)
    expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"])
    assert_series_equal(result, expected)

    # The error paths go through the operator as well, so that this test
    # covers matmul rather than repeating the ``Series.dot`` checks.
    msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)"
    # exception raised is of type Exception
    with pytest.raises(Exception, match=msg):
        operator.matmul(a, a.values[:3])
    msg = "matrices are not aligned"
    with pytest.raises(ValueError, match=msg):
        operator.matmul(a, b.T)
|
|
|
|
def test_clip(self, datetime_series):
    """Check ``clip`` and the warning-emitting ``clip_lower``/``clip_upper`` at the median."""
    median = datetime_series.median()

    with tm.assert_produces_warning(FutureWarning):
        assert datetime_series.clip_lower(median).min() == median
    with tm.assert_produces_warning(FutureWarning):
        assert datetime_series.clip_upper(median).max() == median

    assert datetime_series.clip(lower=median).min() == median
    assert datetime_series.clip(upper=median).max() == median

    # Two-sided clipping should agree with ``np.clip``.
    low, high = -0.5, 0.5
    clipped = datetime_series.clip(low, high)
    reference = np.clip(datetime_series, low, high)
    assert_series_equal(clipped, reference)
    assert isinstance(reference, Series)
|
|
|
|
def test_clip_types_and_nulls(self):
    """Clip float, string and datetime Series that each start with a missing value."""
    candidates = [
        Series([np.nan, 1.0, 2.0, 3.0]),
        Series([None, "a", "b", "c"]),
        Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")),
    ]

    for ser in candidates:
        threshold = ser[2]
        with tm.assert_produces_warning(FutureWarning):
            floor_clipped = ser.clip_lower(threshold)
        with tm.assert_produces_warning(FutureWarning):
            ceil_clipped = ser.clip_upper(threshold)

        assert floor_clipped[notna(floor_clipped)].min() == threshold
        assert ceil_clipped[notna(ceil_clipped)].max() == threshold

        # Missing values must stay exactly where they were.
        null_mask = list(isna(ser))
        assert null_mask == list(isna(floor_clipped))
        assert null_mask == list(isna(ceil_clipped))
|
|
|
|
def test_clip_with_na_args(self):
    """Should process np.nan argument as None """
    # GH # 17276
    ser = Series([1, 2, 3])
    unchanged = Series([1, 2, 3])

    assert_series_equal(ser.clip(np.nan), unchanged)
    assert_series_equal(ser.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3]))

    # GH #19992
    lower_clipped = ser.clip(lower=[0, 4, np.nan])
    assert_series_equal(lower_clipped, Series([1, 4, np.nan]))
    upper_clipped = ser.clip(upper=[1, np.nan, 1])
    assert_series_equal(upper_clipped, Series([1, np.nan, 1]))
|
|
|
|
def test_clip_against_series(self):
    """Clip a Series using Series-valued thresholds."""
    # GH #6966

    ser = Series([1.0, 1.0, 4.0])
    bound = Series([1.0, 2.0, 3.0])

    with tm.assert_produces_warning(FutureWarning):
        assert_series_equal(ser.clip_lower(bound), Series([1.0, 2.0, 4.0]))
    with tm.assert_produces_warning(FutureWarning):
        assert_series_equal(ser.clip_upper(bound), Series([1.0, 1.0, 3.0]))

    floor = Series([1.0, 2.0, 3.0])
    ceiling = Series([1.5, 2.5, 3.5])

    both_series = ser.clip(floor, ceiling)
    assert_series_equal(both_series, Series([1.0, 2.0, 3.5]))
    scalar_floor = ser.clip(1.5, ceiling)
    assert_series_equal(scalar_floor, Series([1.5, 1.5, 3.5]))
|
|
|
|
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])])
def test_clip_against_list_like(self, inplace, upper):
    """Clip with a list-like upper bound, either in place or returning a new Series."""
    # GH #15390
    data = pd.Series([5, 6, 7])
    returned = data.clip(upper=upper, inplace=inplace)
    wanted = pd.Series([1, 2, 3])

    # For the in-place variant the modified input itself is what gets checked.
    actual = data if inplace else returned
    tm.assert_series_equal(actual, wanted, check_exact=True)
|
|
|
|
def test_clip_with_datetimes(self):
|
|
|
|
# GH 11838
|
|
# naive and tz-aware datetimes
|
|
|
|
t = Timestamp("2015-12-01 09:30:30")
|
|
s = Series([Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:31:00")])
|
|
result = s.clip(upper=t)
|
|
expected = Series(
|
|
[Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:30:30")]
|
|
)
|
|
assert_series_equal(result, expected)
|
|
|
|
t = Timestamp("2015-12-01 09:30:30", tz="US/Eastern")
|
|
s = Series(
|
|
[
|
|
Timestamp("2015-12-01 09:30:00", tz="US/Eastern"),
|
|
Timestamp("2015-12-01 09:31:00", tz="US/Eastern"),
|
|
]
|
|
)
|
|
result = s.clip(upper=t)
|
|
expected = Series(
|
|
[
|
|
Timestamp("2015-12-01 09:30:00", tz="US/Eastern"),
|
|
Timestamp("2015-12-01 09:30:30", tz="US/Eastern"),
|
|
]
|
|
)
|
|
assert_series_equal(result, expected)
|
|
|
|
def test_cummethods_bool(self):
    """Check the cumulative methods on boolean data, with and without a missing value."""
    # GH 6270

    a = pd.Series([False, False, False, True, True, False, False])
    b = ~a
    c = pd.Series([False] * len(b))
    d = ~c
    # Series method name -> NumPy function producing the reference result.
    reference = {
        "cumsum": np.cumsum,
        "cumprod": np.cumprod,
        "cummin": np.minimum.accumulate,
        "cummax": np.maximum.accumulate,
    }
    for ser in (a, b, c, d):
        for method_name, np_func in reference.items():
            wanted = Series(np_func(ser.values))
            got = getattr(ser, method_name)()
            assert_series_equal(got, wanted)

    # Same methods on data that contains a missing value.
    with_missing = pd.Series([False, True, nan, False])
    expecteds = {
        "cumsum": pd.Series([0, 1, nan, 1], dtype=object),
        "cumprod": pd.Series([False, 0, nan, 0]),
        "cummin": pd.Series([False, False, nan, False]),
        "cummax": pd.Series([False, True, nan, True]),
    }

    for method_name in reference:
        got = getattr(with_missing, method_name)()
        assert_series_equal(got, expecteds[method_name])
|
|
|
|
def test_isin(self):
|
|
s = Series(["A", "B", "C", "a", "B", "B", "A", "C"])
|
|
|
|
result = s.isin(["A", "C"])
|
|
expected = Series([True, False, True, False, False, False, True, True])
|
|
assert_series_equal(result, expected)
|
|
|
|
# GH: 16012
|
|
# This specific issue has to have a series over 1e6 in len, but the
|
|
# comparison array (in_list) must be large enough so that numpy doesn't
|
|
# do a manual masking trick that will avoid this issue altogether
|
|
s = Series(list("abcdefghijk" * 10 ** 5))
|
|
# If numpy doesn't do the manual comparison/mask, these
|
|
# unorderable mixed types are what cause the exception in numpy
|
|
in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6
|
|
|
|
assert s.isin(in_list).sum() == 200000
|
|
|
|
def test_isin_with_string_scalar(self):
    """Passing a bare string to ``isin`` must raise ``TypeError``."""
    # GH4763
    msg = (
        r"only list-like objects are allowed to be passed to isin\(\),"
        r" you passed a \[str\]"
    )

    letters = Series(["A", "B", "C", "a", "B", "B", "A", "C"])
    with pytest.raises(TypeError, match=msg):
        letters.isin("a")

    words = Series(["aaa", "b", "c"])
    with pytest.raises(TypeError, match=msg):
        words.isin("aaa")
|
|
|
|
def test_isin_with_i8(self):
|
|
# GH 5021
|
|
|
|
expected = Series([True, True, False, False, False])
|
|
expected2 = Series([False, True, False, False, False])
|
|
|
|
# datetime64[ns]
|
|
s = Series(date_range("jan-01-2013", "jan-05-2013"))
|
|
|
|
result = s.isin(s[0:2])
|
|
assert_series_equal(result, expected)
|
|
|
|
result = s.isin(s[0:2].values)
|
|
assert_series_equal(result, expected)
|
|
|
|
# fails on dtype conversion in the first place
|
|
result = s.isin(s[0:2].values.astype("datetime64[D]"))
|
|
assert_series_equal(result, expected)
|
|
|
|
result = s.isin([s[1]])
|
|
assert_series_equal(result, expected2)
|
|
|
|
result = s.isin([np.datetime64(s[1])])
|
|
assert_series_equal(result, expected2)
|
|
|
|
result = s.isin(set(s[0:2]))
|
|
assert_series_equal(result, expected)
|
|
|
|
# timedelta64[ns]
|
|
s = Series(pd.to_timedelta(range(5), unit="d"))
|
|
result = s.isin(s[0:2])
|
|
assert_series_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize("empty", [[], Series(), np.array([])])
def test_isin_empty(self, empty):
    """``isin`` with an empty container should be all False."""
    # see gh-16991
    ser = Series(["a", "b"])
    all_false = Series([False, False])

    membership = ser.isin(empty)
    tm.assert_series_equal(all_false, membership)
|
|
|
|
def test_ptp(self):
    """Check ``Series.ptp`` results, its FutureWarning, and its error cases."""
    # GH21614
    N = 1000
    raw = np.random.randn(N)
    ser = Series(raw)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        assert np.ptp(ser) == np.ptp(raw)

    # GH11163
    # NOTE(review): source indentation was lost; both assertions are assumed
    # to sit inside the warning context - confirm against upstream.
    s = Series([3, 5, np.nan, -3, 10])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        assert s.ptp() == 13
        assert pd.isna(s.ptp(skipna=False))

    # Per-level ranges on a two-level index.
    mi = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]])
    s = pd.Series([1, np.nan, 7, 3, 5, np.nan], index=mi)

    by_level = pd.Series([6, 2], index=["a", "b"], dtype=np.float64)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        tm.assert_series_equal(s.ptp(level=0), by_level)

    by_level_keep_na = pd.Series([np.nan, np.nan], index=["a", "b"])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        tm.assert_series_equal(s.ptp(level=0, skipna=False), by_level_keep_na)

    # Invalid axis.
    msg = "No axis named 1 for object type <class 'pandas.core.series.Series'>"
    with pytest.raises(ValueError, match=msg):
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            s.ptp(axis=1)

    # Strings cannot be subtracted.
    s = pd.Series(["a", "b", "c", "d", "e"])
    msg = r"unsupported operand type\(s\) for -: 'str' and 'str'"
    with pytest.raises(TypeError, match=msg):
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            s.ptp()

    # ``numeric_only`` is not implemented.
    msg = r"Series\.ptp does not implement numeric_only\."
    with pytest.raises(NotImplementedError, match=msg):
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            s.ptp(numeric_only=True)
|
|
|
|
def test_repeat(self):
|
|
s = Series(np.random.randn(3), index=["a", "b", "c"])
|
|
|
|
reps = s.repeat(5)
|
|
exp = Series(s.values.repeat(5), index=s.index.values.repeat(5))
|
|
assert_series_equal(reps, exp)
|
|
|
|
to_rep = [2, 3, 4]
|
|
reps = s.repeat(to_rep)
|
|
exp = Series(s.values.repeat(to_rep), index=s.index.values.repeat(to_rep))
|
|
assert_series_equal(reps, exp)
|
|
|
|
def test_numpy_repeat(self):
    """``np.repeat`` on a Series returns a Series and rejects the ``axis`` argument."""
    ser = Series(np.arange(3), name="x")
    wanted = Series(
        ser.values.repeat(2), name="x", index=ser.index.values.repeat(2)
    )
    assert_series_equal(np.repeat(ser, 2), wanted)

    msg = "the 'axis' parameter is not supported"
    with pytest.raises(ValueError, match=msg):
        np.repeat(ser, 2, axis=0)
|
|
|
|
def test_searchsorted(self):
|
|
s = Series([1, 2, 3])
|
|
|
|
result = s.searchsorted(1, side="left")
|
|
assert is_scalar(result)
|
|
assert result == 0
|
|
|
|
result = s.searchsorted(1, side="right")
|
|
assert is_scalar(result)
|
|
assert result == 1
|
|
|
|
def test_searchsorted_numeric_dtypes_scalar(self):
|
|
s = Series([1, 2, 90, 1000, 3e9])
|
|
r = s.searchsorted(30)
|
|
assert is_scalar(r)
|
|
assert r == 2
|
|
|
|
r = s.searchsorted([30])
|
|
e = np.array([2], dtype=np.intp)
|
|
tm.assert_numpy_array_equal(r, e)
|
|
|
|
def test_searchsorted_numeric_dtypes_vector(self):
|
|
s = Series([1, 2, 90, 1000, 3e9])
|
|
r = s.searchsorted([91, 2e6])
|
|
e = np.array([3, 4], dtype=np.intp)
|
|
tm.assert_numpy_array_equal(r, e)
|
|
|
|
def test_search_sorted_datetime64_scalar(self):
|
|
s = Series(pd.date_range("20120101", periods=10, freq="2D"))
|
|
v = pd.Timestamp("20120102")
|
|
r = s.searchsorted(v)
|
|
assert is_scalar(r)
|
|
assert r == 1
|
|
|
|
def test_search_sorted_datetime64_list(self):
|
|
s = Series(pd.date_range("20120101", periods=10, freq="2D"))
|
|
v = [pd.Timestamp("20120102"), pd.Timestamp("20120104")]
|
|
r = s.searchsorted(v)
|
|
e = np.array([1, 2], dtype=np.intp)
|
|
tm.assert_numpy_array_equal(r, e)
|
|
|
|
def test_searchsorted_sorter(self):
|
|
# GH8490
|
|
s = Series([3, 1, 2])
|
|
r = s.searchsorted([0, 3], sorter=np.argsort(s))
|
|
e = np.array([0, 2], dtype=np.intp)
|
|
tm.assert_numpy_array_equal(r, e)
|
|
|
|
def test_is_monotonic(self):
    """Check the monotonicity properties on integer and datetime data."""
    # Random integers drawn from a small range.
    shuffled = Series(np.random.randint(0, 10, size=1000))
    assert not shuffled.is_monotonic

    ascending = Series(np.arange(1000))
    assert ascending.is_monotonic is True
    assert ascending.is_monotonic_increasing is True

    descending = Series(np.arange(1000, 0, -1))
    assert descending.is_monotonic_decreasing is True

    # Datetime data, first ascending and then reversed.
    dates = Series(pd.date_range("20130101", periods=10))
    assert dates.is_monotonic is True
    assert dates.is_monotonic_increasing is True

    reversed_dates = Series(list(reversed(dates.tolist())))
    assert reversed_dates.is_monotonic is False
    assert reversed_dates.is_monotonic_decreasing is True
|
|
|
|
def test_sort_index_level(self):
|
|
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
|
|
s = Series([1, 2], mi)
|
|
backwards = s.iloc[[1, 0]]
|
|
|
|
res = s.sort_index(level="A")
|
|
assert_series_equal(backwards, res)
|
|
|
|
res = s.sort_index(level=["A", "B"])
|
|
assert_series_equal(backwards, res)
|
|
|
|
res = s.sort_index(level="A", sort_remaining=False)
|
|
assert_series_equal(s, res)
|
|
|
|
res = s.sort_index(level=["A", "B"], sort_remaining=False)
|
|
assert_series_equal(s, res)
|
|
|
|
def test_apply_categorical(self):
|
|
values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
|
|
s = pd.Series(values, name="XX", index=list("abcdefg"))
|
|
result = s.apply(lambda x: x.lower())
|
|
|
|
# should be categorical dtype when the number of categories are
|
|
# the same
|
|
values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
|
|
exp = pd.Series(values, name="XX", index=list("abcdefg"))
|
|
tm.assert_series_equal(result, exp)
|
|
tm.assert_categorical_equal(result.values, exp.values)
|
|
|
|
result = s.apply(lambda x: "A")
|
|
exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg"))
|
|
tm.assert_series_equal(result, exp)
|
|
assert result.dtype == np.object
|
|
|
|
def test_shift_int(self, datetime_series):
|
|
ts = datetime_series.astype(int)
|
|
shifted = ts.shift(1)
|
|
expected = ts.astype(float).shift(1)
|
|
assert_series_equal(shifted, expected)
|
|
|
|
def test_shift_categorical(self):
|
|
# GH 9416
|
|
s = pd.Series(["a", "b", "c", "d"], dtype="category")
|
|
|
|
assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna())
|
|
|
|
sp1 = s.shift(1)
|
|
assert_index_equal(s.index, sp1.index)
|
|
assert np.all(sp1.values.codes[:1] == -1)
|
|
assert np.all(s.values.codes[:-1] == sp1.values.codes[1:])
|
|
|
|
sn2 = s.shift(-2)
|
|
assert_index_equal(s.index, sn2.index)
|
|
assert np.all(sn2.values.codes[-2:] == -1)
|
|
assert np.all(s.values.codes[2:] == sn2.values.codes[:-2])
|
|
|
|
assert_index_equal(s.values.categories, sp1.values.categories)
|
|
assert_index_equal(s.values.categories, sn2.values.categories)
|
|
|
|
def test_unstack(self):
|
|
from numpy import nan
|
|
|
|
index = MultiIndex(
|
|
levels=[["bar", "foo"], ["one", "three", "two"]],
|
|
codes=[[1, 1, 0, 0], [0, 1, 0, 2]],
|
|
)
|
|
|
|
s = Series(np.arange(4.0), index=index)
|
|
unstacked = s.unstack()
|
|
|
|
expected = DataFrame(
|
|
[[2.0, nan, 3.0], [0.0, 1.0, nan]],
|
|
index=["bar", "foo"],
|
|
columns=["one", "three", "two"],
|
|
)
|
|
|
|
assert_frame_equal(unstacked, expected)
|
|
|
|
unstacked = s.unstack(level=0)
|
|
assert_frame_equal(unstacked, expected.T)
|
|
|
|
index = MultiIndex(
|
|
levels=[["bar"], ["one", "two", "three"], [0, 1]],
|
|
codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
|
|
)
|
|
s = Series(np.random.randn(6), index=index)
|
|
exp_index = MultiIndex(
|
|
levels=[["one", "two", "three"], [0, 1]],
|
|
codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
|
|
)
|
|
expected = DataFrame({"bar": s.values}, index=exp_index).sort_index(level=0)
|
|
unstacked = s.unstack(0).sort_index()
|
|
assert_frame_equal(unstacked, expected)
|
|
|
|
# GH5873
|
|
idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]])
|
|
ts = pd.Series([1, 2], index=idx)
|
|
left = ts.unstack()
|
|
right = DataFrame([[nan, 1], [2, nan]], index=[101, 102], columns=[nan, 3.5])
|
|
assert_frame_equal(left, right)
|
|
|
|
idx = pd.MultiIndex.from_arrays(
|
|
[
|
|
["cat", "cat", "cat", "dog", "dog"],
|
|
["a", "a", "b", "a", "b"],
|
|
[1, 2, 1, 1, np.nan],
|
|
]
|
|
)
|
|
ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx)
|
|
right = DataFrame(
|
|
[[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]], columns=["cat", "dog"]
|
|
)
|
|
tpls = [("a", 1), ("a", 2), ("b", nan), ("b", 1)]
|
|
right.index = pd.MultiIndex.from_tuples(tpls)
|
|
assert_frame_equal(ts.unstack(level=0), right)
|
|
|
|
def test_value_counts_datetime(self):
    """Check ``value_counts`` on naive datetimes (Series and DatetimeIndex)."""
    # most dtypes are tested in test_base.py
    nine = pd.Timestamp("2011-01-01 09:00")
    ten = pd.Timestamp("2011-01-01 10:00")
    eleven = pd.Timestamp("2011-01-01 11:00")
    values = [nine, ten, eleven, nine, nine, eleven]

    exp_idx = pd.DatetimeIndex(
        ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"]
    )
    counts = pd.Series([3, 2, 1], index=exp_idx, name="xxx")
    fractions = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx")

    ser = pd.Series(values, name="xxx")
    # a DatetimeIndex is expected to output the same result as the Series
    idx = pd.DatetimeIndex(values, name="xxx")

    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(), counts)

    # normalize
    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(normalize=True), fractions)
|
|
|
|
def test_value_counts_datetime_tz(self):
    """Check ``value_counts`` on tz-aware datetimes (Series and DatetimeIndex)."""
    nine = pd.Timestamp("2011-01-01 09:00", tz="US/Eastern")
    ten = pd.Timestamp("2011-01-01 10:00", tz="US/Eastern")
    eleven = pd.Timestamp("2011-01-01 11:00", tz="US/Eastern")
    values = [nine, ten, eleven, nine, nine, eleven]

    exp_idx = pd.DatetimeIndex(
        ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"],
        tz="US/Eastern",
    )
    counts = pd.Series([3, 2, 1], index=exp_idx, name="xxx")
    fractions = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx")

    ser = pd.Series(values, name="xxx")
    idx = pd.DatetimeIndex(values, name="xxx")

    # plain counts, Series first and then the index
    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(), counts)

    # relative frequencies
    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(normalize=True), fractions)
|
|
|
|
def test_value_counts_period(self):
    """Check ``value_counts`` on monthly periods (Series and PeriodIndex)."""
    january = pd.Period("2011-01", freq="M")
    february = pd.Period("2011-02", freq="M")
    march = pd.Period("2011-03", freq="M")
    values = [january, february, march, january, january, march]

    exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M")
    counts = pd.Series([3, 2, 1], index=exp_idx, name="xxx")
    fractions = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx")

    ser = pd.Series(values, name="xxx")
    # a PeriodIndex is expected to output the same result as the Series
    idx = pd.PeriodIndex(values, name="xxx")

    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(), counts)

    # normalize
    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(normalize=True), fractions)
|
|
|
|
def test_value_counts_categorical_ordered(self):
    """Check ``value_counts`` on an ordered categorical (Series and index)."""
    # most dtypes are tested in test_base.py
    values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True)

    exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True)
    counts = pd.Series([3, 2, 1], index=exp_idx, name="xxx")
    fractions = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx")

    ser = pd.Series(values, name="xxx")
    # a CategoricalIndex is expected to output the same result as the Series
    idx = pd.CategoricalIndex(values, name="xxx")

    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(), counts)

    # normalize
    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(normalize=True), fractions)
|
|
|
|
def test_value_counts_categorical_not_ordered(self):
    """Check ``value_counts`` on an unordered categorical (Series and index)."""
    values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False)

    exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False)
    counts = pd.Series([3, 2, 1], index=exp_idx, name="xxx")
    fractions = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx")

    ser = pd.Series(values, name="xxx")
    # a CategoricalIndex is expected to output the same result as the Series
    idx = pd.CategoricalIndex(values, name="xxx")

    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(), counts)

    # normalize
    for obj in (ser, idx):
        tm.assert_series_equal(obj.value_counts(normalize=True), fractions)
|
|
|
|
@pytest.mark.parametrize("func", [np.any, np.all])
@pytest.mark.parametrize("kwargs", [{"keepdims": True}, {"out": object()}])
@td.skip_if_np_lt("1.15")
def test_validate_any_all_out_keepdims_raises(self, kwargs, func):
    """``np.any``/``np.all`` on a Series must reject ``keepdims`` and ``out``."""
    ser = pd.Series([1, 2])
    # the first (and, for the parametrized cases, only) keyword is the one
    # expected to be named in the error message
    rejected = next(iter(kwargs))

    template = (
        r"the '{arg}' parameter is not "
        r"supported in the pandas "
        r"implementation of {fname}\(\)"
    )
    expected_msg = template.format(arg=rejected, fname=func.__name__)

    with pytest.raises(ValueError, match=expected_msg):
        func(ser, **kwargs)
|
|
|
|
@td.skip_if_np_lt("1.15")
def test_validate_sum_initial(self):
    """``np.sum`` on a Series must reject the ``initial`` keyword."""
    ser = pd.Series([1, 2])
    expected_msg = (
        r"the 'initial' parameter is not "
        r"supported in the pandas "
        r"implementation of sum\(\)"
    )
    with pytest.raises(ValueError, match=expected_msg):
        np.sum(ser, initial=10)
|
|
|
|
def test_validate_median_initial(self):
    """``Series.median`` must reject the ``overwrite_input`` keyword."""
    ser = pd.Series([1, 2])
    expected_msg = (
        r"the 'overwrite_input' parameter is not "
        r"supported in the pandas "
        r"implementation of median\(\)"
    )
    with pytest.raises(ValueError, match=expected_msg):
        # It seems like np.median doesn't dispatch, so we use the
        # method instead of the ufunc.
        ser.median(overwrite_input=True)
|
|
|
|
@td.skip_if_np_lt("1.15")
def test_validate_stat_keepdims(self):
    """``np.sum`` on a Series must reject the ``keepdims`` keyword."""
    ser = pd.Series([1, 2])
    expected_msg = (
        r"the 'keepdims' parameter is not "
        r"supported in the pandas "
        r"implementation of sum\(\)"
    )
    with pytest.raises(ValueError, match=expected_msg):
        np.sum(ser, keepdims=True)
|
|
|
|
def test_compound_deprecated(self):
    """``compound`` must emit a FutureWarning on both Series and DataFrame."""
    ser = Series([0.1, 0.2, 0.3, 0.4])
    frame = pd.DataFrame({"s": ser})

    for obj in (ser, frame):
        with tm.assert_produces_warning(FutureWarning):
            obj.compound()
|
|
|
|
|
|
# Column names of the ``s_main_dtypes`` frame, grouped by kind; also the
# params of the ``s_main_dtypes_split`` fixture.
main_dtypes = (
    ["datetime", "datetimetz", "timedelta"]
    + ["int8", "int16", "int32", "int64"]
    + ["float32", "float64"]
    + ["uint8", "uint16", "uint32", "uint64"]
)
|
|
|
|
|
|
@pytest.fixture
def s_main_dtypes():
    """Build a five-row DataFrame holding one column per dtype.

    The columns, each named after the dtype it holds, are:

    * datetime
    * datetimetz
    * timedelta
    * [u]int{8,16,32,64}
    * float{32,64}
    """
    years = ["2003", "2002", "2001", "2002", "2005"]
    frame = pd.DataFrame(
        {
            "datetime": pd.to_datetime(years),
            "datetimetz": pd.to_datetime(years).tz_localize("US/Eastern"),
            "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]),
        }
    )

    numeric_dtypes = (
        "int8",
        "int16",
        "int32",
        "int64",
        "float32",
        "float64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
    )
    for name in numeric_dtypes:
        frame[name] = Series([3, 2, 1, 2, 5], dtype=name)

    return frame
|
|
|
|
|
|
@pytest.fixture(params=main_dtypes)
def s_main_dtypes_split(request, s_main_dtypes):
    """Return one column of ``s_main_dtypes`` per entry of ``main_dtypes``."""
    column = request.param
    return s_main_dtypes[column]
|
|
|
|
|
|
def assert_check_nselect_boundary(vals, dtype, method):
    """Assert that ``nsmallest``/``nlargest`` with n=3 selects the expected rows.

    Helper function for the 'test_boundary_{dtype}' tests.

    Parameters
    ----------
    vals : list
        Values the Series is built from.
    dtype : str or dtype
        dtype handed to the Series constructor.
    method : str
        Name of the Series method to call with ``3``. ``"nsmallest"`` expects
        the rows labelled ``[0, 1, 2]``; any other name expects ``[3, 2, 1]``.
    """
    ser = Series(vals, dtype=dtype)
    if method == "nsmallest":
        expected_labels = [0, 1, 2]
    else:
        expected_labels = [3, 2, 1]

    select = getattr(ser, method)
    tm.assert_series_equal(select(3), ser.loc[expected_labels])
|
|
|
|
|
|
class TestNLargestNSmallest:
    """Tests for ``Series.nlargest`` and ``Series.nsmallest``."""

    @pytest.mark.parametrize(
        "r",
        [
            Series([3.0, 2, 1, 2, "5"], dtype="object"),
            Series([3.0, 2, 1, 2, 5], dtype="object"),
            # not supported on some archs
            # Series([3., 2, 1, 2, 5], dtype='complex256'),
            Series([3.0, 2, 1, 2, 5], dtype="complex128"),
            Series(list("abcde")),
            Series(list("abcde"), dtype="category"),
        ],
    )
    def test_error(self, r):
        """Both methods raise TypeError for these dtypes, for every tried n."""
        pattern = "Cannot use method 'n(larg|small)est' with dtype {dt}".format(
            dt=r.dtype
        )
        selectors = (r.nlargest, r.nsmallest)
        sizes = (2, len(r), 0, -1)
        for select, size in product(selectors, sizes):
            with pytest.raises(TypeError, match=pattern):
                select(size)

    def test_nsmallest_nlargest(self, s_main_dtypes_split):
        """Check selection order, ``keep="last"`` and out-of-range ``n``."""
        # float, int, datetime64 (use i8), timedelta64 (same),
        # object that are numbers, object that are strings
        ser = s_main_dtypes_split

        assert_series_equal(ser.nsmallest(2), ser.iloc[[2, 1]])
        assert_series_equal(ser.nsmallest(2, keep="last"), ser.iloc[[2, 3]])

        # n <= 0 selects nothing
        empty = ser.iloc[0:0]
        for select in (ser.nsmallest, ser.nlargest):
            for size in (0, -1):
                assert_series_equal(select(size), empty)

        # n >= len(ser) selects everything, in sorted order
        whole = len(ser)
        for size in (whole, whole + 1):
            assert_series_equal(ser.nsmallest(size), ser.sort_values())
        for size in (whole, whole + 1):
            assert_series_equal(ser.nlargest(size), ser.iloc[[4, 0, 1, 3, 2]])

    def test_misc(self):
        """Check NaN handling, ``keep`` validation and all-equal values."""
        ser = Series([3.0, np.nan, 1, 2, 5])
        assert_series_equal(ser.nlargest(), ser.iloc[[4, 0, 3, 2]])
        assert_series_equal(ser.nsmallest(), ser.iloc[[2, 3, 0, 4]])

        msg = 'keep must be either "first", "last"'
        for select in (ser.nsmallest, ser.nlargest):
            with pytest.raises(ValueError, match=msg):
                select(keep="invalid")

        # GH 15297
        ser = Series([1] * 5, index=[1, 2, 3, 4, 5])
        expected_first = Series([1] * 3, index=[1, 2, 3])
        expected_last = Series([1] * 3, index=[5, 4, 3])

        for select in (ser.nsmallest, ser.nlargest):
            assert_series_equal(select(3), expected_first)
            assert_series_equal(select(3, keep="last"), expected_last)

    @pytest.mark.parametrize("n", range(1, 5))
    def test_n(self, n):
        """With duplicated index labels the result matches sort + head."""
        # GH 13412
        ser = Series([1, 4, 3, 2], index=[0, 0, 1, 1])

        descending_head = ser.sort_values(ascending=False).head(n)
        assert_series_equal(ser.nlargest(n), descending_head)

        ascending_head = ser.sort_values().head(n)
        assert_series_equal(ser.nsmallest(n), ascending_head)

    def test_boundary_integer(self, nselect_method, any_int_dtype):
        """Extreme values of every integer dtype are selected correctly."""
        # GH 21426
        info = np.iinfo(any_int_dtype)
        lowest, highest = info.min, info.max
        vals = [lowest, lowest + 1, highest - 1, highest]
        assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)

    def test_boundary_float(self, nselect_method, float_dtype):
        """Extreme values of every float dtype are selected correctly."""
        # GH 21426
        info = np.finfo(float_dtype)
        lowest, highest = info.min, info.max
        # neighbours of the two extremes, one step towards zero
        next_lowest, next_highest = np.nextafter(
            [lowest, highest], 0, dtype=float_dtype
        )
        vals = [lowest, next_lowest, next_highest, highest]
        assert_check_nselect_boundary(vals, float_dtype, nselect_method)

    @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
    def test_boundary_datetimelike(self, nselect_method, dtype):
        """Extreme datetime-like values are selected correctly."""
        # GH 21426
        # use int64 bounds and +1 to min_val since true minimum is NaT
        # (include min_val/NaT at end to maintain same expected_idxr)
        info = np.iinfo("int64")
        lowest, highest = info.min, info.max
        vals = [lowest + 1, lowest + 2, highest - 1, highest, lowest]
        assert_check_nselect_boundary(vals, dtype, nselect_method)

    def test_duplicate_keep_all_ties(self):
        """``keep="all"`` returns every value tied with the last selected one."""
        # see gh-16818
        ser = Series([10, 9, 8, 7, 7, 7, 7, 6])

        largest = ser.nlargest(4, keep="all")
        assert_series_equal(largest, Series([10, 9, 8, 7, 7, 7, 7]))

        smallest = ser.nsmallest(2, keep="all")
        assert_series_equal(smallest, Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]))

    @pytest.mark.parametrize(
        "data,expected", [([True, False], [True]), ([True, False, True, True], [True])]
    )
    def test_boolean(self, data, expected):
        """The largest boolean value is ``True``."""
        # GH 26154 : ensure True > False
        largest = Series(data).nlargest(1)
        assert_series_equal(largest, Series(expected))
|
|
|
|
|
|
class TestCategoricalSeriesAnalytics:
|
|
def test_count(self):
|
|
|
|
s = Series(
|
|
Categorical(
|
|
[np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
|
|
)
|
|
)
|
|
result = s.count()
|
|
assert result == 2
|
|
|
|
def test_value_counts(self):
    """Check ``value_counts`` on a categorical Series, sorted and unsorted."""
    # GH 12835
    cats = Categorical(list("abcccb"), categories=list("cabd"))
    ser = Series(cats, name="xxx")

    # unsorted: category order, with the unused category "d" counted as 0
    unsorted_index = CategoricalIndex(list("cabd"), categories=cats.categories)
    unsorted = Series([3, 1, 2, 0], name="xxx", index=unsorted_index)
    tm.assert_series_equal(ser.value_counts(sort=False), unsorted)

    # sorted: descending counts
    sorted_index = CategoricalIndex(list("cbad"), categories=cats.categories)
    by_count = Series([3, 2, 1, 0], name="xxx", index=sorted_index)
    tm.assert_series_equal(ser.value_counts(sort=True), by_count)

    # check object dtype handles the Series.name as the same
    # (tested in test_base.py)
    ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx")
    by_count = Series([3, 2, 1], name="xxx", index=["c", "b", "a"])
    tm.assert_series_equal(ser.value_counts(), by_count)
|
|
|
|
def test_value_counts_with_nan(self):
    """Check how ``value_counts`` treats missing values in categorical data.

    Covers ``dropna=True``/``dropna=False`` on data without missing values,
    then on two equivalent constructions of a categorical Series holding
    ``None``, both sorted by count and in category order.
    """
    # see gh-9443

    # sanity check: with nothing missing, ``dropna`` must make no difference
    s = Series(["a", "b", "a"], dtype="category")
    exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))

    res = s.value_counts(dropna=True)
    tm.assert_series_equal(res, exp)

    res = s.value_counts(dropna=False)
    tm.assert_series_equal(res, exp)

    # same Series via two different constructions --> same behaviour
    series = [
        Series(["a", "b", None, "a", None, None], dtype="category"),
        Series(
            Categorical(["a", "b", None, "a", None, None], categories=["a", "b"])
        ),
    ]

    for s in series:
        # None is a NaN value, so we exclude its count here
        exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
        res = s.value_counts(dropna=True)
        tm.assert_series_equal(res, exp)

        # we don't exclude the count of None and sort by counts
        exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]))
        res = s.value_counts(dropna=False)
        tm.assert_series_equal(res, exp)

        # When we aren't sorting by counts, and np.nan isn't a
        # category, it should be last.
        exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
        res = s.value_counts(dropna=False, sort=False)
        tm.assert_series_equal(res, exp)
|
|
|
|
@pytest.mark.parametrize(
    "dtype",
    [
        "int_",
        "uint",
        "float_",
        "unicode_",
        "timedelta64[h]",
        pytest.param(
            "datetime64[D]", marks=pytest.mark.xfail(reason="GH#7996", strict=False)
        ),
    ],
)
def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture):
    """Check ``duplicated``/``drop_duplicates`` on non-bool categoricals.

    Each scenario is run with the default ``keep``, ``keep="last"`` and
    ``keep=False``, both as a copying call and with ``inplace=True``.
    """
    np_dtype = np.dtype(dtype)
    cat_array = np.array([1, 2, 3, 4, 5], dtype=np_dtype)

    # (raw values, [(keyword arguments, expected ``duplicated`` mask), ...])
    scenarios = [
        # Test case 1
        (
            [1, 2, 3, 3],
            [
                ({}, [False, False, False, True]),
                ({"keep": "last"}, [False, False, True, False]),
                ({"keep": False}, [False, False, True, True]),
            ],
        ),
        # Test case 2
        (
            [1, 2, 3, 5, 3, 2, 4],
            [
                ({}, [False, False, False, False, True, True, False]),
                ({"keep": "last"}, [False, True, True, False, False, False, False]),
                ({"keep": False}, [False, True, True, False, True, True, False]),
            ],
        ),
    ]

    for raw, checks in scenarios:
        data = np.array(raw, dtype=np_dtype)
        tc = Series(Categorical(data, categories=cat_array, ordered=ordered_fixture))

        for keep_kwargs, mask in checks:
            expected = Series(mask)
            tm.assert_series_equal(tc.duplicated(**keep_kwargs), expected)
            tm.assert_series_equal(tc.drop_duplicates(**keep_kwargs), tc[~expected])
            # the in-place variant must leave the same rows behind
            sc = tc.copy()
            sc.drop_duplicates(inplace=True, **keep_kwargs)
            tm.assert_series_equal(sc, tc[~expected])
|
|
|
|
def test_drop_duplicates_categorical_bool(self, ordered_fixture):
|
|
tc = Series(
|
|
Categorical(
|
|
[True, False, True, False],
|
|
categories=[True, False],
|
|
ordered=ordered_fixture,
|
|
)
|
|
)
|
|
|
|
expected = Series([False, False, True, True])
|
|
tm.assert_series_equal(tc.duplicated(), expected)
|
|
tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
|
|
sc = tc.copy()
|
|
sc.drop_duplicates(inplace=True)
|
|
tm.assert_series_equal(sc, tc[~expected])
|
|
|
|
expected = Series([True, True, False, False])
|
|
tm.assert_series_equal(tc.duplicated(keep="last"), expected)
|
|
tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected])
|
|
sc = tc.copy()
|
|
sc.drop_duplicates(keep="last", inplace=True)
|
|
tm.assert_series_equal(sc, tc[~expected])
|
|
|
|
expected = Series([True, True, True, True])
|
|
tm.assert_series_equal(tc.duplicated(keep=False), expected)
|
|
tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
|
|
sc = tc.copy()
|
|
sc.drop_duplicates(keep=False, inplace=True)
|
|
tm.assert_series_equal(sc, tc[~expected])
|