8th day of Python challenges 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions

View File

@@ -0,0 +1,8 @@
from pandas import Categorical
class TestCategorical:
    """Shared base class giving each test a small ordered ``Categorical``."""

    def setup_method(self, method):
        """Rebuild ``self.factor`` before every test method.

        ``method`` is supplied by pytest and is not used here.
        """
        letters = list("abbaaccc")
        self.factor = Categorical(letters, ordered=True)

View File

@@ -0,0 +1,7 @@
import pytest
@pytest.fixture(params=[True, False])
def allow_fill(request):
    """Parametrized fixture supplying each boolean ``allow_fill`` value.

    Tests that request it run once with ``True`` and once with ``False``;
    the value is meant to be forwarded to ``Categorical.take``.
    """
    flag = request.param
    return flag

View File

@@ -0,0 +1,142 @@
import numpy as np
import pytest
import pandas as pd
import pandas.util.testing as tm
@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]])
def test_factorize(categories, ordered):
cat = pd.Categorical(
["b", "b", "a", "c", None], categories=categories, ordered=ordered
)
labels, uniques = pd.factorize(cat)
expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a", "c"], categories=categories, ordered=ordered
)
tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_factorized_sort():
cat = pd.Categorical(["b", "b", None, "a"])
labels, uniques = pd.factorize(cat, sort=True)
expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
expected_uniques = pd.Categorical(["a", "b"])
tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_factorized_sort_ordered():
cat = pd.Categorical(
["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True
)
labels, uniques = pd.factorize(cat, sort=True)
expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a"], categories=["c", "b", "a"], ordered=True
)
tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_isin_cats():
# GH2003
cat = pd.Categorical(["a", "b", np.nan])
result = cat.isin(["a", np.nan])
expected = np.array([True, False, True], dtype=bool)
tm.assert_numpy_array_equal(expected, result)
result = cat.isin(["a", "c"])
expected = np.array([True, False, False], dtype=bool)
tm.assert_numpy_array_equal(expected, result)
@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])])
def test_isin_empty(empty):
s = pd.Categorical(["a", "b"])
expected = np.array([False, False], dtype=bool)
result = s.isin(empty)
tm.assert_numpy_array_equal(expected, result)
class TestTake:
# https://github.com/pandas-dev/pandas/issues/20664
def test_take_warns(self):
cat = pd.Categorical(["a", "b"])
with tm.assert_produces_warning(FutureWarning):
cat.take([0, -1])
def test_take_positive_no_warning(self):
cat = pd.Categorical(["a", "b"])
with tm.assert_produces_warning(None):
cat.take([0, 0])
def test_take_bounds(self, allow_fill):
# https://github.com/pandas-dev/pandas/issues/20664
cat = pd.Categorical(["a", "b", "a"])
with pytest.raises(IndexError):
cat.take([4, 5], allow_fill=allow_fill)
def test_take_empty(self, allow_fill):
# https://github.com/pandas-dev/pandas/issues/20664
cat = pd.Categorical([], categories=["a", "b"])
with pytest.raises(IndexError):
cat.take([0], allow_fill=allow_fill)
def test_positional_take(self, ordered_fixture):
cat = pd.Categorical(
["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered_fixture
)
result = cat.take([0, 1, 2], allow_fill=False)
expected = pd.Categorical(
["a", "a", "b"], categories=cat.categories, ordered=ordered_fixture
)
tm.assert_categorical_equal(result, expected)
def test_positional_take_unobserved(self, ordered_fixture):
cat = pd.Categorical(
["a", "b"], categories=["a", "b", "c"], ordered=ordered_fixture
)
result = cat.take([1, 0], allow_fill=False)
expected = pd.Categorical(
["b", "a"], categories=cat.categories, ordered=ordered_fixture
)
tm.assert_categorical_equal(result, expected)
def test_take_allow_fill(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = pd.Categorical(["a", "a", "b"])
result = cat.take([0, -1, -1], allow_fill=True)
expected = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_take_fill_with_negative_one(self):
# -1 was a category
cat = pd.Categorical([-1, 0, 1])
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])
tm.assert_categorical_equal(result, expected)
def test_take_fill_value(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = pd.Categorical(["a", "b", "c"])
result = cat.take([0, 1, -1], fill_value="a", allow_fill=True)
expected = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
def test_take_fill_value_new_raises(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = pd.Categorical(["a", "b", "c"])
xpr = r"'fill_value' \('d'\) is not in this Categorical's categories."
with pytest.raises(TypeError, match=xpr):
cat.take([0, 1, -1], fill_value="d", allow_fill=True)

View File

@@ -0,0 +1,316 @@
import sys
import numpy as np
import pytest
from pandas.compat import PYPY
from pandas import Categorical, Index, Series
from pandas.api.types import is_scalar
import pandas.util.testing as tm
class TestCategoricalAnalytics:
def test_min_max(self):
# unordered cats have no min/max
cat = Categorical(["a", "b", "c", "d"], ordered=False)
msg = "Categorical is not ordered for operation {}"
with pytest.raises(TypeError, match=msg.format("min")):
cat.min()
with pytest.raises(TypeError, match=msg.format("max")):
cat.max()
cat = Categorical(["a", "b", "c", "d"], ordered=True)
_min = cat.min()
_max = cat.max()
assert _min == "a"
assert _max == "d"
cat = Categorical(
["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True
)
_min = cat.min()
_max = cat.max()
assert _min == "d"
assert _max == "a"
cat = Categorical(
[np.nan, "b", "c", np.nan], categories=["d", "c", "b", "a"], ordered=True
)
_min = cat.min()
_max = cat.max()
assert np.isnan(_min)
assert _max == "b"
_min = cat.min(numeric_only=True)
assert _min == "c"
_max = cat.max(numeric_only=True)
assert _max == "b"
cat = Categorical(
[np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
)
_min = cat.min()
_max = cat.max()
assert np.isnan(_min)
assert _max == 1
_min = cat.min(numeric_only=True)
assert _min == 2
_max = cat.max(numeric_only=True)
assert _max == 1
@pytest.mark.parametrize(
"values,categories,exp_mode",
[
([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
],
)
def test_mode(self, values, categories, exp_mode):
s = Categorical(values, categories=categories, ordered=True)
res = s.mode()
exp = Categorical(exp_mode, categories=categories, ordered=True)
tm.assert_categorical_equal(res, exp)
def test_searchsorted(self):
# https://github.com/pandas-dev/pandas/issues/8420
# https://github.com/pandas-dev/pandas/issues/14522
c1 = Categorical(
["cheese", "milk", "apple", "bread", "bread"],
categories=["cheese", "milk", "apple", "bread"],
ordered=True,
)
s1 = Series(c1)
c2 = Categorical(
["cheese", "milk", "apple", "bread", "bread"],
categories=["cheese", "milk", "apple", "bread"],
ordered=False,
)
s2 = Series(c2)
# Searching for single item argument, side='left' (default)
res_cat = c1.searchsorted("apple")
assert res_cat == 2
assert is_scalar(res_cat)
res_ser = s1.searchsorted("apple")
assert res_ser == 2
assert is_scalar(res_ser)
# Searching for single item array, side='left' (default)
res_cat = c1.searchsorted(["bread"])
res_ser = s1.searchsorted(["bread"])
exp = np.array([3], dtype=np.intp)
tm.assert_numpy_array_equal(res_cat, exp)
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for several items array, side='right'
res_cat = c1.searchsorted(["apple", "bread"], side="right")
res_ser = s1.searchsorted(["apple", "bread"], side="right")
exp = np.array([3, 5], dtype=np.intp)
tm.assert_numpy_array_equal(res_cat, exp)
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for a single value that is not from the Categorical
msg = r"Value\(s\) to be inserted must be in categories"
with pytest.raises(KeyError, match=msg):
c1.searchsorted("cucumber")
with pytest.raises(KeyError, match=msg):
s1.searchsorted("cucumber")
# Searching for multiple values one of each is not from the Categorical
with pytest.raises(KeyError, match=msg):
c1.searchsorted(["bread", "cucumber"])
with pytest.raises(KeyError, match=msg):
s1.searchsorted(["bread", "cucumber"])
# searchsorted call for unordered Categorical
msg = "Categorical not ordered"
with pytest.raises(ValueError, match=msg):
c2.searchsorted("apple")
with pytest.raises(ValueError, match=msg):
s2.searchsorted("apple")
def test_unique(self):
# categories are reordered based on value when ordered=False
cat = Categorical(["a", "b"])
exp = Index(["a", "b"])
res = cat.unique()
tm.assert_index_equal(res.categories, exp)
tm.assert_categorical_equal(res, cat)
cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
res = cat.unique()
tm.assert_index_equal(res.categories, exp)
tm.assert_categorical_equal(res, Categorical(exp))
cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
exp = Index(["c", "a", "b"])
res = cat.unique()
tm.assert_index_equal(res.categories, exp)
exp_cat = Categorical(exp, categories=["c", "a", "b"])
tm.assert_categorical_equal(res, exp_cat)
# nan must be removed
cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
res = cat.unique()
exp = Index(["b", "a"])
tm.assert_index_equal(res.categories, exp)
exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
tm.assert_categorical_equal(res, exp_cat)
def test_unique_ordered(self):
# keep categories order when ordered=True
cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True)
res = cat.unique()
exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
tm.assert_categorical_equal(res, exp_cat)
cat = Categorical(
["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True
)
res = cat.unique()
exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True)
tm.assert_categorical_equal(res, exp_cat)
cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True)
res = cat.unique()
exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
tm.assert_categorical_equal(res, exp_cat)
cat = Categorical(
["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True
)
res = cat.unique()
exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True)
tm.assert_categorical_equal(res, exp_cat)
def test_unique_index_series(self):
c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
# Categorical.unique sorts categories by appearance order
# if ordered=False
exp = Categorical([3, 1, 2], categories=[3, 1, 2])
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
exp = Categorical([1, 2], categories=[1, 2])
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
# Categorical.unique keeps categories order if ordered=True
exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
def test_shift(self):
# GH 9416
cat = Categorical(["a", "b", "c", "d", "a"])
# shift forward
sp1 = cat.shift(1)
xp1 = Categorical([np.nan, "a", "b", "c", "d"])
tm.assert_categorical_equal(sp1, xp1)
tm.assert_categorical_equal(cat[:-1], sp1[1:])
# shift back
sn2 = cat.shift(-2)
xp2 = Categorical(
["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"]
)
tm.assert_categorical_equal(sn2, xp2)
tm.assert_categorical_equal(cat[2:], sn2[:-2])
# shift by zero
tm.assert_categorical_equal(cat, cat.shift(0))
def test_nbytes(self):
cat = Categorical([1, 2, 3])
exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
assert cat.nbytes == exp
def test_memory_usage(self):
cat = Categorical([1, 2, 3])
# .categories is an index, so we include the hashtable
assert 0 < cat.nbytes <= cat.memory_usage()
assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
cat = Categorical(["foo", "foo", "bar"])
assert cat.memory_usage(deep=True) > cat.nbytes
if not PYPY:
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
assert abs(diff) < 100
def test_map(self):
c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
result = c.map(lambda x: x.lower())
exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
tm.assert_categorical_equal(result, exp)
c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False)
result = c.map(lambda x: x.lower())
exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
tm.assert_categorical_equal(result, exp)
result = c.map(lambda x: 1)
# GH 12766: Return an index not an array
tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
def test_validate_inplace(self):
cat = Categorical(["A", "B", "B", "C", "A"])
invalid_values = [1, "True", [1, 2, 3], 5.0]
for value in invalid_values:
with pytest.raises(ValueError):
cat.set_ordered(value=True, inplace=value)
with pytest.raises(ValueError):
cat.as_ordered(inplace=value)
with pytest.raises(ValueError):
cat.as_unordered(inplace=value)
with pytest.raises(ValueError):
cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value)
with pytest.raises(ValueError):
cat.rename_categories(["X", "Y", "Z"], inplace=value)
with pytest.raises(ValueError):
cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value)
with pytest.raises(ValueError):
cat.add_categories(new_categories=["D", "E", "F"], inplace=value)
with pytest.raises(ValueError):
cat.remove_categories(removals=["D", "E", "F"], inplace=value)
with pytest.raises(ValueError):
cat.remove_unused_categories(inplace=value)
with pytest.raises(ValueError):
cat.sort_values(inplace=value)
def test_isna(self):
exp = np.array([False, False, True])
c = Categorical(["a", "b", np.nan])
res = c.isna()
tm.assert_numpy_array_equal(res, exp)

View File

@@ -0,0 +1,506 @@
import numpy as np
import pytest
from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
from pandas.core.arrays.categorical import _recode_for_categories
from pandas.tests.arrays.categorical.common import TestCategorical
import pandas.util.testing as tm
class TestCategoricalAPI:
def test_ordered_api(self):
# GH 9347
cat1 = Categorical(list("acb"), ordered=False)
tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"]))
assert not cat1.ordered
cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False)
tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"]))
assert not cat2.ordered
cat3 = Categorical(list("acb"), ordered=True)
tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"]))
assert cat3.ordered
cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True)
tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"]))
assert cat4.ordered
def test_set_ordered(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
cat2 = cat.as_unordered()
assert not cat2.ordered
cat2 = cat.as_ordered()
assert cat2.ordered
cat2.as_unordered(inplace=True)
assert not cat2.ordered
cat2.as_ordered(inplace=True)
assert cat2.ordered
assert cat2.set_ordered(True).ordered
assert not cat2.set_ordered(False).ordered
cat2.set_ordered(True, inplace=True)
assert cat2.ordered
cat2.set_ordered(False, inplace=True)
assert not cat2.ordered
# removed in 0.19.0
msg = "can't set attribute"
with pytest.raises(AttributeError, match=msg):
cat.ordered = True
with pytest.raises(AttributeError, match=msg):
cat.ordered = False
def test_rename_categories(self):
cat = Categorical(["a", "b", "c", "a"])
# inplace=False: the old one must not be changed
res = cat.rename_categories([1, 2, 3])
tm.assert_numpy_array_equal(
res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
)
tm.assert_index_equal(res.categories, Index([1, 2, 3]))
exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
exp_cat = Index(["a", "b", "c"])
tm.assert_index_equal(cat.categories, exp_cat)
# GH18862 (let rename_categories take callables)
result = cat.rename_categories(lambda x: x.upper())
expected = Categorical(["A", "B", "C", "A"])
tm.assert_categorical_equal(result, expected)
# and now inplace
res = cat.rename_categories([1, 2, 3], inplace=True)
assert res is None
tm.assert_numpy_array_equal(
cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
)
tm.assert_index_equal(cat.categories, Index([1, 2, 3]))
# Lengthen
with pytest.raises(ValueError):
cat.rename_categories([1, 2, 3, 4])
# Shorten
with pytest.raises(ValueError):
cat.rename_categories([1, 2])
def test_rename_categories_series(self):
# https://github.com/pandas-dev/pandas/issues/17981
c = Categorical(["a", "b"])
result = c.rename_categories(Series([0, 1], index=["a", "b"]))
expected = Categorical([0, 1])
tm.assert_categorical_equal(result, expected)
def test_rename_categories_dict(self):
# GH 17336
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1})
expected = Index([4, 3, 2, 1])
tm.assert_index_equal(res.categories, expected)
# Test for inplace
res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True)
assert res is None
tm.assert_index_equal(cat.categories, expected)
# Test for dicts of smaller length
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 1, "c": 3})
expected = Index([1, "b", 3, "d"])
tm.assert_index_equal(res.categories, expected)
# Test for dicts with bigger length
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6})
expected = Index([1, 2, 3, 4])
tm.assert_index_equal(res.categories, expected)
# Test for dicts with no items from old categories
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"f": 1, "g": 3})
expected = Index(["a", "b", "c", "d"])
tm.assert_index_equal(res.categories, expected)
def test_reorder_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(
["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True
)
# first inplace == False
res = cat.reorder_categories(["c", "b", "a"])
# cat must be the same as before
tm.assert_categorical_equal(cat, old)
# only res is changed
tm.assert_categorical_equal(res, new)
# inplace == True
res = cat.reorder_categories(["c", "b", "a"], inplace=True)
assert res is None
tm.assert_categorical_equal(cat, new)
# not all "old" included in "new"
cat = Categorical(["a", "b", "c", "a"], ordered=True)
with pytest.raises(ValueError):
cat.reorder_categories(["a"])
# still not all "old" in "new"
with pytest.raises(ValueError):
cat.reorder_categories(["a", "b", "d"])
# all "old" included in "new", but too long
with pytest.raises(ValueError):
cat.reorder_categories(["a", "b", "c", "d"])
def test_add_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(
["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True
)
# first inplace == False
res = cat.add_categories("d")
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
res = cat.add_categories(["d"])
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
# inplace == True
res = cat.add_categories("d", inplace=True)
tm.assert_categorical_equal(cat, new)
assert res is None
# new is in old categories
with pytest.raises(ValueError):
cat.add_categories(["d"])
# GH 9927
cat = Categorical(list("abc"), ordered=True)
expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
# test with Series, np.array, index, list
res = cat.add_categories(Series(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(np.array(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(Index(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(["d", "e"])
tm.assert_categorical_equal(res, expected)
def test_set_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
exp_categories = Index(["c", "b", "a"])
exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
res = cat.set_categories(["c", "b", "a"], inplace=True)
tm.assert_index_equal(cat.categories, exp_categories)
tm.assert_numpy_array_equal(cat.__array__(), exp_values)
assert res is None
res = cat.set_categories(["a", "b", "c"])
# cat must be the same as before
tm.assert_index_equal(cat.categories, exp_categories)
tm.assert_numpy_array_equal(cat.__array__(), exp_values)
# only res is changed
exp_categories_back = Index(["a", "b", "c"])
tm.assert_index_equal(res.categories, exp_categories_back)
tm.assert_numpy_array_equal(res.__array__(), exp_values)
# not all "old" included in "new" -> all not included ones are now
# np.nan
cat = Categorical(["a", "b", "c", "a"], ordered=True)
res = cat.set_categories(["a"])
tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8))
# still not all "old" in "new"
res = cat.set_categories(["a", "b", "d"])
tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
# all "old" included in "new"
cat = cat.set_categories(["a", "b", "c", "d"])
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_index_equal(cat.categories, exp_categories)
# internals...
c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
tm.assert_numpy_array_equal(c.to_dense(), exp)
# all "pointers" to '4' must be changed from 3 to 0,...
c = c.set_categories([4, 3, 2, 1])
# positions are changed
tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))
# categories are now in new order
tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
# output is the same
exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
tm.assert_numpy_array_equal(c.to_dense(), exp)
assert c.min() == 4
assert c.max() == 1
# set_categories should set the ordering if specified
c2 = c.set_categories([4, 3, 2, 1], ordered=False)
assert not c2.ordered
tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense())
# set_categories should pass thru the ordering
c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
assert not c2.ordered
tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense())
@pytest.mark.parametrize(
"values, categories, new_categories",
[
# No NaNs, same cats, same order
(["a", "b", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["a", "b", "a"], ["a", "b"], ["b", "a"]),
# Same, unsorted
(["b", "a", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["b", "a", "a"], ["a", "b"], ["b", "a"]),
# NaNs
(["a", "b", "c"], ["a", "b"], ["a", "b"]),
(["a", "b", "c"], ["a", "b"], ["b", "a"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
# Introduce NaNs
(["a", "b", "c"], ["a", "b"], ["a"]),
(["a", "b", "c"], ["a", "b"], ["b"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
# No overlap
(["a", "b", "c"], ["a", "b"], ["d", "e"]),
],
)
@pytest.mark.parametrize("ordered", [True, False])
def test_set_categories_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
result = c.set_categories(new_categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_set_categories_rename_less(self):
# GH 24675
cat = Categorical(["A", "B"])
result = cat.set_categories(["A"], rename=True)
expected = Categorical(["A", np.nan])
tm.assert_categorical_equal(result, expected)
def test_set_categories_private(self):
cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
cat._set_categories(["a", "c", "d", "e"])
expected = Categorical(["a", "c", "d"], categories=list("acde"))
tm.assert_categorical_equal(cat, expected)
# fastpath
cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
cat._set_categories(["a", "c", "d", "e"], fastpath=True)
expected = Categorical(["a", "c", "d"], categories=list("acde"))
tm.assert_categorical_equal(cat, expected)
def test_remove_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True)
# first inplace == False
res = cat.remove_categories("c")
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
res = cat.remove_categories(["c"])
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
# inplace == True
res = cat.remove_categories("c", inplace=True)
tm.assert_categorical_equal(cat, new)
assert res is None
# removal is not in categories
with pytest.raises(ValueError):
cat.remove_categories(["c"])
def test_remove_unused_categories(self):
c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"])
exp_categories_all = Index(["a", "b", "c", "d", "e"])
exp_categories_dropped = Index(["a", "b", "c", "d"])
tm.assert_index_equal(c.categories, exp_categories_all)
res = c.remove_unused_categories()
tm.assert_index_equal(res.categories, exp_categories_dropped)
tm.assert_index_equal(c.categories, exp_categories_all)
res = c.remove_unused_categories(inplace=True)
tm.assert_index_equal(c.categories, exp_categories_dropped)
assert res is None
# with NaN values (GH11599)
c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"])
res = c.remove_unused_categories()
tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"])))
exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
tm.assert_numpy_array_equal(res.codes, exp_codes)
tm.assert_index_equal(c.categories, exp_categories_all)
val = ["F", np.nan, "D", "B", "D", "F", np.nan]
cat = Categorical(values=val, categories=list("ABCDEFG"))
out = cat.remove_unused_categories()
tm.assert_index_equal(out.categories, Index(["B", "D", "F"]))
exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
tm.assert_numpy_array_equal(out.codes, exp_codes)
assert out.tolist() == val
alpha = list("abcdefghijklmnopqrstuvwxyz")
val = np.random.choice(alpha[::2], 10000).astype("object")
val[np.random.choice(len(val), 100)] = np.nan
cat = Categorical(values=val, categories=alpha)
out = cat.remove_unused_categories()
assert out.tolist() == val.tolist()
class TestCategoricalAPIWithFactor(TestCategorical):
def test_describe(self):
# string type
desc = self.factor.describe()
assert self.factor.ordered
exp_index = CategoricalIndex(
["a", "b", "c"], name="categories", ordered=self.factor.ordered
)
expected = DataFrame(
{"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index
)
tm.assert_frame_equal(desc, expected)
# check unused categories
cat = self.factor.copy()
cat.set_categories(["a", "b", "c", "d"], inplace=True)
desc = cat.describe()
exp_index = CategoricalIndex(
list("abcd"), ordered=self.factor.ordered, name="categories"
)
expected = DataFrame(
{"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]},
index=exp_index,
)
tm.assert_frame_equal(desc, expected)
# check an integer one
cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
desc = cat.describe()
exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories")
expected = DataFrame(
{"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]},
index=exp_index,
)
tm.assert_frame_equal(desc, expected)
# https://github.com/pandas-dev/pandas/issues/3678
# describe should work with NaN
cat = Categorical([np.nan, 1, 2, 2])
desc = cat.describe()
expected = DataFrame(
{"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]},
index=CategoricalIndex(
[1, 2, np.nan], categories=[1, 2], name="categories"
),
)
tm.assert_frame_equal(desc, expected)
def test_set_categories_inplace(self):
cat = self.factor.copy()
cat.set_categories(["a", "b", "c", "d"], inplace=True)
tm.assert_index_equal(cat.categories, Index(["a", "b", "c", "d"]))
class TestPrivateCategoricalAPI:
def test_codes_immutable(self):
# Codes should be read only
c = Categorical(["a", "b", "c", "a", np.nan])
exp = np.array([0, 1, 2, 0, -1], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
# Assignments to codes should raise
with pytest.raises(ValueError):
c.codes = np.array([0, 1, 2, 0, 1], dtype="int8")
# changes in the codes array should raise
codes = c.codes
with pytest.raises(ValueError):
codes[4] = 1
# But even after getting the codes, the original array should still be
# writeable!
c[4] = "a"
exp = np.array([0, 1, 2, 0, 0], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
c._codes[4] = 2
exp = np.array([0, 1, 2, 0, 2], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
@pytest.mark.parametrize(
"codes, old, new, expected",
[
([0, 1], ["a", "b"], ["a", "b"], [0, 1]),
([0, 1], ["b", "a"], ["b", "a"], [0, 1]),
([0, 1], ["a", "b"], ["b", "a"], [1, 0]),
([0, 1], ["b", "a"], ["a", "b"], [1, 0]),
([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]),
([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]),
([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]),
([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]),
([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]),
([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]),
([-1, -1], [], ["a", "b"], [-1, -1]),
([1, 0], ["b", "a"], ["a", "b"], [0, 1]),
],
)
def test_recode_to_categories(self, codes, old, new, expected):
codes = np.asanyarray(codes, dtype=np.int8)
expected = np.asanyarray(expected, dtype=np.int8)
old = Index(old)
new = Index(new)
result = _recode_for_categories(codes, old, new)
tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories_large(self):
N = 1000
codes = np.arange(N)
old = Index(codes)
expected = np.arange(N - 1, -1, -1, dtype=np.int16)
new = Index(expected)
result = _recode_for_categories(codes, old, new)
tm.assert_numpy_array_equal(result, expected)
def test_deprecated_get_values(self):
cat = Categorical(["a", "b", "c", "a"])
with tm.assert_produces_warning(FutureWarning):
res = cat.get_values()
tm.assert_numpy_array_equal(res, np.array(cat))

View File

@@ -0,0 +1,603 @@
from datetime import datetime
import numpy as np
import pytest
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
NaT,
Series,
Timestamp,
date_range,
period_range,
timedelta_range,
)
import pandas.util.testing as tm
class TestCategoricalConstructors:
def test_validate_ordered(self):
# see gh-14058
exp_msg = "'ordered' must either be 'True' or 'False'"
exp_err = TypeError
# This should be a boolean.
ordered = np.array([0, 1, 2])
with pytest.raises(exp_err, match=exp_msg):
Categorical([1, 2, 3], ordered=ordered)
with pytest.raises(exp_err, match=exp_msg):
Categorical.from_codes(
[0, 0, 1], categories=["a", "b", "c"], ordered=ordered
)
def test_constructor_empty(self):
# GH 17248
c = Categorical([])
expected = Index([])
tm.assert_index_equal(c.categories, expected)
c = Categorical([], categories=[1, 2, 3])
expected = pd.Int64Index([1, 2, 3])
tm.assert_index_equal(c.categories, expected)
def test_constructor_empty_boolean(self):
# see gh-22702
cat = pd.Categorical([], categories=[True, False])
categories = sorted(cat.categories.tolist())
assert categories == [False, True]
def test_constructor_tuples(self):
values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
result = Categorical(values)
expected = Index([(1,), (1, 2)], tupleize_cols=False)
tm.assert_index_equal(result.categories, expected)
assert result.ordered is False
def test_constructor_tuples_datetimes(self):
# numpy will auto reshape when all of the tuples are the
# same len, so add an extra one with 2 items and slice it off
values = np.array(
[
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
("a", "b"),
],
dtype=object,
)[:-1]
result = Categorical(values)
expected = Index(
[(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)],
tupleize_cols=False,
)
tm.assert_index_equal(result.categories, expected)
def test_constructor_unsortable(self):
# it works!
arr = np.array([1, 2, 3, datetime.now()], dtype="O")
factor = Categorical(arr, ordered=False)
assert not factor.ordered
# this however will raise as cannot be sorted
msg = (
"'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument."
)
with pytest.raises(TypeError, match=msg):
Categorical(arr, ordered=True)
def test_constructor_interval(self):
result = Categorical(
[Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True
)
ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)])
exp = Categorical(ii, ordered=True)
tm.assert_categorical_equal(result, exp)
tm.assert_index_equal(result.categories, ii)
def test_constructor(self):
exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
c1 = Categorical(exp_arr)
tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["c", "b", "a"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
# categories must be unique
msg = "Categorical categories must be unique"
with pytest.raises(ValueError, match=msg):
Categorical([1, 2], [1, 2, 2])
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ["a", "b", "b"])
# The default should be unordered
c1 = Categorical(["a", "b", "c", "a"])
assert not c1.ordered
# Categorical as input
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
# Series of dtype category
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
# Series
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(Series(["a", "b", "c", "a"]))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"])
tm.assert_categorical_equal(c1, c2)
# This should result in integer categories, not float!
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
assert is_integer_dtype(cat.categories)
# https://github.com/pandas-dev/pandas/issues/3678
cat = Categorical([np.nan, 1, 2, 3])
assert is_integer_dtype(cat.categories)
# this should result in floats
cat = Categorical([np.nan, 1, 2.0, 3])
assert is_float_dtype(cat.categories)
cat = Categorical([np.nan, 1.0, 2.0, 3.0])
assert is_float_dtype(cat.categories)
# This doesn't work -> this would probably need some kind of "remember
# the original type" feature to try to cast the array interface result
# to...
# vals = np.asarray(cat[cat.notna()])
# assert is_integer_dtype(vals)
# corner cases
cat = Categorical([1])
assert len(cat.categories) == 1
assert cat.categories[0] == 1
assert len(cat.codes) == 1
assert cat.codes[0] == 0
cat = Categorical(["a"])
assert len(cat.categories) == 1
assert cat.categories[0] == "a"
assert len(cat.codes) == 1
assert cat.codes[0] == 0
# Scalars should be converted to lists
cat = Categorical(1)
assert len(cat.categories) == 1
assert cat.categories[0] == 1
assert len(cat.codes) == 1
assert cat.codes[0] == 0
# two arrays
# - when the first is an integer dtype and the second is not
# - when the resulting codes are all -1/NaN
with tm.assert_produces_warning(None):
c_old = Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) # noqa
with tm.assert_produces_warning(None):
c_old = Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5]) # noqa
# the next one are from the old docs
with tm.assert_produces_warning(None):
c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) # noqa
cat = Categorical([1, 2], categories=[1, 2, 3])
# this is a legitimate constructor
with tm.assert_produces_warning(None):
c = Categorical( # noqa
np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True
)
def test_constructor_with_existing_categories(self):
# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)
c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)
def test_constructor_not_sequence(self):
# https://github.com/pandas-dev/pandas/issues/16022
msg = r"^Parameter 'categories' must be list-like, was"
with pytest.raises(TypeError, match=msg):
Categorical(["a", "b"], categories="a")
def test_constructor_with_null(self):
# Cannot have NaN in categories
msg = "Categorial categories cannot be null"
with pytest.raises(ValueError, match=msg):
Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical(
DatetimeIndex(["nat", "20160101"]),
categories=[NaT, Timestamp("20160101")],
)
def test_constructor_with_index(self):
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(ci.values, Categorical(ci))
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(
ci.values, Categorical(ci.astype(object), categories=ci.categories)
)
def test_constructor_with_generator(self):
# This was raising an Error in isna(single_val).any() because isna
# returned a scalar for a generator
xrange = range
exp = Categorical([0, 1, 2])
cat = Categorical((x for x in [0, 1, 2]))
tm.assert_categorical_equal(cat, exp)
cat = Categorical(xrange(3))
tm.assert_categorical_equal(cat, exp)
# This uses xrange internally
from pandas.core.index import MultiIndex
MultiIndex.from_product([range(5), ["a", "b", "c"]])
# check that categories accept generators and sequences
cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
tm.assert_categorical_equal(cat, exp)
cat = Categorical([0, 1, 2], categories=xrange(3))
tm.assert_categorical_equal(cat, exp)
@pytest.mark.parametrize(
"dtl",
[
date_range("1995-01-01 00:00:00", periods=5, freq="s"),
date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"),
timedelta_range("1 day", periods=5, freq="s"),
],
)
def test_constructor_with_datetimelike(self, dtl):
# see gh-12077
# constructor with a datetimelike and NaT
s = Series(dtl)
c = Categorical(s)
expected = type(dtl)(s)
expected.freq = None
tm.assert_index_equal(c.categories, expected)
tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))
# with NaT
s2 = s.copy()
s2.iloc[-1] = NaT
c = Categorical(s2)
expected = type(dtl)(s2.dropna())
expected.freq = None
tm.assert_index_equal(c.categories, expected)
exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
tm.assert_numpy_array_equal(c.codes, exp)
result = repr(c)
assert "NaT" in result
def test_constructor_from_index_series_datetimetz(self):
idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern")
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_from_index_series_timedelta(self):
idx = timedelta_range("1 days", freq="D", periods=3)
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_from_index_series_period(self):
idx = period_range("2015-01-01", freq="D", periods=3)
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_invariant(self):
# GH 14190
vals = [
np.array([1.0, 1.2, 1.8, np.nan]),
np.array([1, 2, 3], dtype="int64"),
["a", "b", "c", np.nan],
[pd.Period("2014-01"), pd.Period("2014-02"), NaT],
[Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT],
[
Timestamp("2014-01-01", tz="US/Eastern"),
Timestamp("2014-01-02", tz="US/Eastern"),
NaT,
],
]
for val in vals:
c = Categorical(val)
c2 = Categorical(c)
tm.assert_categorical_equal(c, c2)
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_with_dtype(self, ordered):
categories = ["b", "a", "c"]
dtype = CategoricalDtype(categories, ordered=ordered)
result = Categorical(["a", "b", "a", "c"], dtype=dtype)
expected = Categorical(
["a", "b", "a", "c"], categories=categories, ordered=ordered
)
tm.assert_categorical_equal(result, expected)
assert result.ordered is ordered
def test_constructor_dtype_and_others_raises(self):
dtype = CategoricalDtype(["a", "b"], ordered=True)
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], categories=["a", "b"], dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=True, dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=False, dtype=dtype)
@pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_str_category(self, categories, ordered):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")
def test_constructor_from_categorical_with_dtype(self):
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use dtype.categories, not values.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_with_unknown_dtype(self):
dtype = CategoricalDtype(None, ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use values.categories, not dtype.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "d"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_string(self):
values = Categorical(["a", "b", "d"])
# use categories, ordered
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
# No string
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
tm.assert_categorical_equal(result, expected)
def test_constructor_with_categorical_categories(self):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
def test_from_codes(self):
# too few categories
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be between "
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], dtype=dtype)
# no int codes
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], dtype=dtype)
# no unique categories
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
# NaN categories included
with pytest.raises(ValueError, match="Categorial categories cannot be null"):
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
# too negative
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = r"codes need to be between -1 and len\(categories\)-1"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], dtype=dtype)
exp = Categorical(["a", "b", "c"], ordered=False)
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
tm.assert_categorical_equal(exp, res)
res = Categorical.from_codes([0, 1, 2], dtype=dtype)
tm.assert_categorical_equal(exp, res)
def test_from_codes_with_categorical_categories(self):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical.from_codes([0, 1], categories=Categorical(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
result = Categorical.from_codes(
[0, 1], categories=CategoricalIndex(["a", "b", "c"])
)
tm.assert_categorical_equal(result, expected)
# non-unique Categorical still raises
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1], Categorical(["a", "b", "a"]))
def test_from_codes_with_nan_code(self):
# GH21767
codes = [1, 2, np.nan]
dtype = CategoricalDtype(categories=["a", "b", "c"])
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, categories=dtype.categories)
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)
def test_from_codes_with_float(self):
# GH21767
codes = [1.0, 2.0, 0] # integer, but in float dtype
dtype = CategoricalDtype(categories=["a", "b", "c"])
with tm.assert_produces_warning(FutureWarning):
cat = Categorical.from_codes(codes, dtype.categories)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="i1"))
with tm.assert_produces_warning(FutureWarning):
cat = Categorical.from_codes(codes, dtype=dtype)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="i1"))
codes = [1.1, 2.0, 0] # non-integer
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype.categories)
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)
def test_from_codes_with_dtype_raises(self):
msg = "Cannot specify"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"])
)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"])
)
def test_from_codes_neither(self):
msg = "Both were None"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1])
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories(self, dtype):
cats = ["a", "b"]
codes = np.array([0, 0, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes(codes, cats)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories_sorts(self, dtype):
cats = ["b", "a"]
codes = np.array([0, 1, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_dtype(self):
cats = ["a", "b", "d"]
codes = np.array([0, 1, 0, 2], dtype="i8")
dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical(
["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_coerces(self):
cats = ["1", "2", "bad"]
codes = np.array([0, 0, 1, 2], dtype="i8")
dtype = CategoricalDtype([1, 2])
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("ordered", [None, True, False])
def test_construction_with_ordered(self, ordered):
# GH 9347, 9190
cat = Categorical([0, 1, 2], ordered=ordered)
assert cat.ordered == bool(ordered)
@pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
def test_constructor_imaginary(self):
values = [1, 2, 3 + 1j]
c1 = Categorical(values)
tm.assert_index_equal(c1.categories, Index(values))
tm.assert_numpy_array_equal(np.array(c1), np.array(values))

View File

@@ -0,0 +1,181 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas import Categorical, CategoricalIndex, Index, Series, Timestamp
import pandas.util.testing as tm
class TestCategoricalDtypes:
def test_is_equal_dtype(self):
# test dtype comparisons between cats
c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)
c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False)
c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True)
assert c1.is_dtype_equal(c1)
assert c2.is_dtype_equal(c2)
assert c3.is_dtype_equal(c3)
assert c1.is_dtype_equal(c2)
assert not c1.is_dtype_equal(c3)
assert not c1.is_dtype_equal(Index(list("aabca")))
assert not c1.is_dtype_equal(c1.astype(object))
assert c1.is_dtype_equal(CategoricalIndex(c1))
assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab")))
assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
# GH 16659
s1 = Series(c1)
s2 = Series(c2)
s3 = Series(c3)
assert c1.is_dtype_equal(s1)
assert c2.is_dtype_equal(s2)
assert c3.is_dtype_equal(s3)
assert c1.is_dtype_equal(s2)
assert not c1.is_dtype_equal(s3)
assert not c1.is_dtype_equal(s1.astype(object))
def test_set_dtype_same(self):
c = Categorical(["a", "b", "c"])
result = c._set_dtype(CategoricalDtype(["a", "b", "c"]))
tm.assert_categorical_equal(result, c)
def test_set_dtype_new_categories(self):
c = Categorical(["a", "b", "c"])
result = c._set_dtype(CategoricalDtype(list("abcd")))
tm.assert_numpy_array_equal(result.codes, c.codes)
tm.assert_index_equal(result.dtype.categories, Index(list("abcd")))
@pytest.mark.parametrize(
"values, categories, new_categories",
[
# No NaNs, same cats, same order
(["a", "b", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["a", "b", "a"], ["a", "b"], ["b", "a"]),
# Same, unsorted
(["b", "a", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["b", "a", "a"], ["a", "b"], ["b", "a"]),
# NaNs
(["a", "b", "c"], ["a", "b"], ["a", "b"]),
(["a", "b", "c"], ["a", "b"], ["b", "a"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
# Introduce NaNs
(["a", "b", "c"], ["a", "b"], ["a"]),
(["a", "b", "c"], ["a", "b"], ["b"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
# No overlap
(["a", "b", "c"], ["a", "b"], ["d", "e"]),
],
)
@pytest.mark.parametrize("ordered", [True, False])
def test_set_dtype_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
result = c._set_dtype(expected.dtype)
tm.assert_categorical_equal(result, expected)
def test_set_dtype_no_overlap(self):
c = Categorical(["a", "b", "c"], ["d", "e"])
result = c._set_dtype(CategoricalDtype(["a", "b"]))
expected = Categorical([None, None, None], categories=["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_codes_dtypes(self):
# GH 8453
result = Categorical(["foo", "bar", "baz"])
assert result.codes.dtype == "int8"
result = Categorical(["foo{i:05d}".format(i=i) for i in range(400)])
assert result.codes.dtype == "int16"
result = Categorical(["foo{i:05d}".format(i=i) for i in range(40000)])
assert result.codes.dtype == "int32"
# adding cats
result = Categorical(["foo", "bar", "baz"])
assert result.codes.dtype == "int8"
result = result.add_categories(["foo{i:05d}".format(i=i) for i in range(400)])
assert result.codes.dtype == "int16"
# removing cats
result = result.remove_categories(
["foo{i:05d}".format(i=i) for i in range(300)]
)
assert result.codes.dtype == "int8"
@pytest.mark.parametrize("ordered", [True, False])
def test_astype(self, ordered):
# string
cat = Categorical(list("abbaaccc"), ordered=ordered)
result = cat.astype(object)
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)
msg = "could not convert string to float"
with pytest.raises(ValueError, match=msg):
cat.astype(float)
# numeric
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
result = cat.astype(object)
expected = np.array(cat, dtype=object)
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(int)
expected = np.array(cat, dtype=np.int)
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(float)
expected = np.array(cat, dtype=np.float)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype_ordered", [True, False])
@pytest.mark.parametrize("cat_ordered", [True, False])
def test_astype_category(self, dtype_ordered, cat_ordered):
# GH 10696/18593
data = list("abcaacbab")
cat = Categorical(data, categories=list("bac"), ordered=cat_ordered)
# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered)
tm.assert_categorical_equal(result, expected)
# non-standard categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)
if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = cat.astype("category")
expected = cat
tm.assert_categorical_equal(result, expected)
def test_astype_category_ordered_none_deprecated(self):
# GH 26336
cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True)
cdt2 = CategoricalDtype(categories=list("cedafb"))
cat = Categorical(list("abcdaba"), dtype=cdt1)
with tm.assert_produces_warning(FutureWarning):
cat.astype(cdt2)
def test_iter_python_types(self):
# GH-19909
cat = Categorical([1, 2])
assert isinstance(list(cat)[0], int)
assert isinstance(cat.tolist()[0], int)
def test_iter_python_types_datetime(self):
cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")])
assert isinstance(list(cat)[0], Timestamp)
assert isinstance(cat.tolist()[0], Timestamp)

View File

@@ -0,0 +1,279 @@
import numpy as np
import pytest
import pandas as pd
from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series
import pandas.core.common as com
from pandas.tests.arrays.categorical.common import TestCategorical
import pandas.util.testing as tm
class TestCategoricalIndexingWithFactor(TestCategorical):
    """Get/set-item tests that use the ``self.factor`` attribute."""

    def test_getitem(self):
        """Scalar, list and boolean-mask lookups on ``self.factor``."""
        factor = self.factor
        assert factor[0] == "a"
        assert factor[-1] == "c"

        # list of positions
        head = factor[[0, 1, 2]]
        tm.assert_numpy_array_equal(head._codes, np.array([0, 1, 1], dtype=np.int8))

        # boolean mask selecting every "c"
        only_c = factor[np.asarray(factor) == "c"]
        tm.assert_numpy_array_equal(only_c._codes, np.array([2, 2, 2], dtype=np.int8))

    def test_setitem(self):
        """Positional and boolean-mask assignment on a copy of the factor."""
        # int/positional
        positional = self.factor.copy()
        positional[0] = "b"
        assert positional[0] == "b"
        positional[-1] = "a"
        assert positional[-1] == "a"

        # boolean
        masked = self.factor.copy()
        mask = np.zeros(len(masked), dtype="bool")
        # flag only the first and last positions
        mask[0] = True
        mask[-1] = True
        masked[mask] = "c"
        tm.assert_categorical_equal(
            masked,
            Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True),
        )

    @pytest.mark.parametrize(
        "other",
        [pd.Categorical(["b", "a"]), pd.Categorical(["b", "a"], categories=["b", "a"])],
    )
    def test_setitem_same_but_unordered(self, other):
        """Setting from a Categorical with the same categories in any order."""
        # GH-24142
        dest = pd.Categorical(["a", "b"], categories=["a", "b"])
        first_only = np.array([True, False])
        dest[first_only] = other[first_only]
        tm.assert_categorical_equal(
            dest, pd.Categorical(["b", "b"], categories=["a", "b"])
        )

    @pytest.mark.parametrize(
        "other",
        [
            pd.Categorical(["b", "a"], categories=["b", "a", "c"]),
            pd.Categorical(["b", "a"], categories=["a", "b", "c"]),
            pd.Categorical(["a", "a"], categories=["a"]),
            pd.Categorical(["b", "b"], categories=["b"]),
        ],
    )
    def test_setitem_different_unordered_raises(self, other):
        """Setting from a Categorical with different categories raises."""
        # GH-24142
        dest = pd.Categorical(["a", "b"], categories=["a", "b"])
        first_only = np.array([True, False])
        with pytest.raises(ValueError):
            dest[first_only] = other[first_only]

    @pytest.mark.parametrize(
        "other",
        [
            pd.Categorical(["b", "a"]),
            pd.Categorical(["b", "a"], categories=["b", "a"], ordered=True),
            pd.Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True),
        ],
    )
    def test_setitem_same_ordered_rasies(self, other):
        """An ordered target rejects each of these ``other`` values."""
        # Gh-24142
        dest = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=True)
        first_only = np.array([True, False])
        with pytest.raises(ValueError):
            dest[first_only] = other[first_only]
class TestCategoricalIndexing:
def test_getitem_listlike(self):
# GH 9469
# properly coerce the input indexers
np.random.seed(1)
c = Categorical(np.random.randint(0, 5, size=150000).astype(np.int8))
result = c.codes[np.array([100000]).astype(np.int64)]
expected = c[np.array([100000]).astype(np.int64)].codes
tm.assert_numpy_array_equal(result, expected)
def test_periodindex(self):
idx1 = PeriodIndex(
["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M"
)
cat1 = Categorical(idx1)
str(cat1)
exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
tm.assert_numpy_array_equal(cat1._codes, exp_arr)
tm.assert_index_equal(cat1.categories, exp_idx)
idx2 = PeriodIndex(
["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M"
)
cat2 = Categorical(idx2, ordered=True)
str(cat2)
exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
tm.assert_numpy_array_equal(cat2._codes, exp_arr)
tm.assert_index_equal(cat2.categories, exp_idx2)
idx3 = PeriodIndex(
[
"2013-12",
"2013-11",
"2013-10",
"2013-09",
"2013-08",
"2013-07",
"2013-05",
],
freq="M",
)
cat3 = Categorical(idx3, ordered=True)
exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
exp_idx = PeriodIndex(
[
"2013-05",
"2013-07",
"2013-08",
"2013-09",
"2013-10",
"2013-11",
"2013-12",
],
freq="M",
)
tm.assert_numpy_array_equal(cat3._codes, exp_arr)
tm.assert_index_equal(cat3.categories, exp_idx)
def test_categories_assigments(self):
s = Categorical(["a", "b", "c", "a"])
exp = np.array([1, 2, 3, 1], dtype=np.int64)
s.categories = [1, 2, 3]
tm.assert_numpy_array_equal(s.__array__(), exp)
tm.assert_index_equal(s.categories, Index([1, 2, 3]))
# lengthen
with pytest.raises(ValueError):
s.categories = [1, 2, 3, 4]
# shorten
with pytest.raises(ValueError):
s.categories = [1, 2]
# Combinations of sorted/unique:
@pytest.mark.parametrize(
"idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]]
)
# Combinations of missing/unique
@pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
@pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
# GH 21448
key = key_class(key_values, categories=range(1, 5))
# Test for flat index and CategoricalIndex with same/different cats:
for dtype in None, "category", key.dtype:
idx = Index(idx_values, dtype=dtype)
expected, exp_miss = idx.get_indexer_non_unique(key_values)
result, res_miss = idx.get_indexer_non_unique(key)
tm.assert_numpy_array_equal(expected, result)
tm.assert_numpy_array_equal(exp_miss, res_miss)
def test_where_unobserved_nan(self):
ser = pd.Series(pd.Categorical(["a", "b"]))
result = ser.where([True, False])
expected = pd.Series(pd.Categorical(["a", None], categories=["a", "b"]))
tm.assert_series_equal(result, expected)
# all NA
ser = pd.Series(pd.Categorical(["a", "b"]))
result = ser.where([False, False])
expected = pd.Series(pd.Categorical([None, None], categories=["a", "b"]))
tm.assert_series_equal(result, expected)
def test_where_unobserved_categories(self):
ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
result = ser.where([True, True, False], other="b")
expected = pd.Series(
Categorical(["a", "b", "b"], categories=ser.cat.categories)
)
tm.assert_series_equal(result, expected)
def test_where_other_categorical(self):
ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"])
result = ser.where([True, False, True], other)
expected = pd.Series(Categorical(["a", "c", "c"], dtype=ser.dtype))
tm.assert_series_equal(result, expected)
def test_where_warns(self):
ser = pd.Series(Categorical(["a", "b", "c"]))
with tm.assert_produces_warning(FutureWarning):
result = ser.where([True, False, True], "d")
expected = pd.Series(np.array(["a", "d", "c"], dtype="object"))
tm.assert_series_equal(result, expected)
def test_where_ordered_differs_rasies(self):
ser = pd.Series(
Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True)
)
other = Categorical(
["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
)
with tm.assert_produces_warning(FutureWarning):
result = ser.where([True, False, True], other)
expected = pd.Series(np.array(["a", "c", "c"], dtype=object))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean(index):
    """A boolean Categorical (or CategoricalIndex) works as a Series mask."""
    ser = Series(range(3))
    mask = Categorical([True, False, True])
    # Optionally wrap the same mask in a CategoricalIndex.
    mask = CategoricalIndex(mask) if index else mask

    assert com.is_bool_indexer(mask)
    # Masking with the categorical must equal masking with its object form.
    tm.assert_series_equal(ser[mask], ser[mask.astype("object")])
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean_raises(index):
s = Series(range(3))
idx = Categorical([True, False, None])
if index:
idx = CategoricalIndex(idx)
with pytest.raises(ValueError, match="NA / NaN"):
s[idx]
@pytest.fixture
def non_coercible_categorical(monkeypatch):
    """
    Replace Categorical.__array__ with a function that always raises.

    The patch is applied inside ``monkeypatch.context()`` around the
    ``yield``, so it is in place while the requesting test runs.

    Raises
    ------
    ValueError
        When Categorical.__array__ is called.
    """
    # TODO(Categorical): identify other places where this may be
    # useful and move to a conftest.py
    def _refuse_conversion(self, dtype=None):
        raise ValueError("I cannot be converted.")

    with monkeypatch.context() as patcher:
        patcher.setattr(Categorical, "__array__", _refuse_conversion)
        yield
def test_series_at(non_coercible_categorical):
    """``Series.at`` on categorical data returns the stored scalar."""
    ser = Series(Categorical(["a", "b", "c"]))
    first = ser.at[0]
    assert first == "a"

View File

@@ -0,0 +1,82 @@
import collections
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas import Categorical, Index, isna
import pandas.util.testing as tm
class TestCategoricalMissing:
def test_na_flags_int_categories(self):
    """``isna`` flags exactly the positions whose code is -1 (#1457)."""
    # #1457
    codes = np.random.randint(0, 10, 20)
    codes[::5] = -1  # overwrite every fifth code with -1
    categories = list(range(10))

    cat = Categorical(codes, categories, fastpath=True)
    repr(cat)  # only exercised; the returned string is discarded

    tm.assert_numpy_array_equal(isna(cat), codes == -1)
def test_nan_handling(self):
    """NaN entries get code -1 and do not appear in ``categories``."""
    # Nans are represented as -1 in codes
    cat = Categorical(["a", "b", np.nan, "a"])
    tm.assert_index_equal(cat.categories, Index(["a", "b"]))
    initial_codes = np.array([0, 1, -1, 0], dtype=np.int8)
    tm.assert_numpy_array_equal(cat._codes, initial_codes)

    # overwriting an element with NaN changes only that element's code
    cat[1] = np.nan
    tm.assert_index_equal(cat.categories, Index(["a", "b"]))
    codes_after_set = np.array([0, -1, -1, 0], dtype=np.int8)
    tm.assert_numpy_array_equal(cat._codes, codes_after_set)

    # Adding nan to categories should make assigned nan point to the
    # category!
    cat = Categorical(["a", "b", np.nan, "a"])
    tm.assert_index_equal(cat.categories, Index(["a", "b"]))
    fresh_codes = np.array([0, 1, -1, 0], dtype=np.int8)
    tm.assert_numpy_array_equal(cat._codes, fresh_codes)
def test_set_dtype_nans(self):
    """``_set_dtype`` maps values missing from the new categories to code -1."""
    original = Categorical(["a", "b", np.nan])
    new_dtype = CategoricalDtype(["a", "c"])

    recoded = original._set_dtype(new_dtype)

    want_codes = np.array([0, -1, -1], dtype="int8")
    tm.assert_numpy_array_equal(recoded.codes, want_codes)
def test_set_item_nan(self):
    """Assigning NaN to an element leaves the categories unchanged."""
    values = Categorical([1, 2, 3])
    values[1] = np.nan

    want = Categorical([1, np.nan, 3], categories=[1, 2, 3])
    tm.assert_categorical_equal(values, want)
@pytest.mark.parametrize(
    "fillna_kwargs, msg",
    [
        (
            {"value": 1, "method": "ffill"},
            "Cannot specify both 'value' and 'method'.",
        ),
        ({}, "Must specify a fill 'value' or 'method'."),
        ({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
    ],
)
def test_fillna_raises(self, fillna_kwargs, msg):
    """Invalid ``fillna`` keyword combinations raise ``ValueError`` matching ``msg``."""
    # https://github.com/pandas-dev/pandas/issues/19682
    values = Categorical([1, 2, 3])

    with pytest.raises(ValueError, match=msg):
        values.fillna(**fillna_kwargs)
@pytest.mark.parametrize("named", [True, False])
def test_fillna_iterable_category(self, named):
    """``fillna`` accepts a fill value that is itself a tuple / namedtuple category."""
    # https://github.com/pandas-dev/pandas/issues/21097
    if not named:
        Point = lambda *args: args  # tuple
    else:
        Point = collections.namedtuple("Point", "x y")

    with_gap = Categorical([Point(0, 0), Point(0, 1), None])
    filled = with_gap.fillna(Point(0, 0))

    want = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])
    tm.assert_categorical_equal(filled, want)

View File

@@ -0,0 +1,431 @@
import operator
import warnings
import numpy as np
import pytest
import pandas as pd
from pandas import Categorical, DataFrame, Series, date_range
from pandas.tests.arrays.categorical.common import TestCategorical
import pandas.util.testing as tm
class TestCategoricalOpsWithFactor(TestCategorical):
def test_categories_none_comparisons(self):
    """A Categorical rebuilt from the same values equals ``self.factor``."""
    values = ["a", "b", "b", "a", "a", "c", "c", "c"]
    rebuilt = Categorical(values, ordered=True)
    tm.assert_categorical_equal(rebuilt, self.factor)
def test_comparisons(self):
    """Comparison operators on an ordered Categorical.

    Covers scalar comparisons used as masks, element-wise comparison with
    another Categorical, the dependence of ``<`` / ``>`` on category order,
    and the operand combinations that are expected to raise ``TypeError``.
    """
    factor = self.factor

    # masking with a scalar comparison must select the same elements as the
    # same comparison performed on the plain ndarray
    scalar_cases = [
        (operator.eq, "a"),
        (operator.ne, "a"),
        (operator.lt, "c"),
        (operator.gt, "a"),
        (operator.ge, "b"),
        (operator.le, "b"),
    ]
    for compare, scalar in scalar_cases:
        result = factor[compare(factor, scalar)]
        expected = factor[compare(np.asarray(factor), scalar)]
        tm.assert_categorical_equal(result, expected)

    # element-wise equality against a shuffled copy
    n = len(factor)
    other = factor[np.random.permutation(n)]
    result = factor == other
    expected = np.asarray(factor) == np.asarray(other)
    tm.assert_numpy_array_equal(result, expected)

    # equality with a scalar that is not a category
    result = factor == "d"
    expected = np.repeat(False, len(factor))
    tm.assert_numpy_array_equal(result, expected)

    # comparisons with categoricals
    cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
    cat_rev_base = Categorical(
        ["b", "b", "b"], categories=["c", "b", "a"], ordered=True
    )
    cat = Categorical(["a", "b", "c"], ordered=True)
    cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True)

    # comparisons need to take categories ordering into account
    tm.assert_numpy_array_equal(
        cat_rev > cat_rev_base, np.array([True, False, False])
    )
    tm.assert_numpy_array_equal(
        cat_rev < cat_rev_base, np.array([False, False, True])
    )
    tm.assert_numpy_array_equal(cat > cat_base, np.array([False, False, True]))

    # Only categories with same categories can be compared
    with pytest.raises(TypeError):
        cat > cat_rev

    cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"])
    with pytest.raises(TypeError):
        cat_rev > cat_rev_base2

    # Only categories with same ordering information can be compared
    cat_unordered = cat.set_ordered(False)
    assert not (cat > cat).any()
    with pytest.raises(TypeError):
        cat > cat_unordered

    # comparison (in both directions) with Series will raise
    s = Series(["b", "b", "b"])
    msg = (
        "Cannot compare a Categorical for op __gt__ with type"
        r" <class 'numpy\.ndarray'>"
    )
    for ordered_cat in (cat, cat_rev):
        with pytest.raises(TypeError, match=msg):
            ordered_cat > s
    for ordered_cat in (cat, cat_rev):
        with pytest.raises(TypeError, match=msg):
            s < ordered_cat

    # comparison with numpy.array will raise in both direction, but only on
    # newer numpy versions
    a = np.array(["b", "b", "b"])
    for ordered_cat in (cat, cat_rev):
        with pytest.raises(TypeError, match=msg):
            ordered_cat > a

    # Make sure that unequal comparison take the categories order in
    # account
    cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True)
    exp = np.array([True, False, False])
    tm.assert_numpy_array_equal(cat_rev > "b", exp)

    # check that zero-dim array gets unboxed
    tm.assert_numpy_array_equal(cat_rev > np.array("b"), exp)
class TestCategoricalOps:
def test_compare_frame(self):
    """Comparing a Categorical with a DataFrame (GH#24282)."""
    # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
    cat = Categorical(["a", "b", 2, "a"])
    df = DataFrame(cat)

    comparison_ops = (
        operator.eq,
        operator.ne,
        operator.ge,
        operator.gt,
        operator.le,
        operator.lt,
    )
    for compare in comparison_ops:
        with pytest.raises(ValueError):
            # alignment raises unless we transpose
            compare(cat, df)

    same = cat == df.T
    tm.assert_frame_equal(same, DataFrame([[True, True, True, True]]))

    differs_from_reversed = cat[::-1] != df.T
    tm.assert_frame_equal(
        differs_from_reversed, DataFrame([[False, True, True, False]])
    )
def test_datetime_categorical_comparison(self):
    """An ordered datetime Categorical compares against its first element."""
    dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)

    greater_than_first = dt_cat > dt_cat[0]
    tm.assert_numpy_array_equal(greater_than_first, np.array([False, True, True]))

    first_less_than = dt_cat[0] < dt_cat
    tm.assert_numpy_array_equal(first_less_than, np.array([False, True, True]))
def test_reflected_comparison_with_scalars(self):
    """``cat > x`` and the reflected ``x < cat`` give the same mask (GH8658)."""
    # GH8658
    cat = Categorical([1, 2, 3], ordered=True)

    forward = cat > cat[0]
    tm.assert_numpy_array_equal(forward, np.array([False, True, True]))

    reflected = cat[0] < cat
    tm.assert_numpy_array_equal(reflected, np.array([False, True, True]))
def test_comparison_with_unknown_scalars(self):
    """Ordering comparisons with a non-category scalar raise; ``==`` / ``!=`` do not."""
    # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
    # and following comparisons with scalars not in categories should raise
    # for unequal comps, but not for equal/not equal
    cat = Categorical([1, 2, 3], ordered=True)
    template = (
        "Cannot compare a Categorical for op __{}__ with a scalar,"
        " which is not a category"
    )
    lt_msg = template.format("lt")
    gt_msg = template.format("gt")

    with pytest.raises(TypeError, match=lt_msg):
        cat < 4
    with pytest.raises(TypeError, match=gt_msg):
        cat > 4
    # reflected forms report the operator as seen from the Categorical side
    with pytest.raises(TypeError, match=gt_msg):
        4 < cat
    with pytest.raises(TypeError, match=lt_msg):
        4 > cat

    tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False]))
    tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True]))
def test_comparison_of_ordered_categorical_with_nan_to_scalar(
    self, compare_operators_no_eq_ne
):
    """Ordered Categorical with a missing value vs. a scalar matches ndarray results."""
    # https://github.com/pandas-dev/pandas/issues/26504
    # BUG: fix ordered categorical comparison with missing values (#26504 )
    # and following comparisons with scalars in categories with missing
    # values should be evaluated as False
    op_name = compare_operators_no_eq_ne
    cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
    scalar = 2

    # the reference result comes from the same dunder on the plain ndarray;
    # RuntimeWarning is silenced only for that call
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        reference = getattr(np.array(cat), op_name)(scalar)

    observed = getattr(cat, op_name)(scalar)
    tm.assert_numpy_array_equal(observed, reference)
def test_comparison_of_ordered_categorical_with_nan_to_listlike(
    self, compare_operators_no_eq_ne
):
    """Ordered Categorical with a missing value vs. a constant Categorical."""
    # https://github.com/pandas-dev/pandas/issues/26504
    # and following comparisons of missing values in ordered Categorical
    # with listlike should be evaluated as False
    op_name = compare_operators_no_eq_ne
    shared_categories = [1, 2, 3]
    cat = Categorical([1, 2, 3, None], categories=shared_categories, ordered=True)
    other = Categorical([2, 2, 2, 2], categories=shared_categories, ordered=True)

    # the reference compares the plain ndarray with the scalar 2, i.e. the
    # value that fills every position of ``other``
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        reference = getattr(np.array(cat), op_name)(2)

    observed = getattr(cat, op_name)(other)
    tm.assert_numpy_array_equal(observed, reference)
@pytest.mark.parametrize(
    "data,reverse,base",
    [(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])],
)
def test_comparisons(self, data, reverse, base):
    """Comparisons between categorical Series, scalars, plain Series and arrays."""
    cat_rev = Series(Categorical(data, categories=reverse, ordered=True))
    cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True))
    cat = Series(Categorical(data, ordered=True))
    cat_base = Series(
        Categorical(base, categories=cat.cat.categories, ordered=True)
    )
    s = Series(base)
    a = np.array(base)

    # comparisons need to take categories ordering into account
    tm.assert_series_equal(cat_rev > cat_rev_base, Series([True, False, False]))
    tm.assert_series_equal(cat_rev < cat_rev_base, Series([False, False, True]))
    tm.assert_series_equal(cat > cat_base, Series([False, False, True]))

    # comparing with a scalar agrees with comparing the underlying values
    scalar = base[1]
    res = cat > scalar
    from_values = cat.values > scalar
    tm.assert_series_equal(res, Series([False, False, True]))
    tm.assert_numpy_array_equal(res.values, from_values)

    res_rev = cat_rev > scalar
    from_values_rev = cat_rev.values > scalar
    tm.assert_series_equal(res_rev, Series([True, False, False]))
    tm.assert_numpy_array_equal(res_rev.values, from_values_rev)

    # Only categories with same categories can be compared
    with pytest.raises(TypeError):
        cat > cat_rev

    # categorical cannot be compared to Series or numpy array, and also
    # not the other way around
    msg = (
        "Cannot compare a Categorical for op __gt__ with type"
        r" <class 'numpy\.ndarray'>"
    )
    for plain in (s, a):
        for categorical in (cat, cat_rev):
            with pytest.raises(TypeError, match=msg):
                categorical > plain
    for plain in (s, a):
        for categorical in (cat, cat_rev):
            with pytest.raises(TypeError, match=msg):
                plain < categorical
@pytest.mark.parametrize(
    "ctor",
    [
        lambda *args, **kwargs: Categorical(*args, **kwargs),
        lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
    ],
)
def test_unordered_different_order_equal(self, ctor):
    """Unordered categoricals compare by value even if categories are listed differently."""
    # https://github.com/pandas-dev/pandas/issues/16014

    def build(values, categories):
        return ctor(values, categories=categories, ordered=False)

    left = build(["a", "b"], ["a", "b"])
    right = build(["a", "b"], ["b", "a"])
    assert (left == right).all()

    left = build(["a", "b"], ["a", "b"])
    right = build(["b", "a"], ["b", "a"])
    assert (left != right).all()

    left = build(["a", "a"], ["a", "b"])
    right = build(["b", "b"], ["b", "a"])
    assert (left != right).all()

    left = build(["a", "a"], ["a", "b"])
    right = build(["a", "b"], ["b", "a"])
    elementwise = left == right
    tm.assert_numpy_array_equal(np.array(elementwise), np.array([True, False]))
def test_unordered_different_categories_raises(self):
    """``==`` between unordered categoricals with different category sets raises."""
    left = Categorical(["a", "b"], categories=["a", "b"], ordered=False)
    right = Categorical(["a", "c"], categories=["c", "a"], ordered=False)

    expected_message = "Categoricals can only be compared"
    with pytest.raises(TypeError, match=expected_message):
        left == right
def test_compare_different_lengths(self):
    """``==`` between categoricals with a different number of categories raises."""
    two_categories = Categorical([], categories=["a", "b"])
    one_category = Categorical([], categories=["a"])

    with pytest.raises(TypeError, match="Categories are different lengths"):
        two_categories == one_category
def test_compare_unordered_different_order(self):
    """``equals`` is False for different values with reordered categories."""
    # https://github.com/pandas-dev/pandas/issues/16603#issuecomment-
    # 349290078
    holds_a = pd.Categorical(["a"], categories=["a", "b"])
    holds_b = pd.Categorical(["b"], categories=["b", "a"])
    assert not holds_a.equals(holds_b)
def test_numeric_like_ops(self):
    """Arithmetic, numeric reductions and ufuncs on categorical data raise ``TypeError``."""
    # (dunder name, regex-escaped operator symbol expected in the message)
    arithmetic_ops = [
        ("__add__", r"\+"),
        ("__sub__", "-"),
        ("__mul__", r"\*"),
        ("__truediv__", "/"),
    ]

    df = DataFrame({"value": np.random.randint(0, 10000, 100)})
    labels = [
        "{0} - {1}".format(start, start + 499) for start in range(0, 10000, 500)
    ]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=["value"], ascending=True)
    df["value_group"] = pd.cut(
        df.value, range(0, 10500, 500), right=False, labels=cat_labels
    )

    # numeric ops should not succeed
    for op, str_rep in arithmetic_ops:
        msg = r"Series cannot perform the operation {}".format(str_rep)
        with pytest.raises(TypeError, match=msg):
            getattr(df, op)(df)

    # reduction ops should not succeed (unless specifically defined, e.g.
    # min/max)
    s = df["value_group"]
    for reduction in ["kurt", "skew", "var", "std", "mean", "sum", "median"]:
        msg = "Categorical cannot perform the operation {}".format(reduction)
        with pytest.raises(TypeError, match=msg):
            getattr(s, reduction)(numeric_only=False)

    # mad technically works because it takes always the numeric data

    # numpy ops
    s = Series(Categorical([1, 2, 3, 4]))
    with pytest.raises(TypeError):
        np.sum(s)

    # numeric ops on a Series
    for op, str_rep in arithmetic_ops:
        msg = r"Series cannot perform the operation {}".format(str_rep)
        with pytest.raises(TypeError, match=msg):
            getattr(s, op)(2)

    # invalid ufunc
    with pytest.raises(TypeError):
        np.log(s)
def test_contains(self):
    """``in`` checks values, not codes (GH21508)."""
    # GH21508
    cat = pd.Categorical(list("aabbca"), categories=list("cab"))
    assert "b" in cat
    assert "z" not in cat
    assert np.nan not in cat
    with pytest.raises(TypeError):
        assert [1] in cat

    # assert codes NOT in index
    for code in (0, 1):
        assert code not in cat

    with_missing = pd.Categorical(list("aabbca") + [np.nan], categories=list("cab"))
    assert np.nan in with_missing
@pytest.mark.parametrize(
    "item, expected",
    [
        (pd.Interval(0, 1), True),
        (1.5, True),
        (pd.Interval(0.5, 1.5), False),
        ("a", False),
        (pd.Timestamp(1), False),
        (pd.Timedelta(1), False),
    ],
    ids=str,
)
def test_contains_interval(self, item, expected):
    """``in`` on a Categorical built from an IntervalIndex (GH 23705)."""
    # GH 23705
    breaks = range(3)
    cat = Categorical(pd.IntervalIndex.from_breaks(breaks))

    found = item in cat
    assert found is expected
def test_contains_list(self):
    """``in`` with a list operand raises ``TypeError`` (GH#21729)."""
    # GH#21729
    cat = Categorical([1, 2, 3])

    assert "a" not in cat

    for unhashable in (["a"], ["a", "b"]):
        with pytest.raises(TypeError, match="unhashable type"):
            unhashable in cat

View File

@@ -0,0 +1,527 @@
import numpy as np
from pandas import (
Categorical,
CategoricalIndex,
Series,
date_range,
option_context,
period_range,
timedelta_range,
)
from pandas.tests.arrays.categorical.common import TestCategorical
class TestCategoricalReprWithFactor(TestCategorical):
    """repr check that uses the ``self.factor`` Categorical set up by TestCategorical."""

    def test_print(self):
        """repr is the values line followed by the ordered-categories line."""
        lines = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"]
        want = "\n".join(lines)
        assert repr(self.factor) == want
class TestCategoricalRepr:
def test_big_print(self):
    """repr of a 600-element Categorical is truncated and reports its length."""
    codes = [0, 1, 2, 0, 1, 2] * 100
    factor = Categorical(codes, ["a", "b", "c"], fastpath=True)

    want = "\n".join(
        [
            "[a, b, c, a, b, ..., b, c, a, b, c]",
            "Length: 600",
            "Categories (3, object): [a, b, c]",
        ]
    )
    assert repr(factor) == want
def test_empty_print(self):
    """repr of empty Categoricals: unordered, ordered, and without categories.

    The original asserted the first case twice (``actual == expected``
    followed by ``expected == actual``); the duplicate has been dropped.
    """
    factor = Categorical([], ["a", "b", "c"])
    expected = "[], Categories (3, object): [a, b, c]"
    actual = repr(factor)
    assert actual == expected

    # ordered categories are joined with " < "
    factor = Categorical([], ["a", "b", "c"], ordered=True)
    expected = "[], Categories (3, object): [a < b < c]"
    actual = repr(factor)
    assert expected == actual

    # no values and no categories
    factor = Categorical([], [])
    expected = "[], Categories (0, object): []"
    assert expected == repr(factor)
def test_print_none_width(self):
    """repr of a categorical Series with ``display.width`` set to None (GH10087)."""
    # GH10087
    ser = Series(Categorical([1, 2, 3, 4]))
    want = (
        "0 1\n1 2\n2 3\n3 4\n"
        "dtype: category\nCategories (4, int64): [1, 2, 3, 4]"
    )

    with option_context("display.width", None):
        assert want == repr(ser)
def test_unicode_print(self):
    """Check the truncated repr for ASCII and for Japanese string categories.

    The last case repeats the Japanese check with the
    ``display.unicode.east_asian_width`` option switched on.
    """
    c = Categorical(["aaaaa", "bb", "cccc"] * 20)
    expected = """\
[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
Length: 60
Categories (3, object): [aaaaa, bb, cccc]"""
    assert repr(c) == expected
    c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
    expected = """\
[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
    assert repr(c) == expected
    # the unicode option should not affect Categorical, as it does not care
    # about the repr width
    with option_context("display.unicode.east_asian_width", True):
        c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
        expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
        assert repr(c) == expected
def test_categorical_repr(self):
    """Check repr of unordered integer Categoricals of increasing size."""
    c = Categorical([1, 2, 3])
    exp = """[1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
    assert repr(c) == exp
    # repeated values, explicit categories
    c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
    exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
    assert repr(c) == exp
    # 50 values: the values line is truncated and a Length line appears
    c = Categorical([1, 2, 3, 4, 5] * 10)
    exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1, 2, 3, 4, 5]"""
    assert repr(c) == exp
    # 20 categories: the categories line is truncated as well
    c = Categorical(np.arange(20))
    exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]"""
    assert repr(c) == exp
def test_categorical_repr_ordered(self):
    """Check repr of ordered integer Categoricals (categories joined by ``<``)."""
    c = Categorical([1, 2, 3], ordered=True)
    exp = """[1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
    assert repr(c) == exp
    # repeated values, explicit categories
    c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
    exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
    assert repr(c) == exp
    # 50 values: the values line is truncated and a Length line appears
    c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True)
    exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1 < 2 < 3 < 4 < 5]"""
    assert repr(c) == exp
    # 20 categories: the categories line is truncated as well
    c = Categorical(np.arange(20), ordered=True)
    exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]"""
    assert repr(c) == exp
def test_categorical_repr_datetime(self):
    """Check repr of unordered datetime Categoricals, naive and US/Eastern.

    NOTE(review): the whitespace-only fragments in the expected strings look
    collapsed in this copy of the file -- confirm against upstream.
    """
    idx = date_range("2011-01-01 09:00", freq="H", periods=5)
    c = Categorical(idx)
    # TODO(wesm): exceeding 80 characters in the console is not good
    # behavior
    exp = (
        "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
        "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n"
        "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
        "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
        " 2011-01-01 12:00:00, "
        "2011-01-01 13:00:00]"
        ""
    )
    assert repr(c) == exp
    # values repeated twice, categories taken from the index
    c = Categorical(idx.append(idx), categories=idx)
    exp = (
        "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
        "2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, "
        "2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, "
        "2011-01-01 13:00:00]\n"
        "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
        "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
        " 2011-01-01 12:00:00, "
        "2011-01-01 13:00:00]"
    )
    assert repr(c) == exp
    # same two cases with a timezone-aware index
    idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
    c = Categorical(idx)
    exp = (
        "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
        "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
        "2011-01-01 13:00:00-05:00]\n"
        "Categories (5, datetime64[ns, US/Eastern]): "
        "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
        " "
        "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
        " "
        "2011-01-01 13:00:00-05:00]"
    )
    assert repr(c) == exp
    c = Categorical(idx.append(idx), categories=idx)
    exp = (
        "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
        "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
        "2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, "
        "2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, "
        "2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n"
        "Categories (5, datetime64[ns, US/Eastern]): "
        "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
        " "
        "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
        " "
        "2011-01-01 13:00:00-05:00]"
    )
    assert repr(c) == exp
def test_categorical_repr_datetime_ordered(self):
    """Check repr of ordered datetime Categoricals, naive and US/Eastern.

    NOTE(review): continuation lines inside the expected literals start at
    column 0 here; their leading alignment may have been stripped in this
    copy of the file -- confirm against upstream.
    """
    idx = date_range("2011-01-01 09:00", freq="H", periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
    assert repr(c) == exp
    # values repeated twice, categories taken from the index
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa
    assert repr(c) == exp
    # same two cases with a timezone-aware index
    idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
    c = Categorical(idx, ordered=True)
    exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]""" # noqa
    assert repr(c) == exp
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]""" # noqa
    assert repr(c) == exp
def test_categorical_repr_int_with_nan(self):
    """Check repr of an integer Categorical, and of a category Series, holding NaN."""
    c = Categorical([1, 2, np.nan])
    c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]"""
    assert repr(c) == c_exp
    # same data as an object Series cast to category
    s = Series([1, 2, np.nan], dtype="object").astype("category")
    s_exp = """0 1\n1 2\n2 NaN
dtype: category
Categories (2, int64): [1, 2]"""
    assert repr(s) == s_exp
def test_categorical_repr_period(self):
    """Check repr of unordered period Categoricals (hourly and monthly).

    NOTE(review): continuation lines inside the expected literals start at
    column 0 here; their leading alignment may have been stripped in this
    copy of the file -- confirm against upstream.
    """
    idx = period_range("2011-01-01 09:00", freq="H", periods=5)
    c = Categorical(idx)
    exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]""" # noqa
    assert repr(c) == exp
    # values repeated twice, categories taken from the index
    c = Categorical(idx.append(idx), categories=idx)
    exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]""" # noqa
    assert repr(c) == exp
    # monthly frequency
    idx = period_range("2011-01", freq="M", periods=5)
    c = Categorical(idx)
    exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
    assert repr(c) == exp
    c = Categorical(idx.append(idx), categories=idx)
    exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa
    assert repr(c) == exp
def test_categorical_repr_period_ordered(self):
    """Check repr of ordered period Categoricals (hourly and monthly).

    NOTE(review): continuation lines inside the expected literals start at
    column 0 here; their leading alignment may have been stripped in this
    copy of the file -- confirm against upstream.
    """
    idx = period_range("2011-01-01 09:00", freq="H", periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]""" # noqa
    assert repr(c) == exp
    # values repeated twice, categories taken from the index
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]""" # noqa
    assert repr(c) == exp
    # monthly frequency
    idx = period_range("2011-01", freq="M", periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
    assert repr(c) == exp
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa
    assert repr(c) == exp
def test_categorical_repr_timedelta(self):
    """Check repr of unordered timedelta Categoricals with 5 and 20 categories.

    NOTE(review): continuation lines inside the expected literals start at
    column 0 here; their leading alignment may have been stripped in this
    copy of the file -- confirm against upstream.
    """
    idx = timedelta_range("1 days", periods=5)
    c = Categorical(idx)
    exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
    assert repr(c) == exp
    # values repeated twice, categories taken from the index
    c = Categorical(idx.append(idx), categories=idx)
    exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa
    assert repr(c) == exp
    # 20 categories: both the values line and the categories line are truncated
    idx = timedelta_range("1 hours", periods=20)
    c = Categorical(idx)
    exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]""" # noqa
    assert repr(c) == exp
    c = Categorical(idx.append(idx), categories=idx)
    exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]""" # noqa
    assert repr(c) == exp
def test_categorical_repr_timedelta_ordered(self):
    """Check repr of ordered timedelta Categoricals with 5 and 20 categories.

    NOTE(review): continuation lines inside the expected literals start at
    column 0 here; their leading alignment may have been stripped in this
    copy of the file -- confirm against upstream.
    """
    idx = timedelta_range("1 days", periods=5)
    c = Categorical(idx, ordered=True)
    exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
    assert repr(c) == exp
    # values repeated twice, categories taken from the index
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa
    assert repr(c) == exp
    # 20 categories: both the values line and the categories line are truncated
    idx = timedelta_range("1 hours", periods=20)
    c = Categorical(idx, ordered=True)
    exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]""" # noqa
    assert repr(c) == exp
    c = Categorical(idx.append(idx), categories=idx, ordered=True)
    exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]""" # noqa
    assert repr(c) == exp
def test_categorical_index_repr(self):
    """Check repr of an unordered integer CategoricalIndex with 3 and 10 categories."""
    idx = CategoricalIndex(Categorical([1, 2, 3]))
    exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa
    assert repr(idx) == exp
    # with 10 categories the categories list is expected to be abbreviated
    i = CategoricalIndex(Categorical(np.arange(10)))
    exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=False, dtype='category')""" # noqa
    assert repr(i) == exp
def test_categorical_index_repr_ordered(self):
    """Check repr of an ordered integer CategoricalIndex with 3 and 10 categories."""
    i = CategoricalIndex(Categorical([1, 2, 3], ordered=True))
    exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa
    assert repr(i) == exp
    # with 10 categories the categories list is expected to be abbreviated
    i = CategoricalIndex(Categorical(np.arange(10), ordered=True))
    exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=True, dtype='category')""" # noqa
    assert repr(i) == exp
def test_categorical_index_repr_datetime(self):
    """Check repr of an unordered datetime CategoricalIndex, naive and US/Eastern.

    NOTE(review): continuation lines inside the expected literals start at
    column 0 here; their leading alignment may have been stripped in this
    copy of the file -- confirm against upstream.
    """
    idx = date_range("2011-01-01 09:00", freq="H", periods=5)
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa
    assert repr(i) == exp
    # timezone-aware index
    idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
    i = CategoricalIndex(Categorical(idx))
    exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa
    assert repr(i) == exp
def test_categorical_index_repr_datetime_ordered(self):
    """Check repr of an ordered datetime CategoricalIndex, naive and US/Eastern.

    NOTE(review): continuation lines inside the expected literals start at
    column 0 here; their leading alignment may have been stripped in this
    copy of the file -- confirm against upstream.
    """
    idx = date_range("2011-01-01 09:00", freq="H", periods=5)
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa
    assert repr(i) == exp
    # timezone-aware index
    idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
    i = CategoricalIndex(Categorical(idx, ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa
    assert repr(i) == exp
    # timezone-aware values repeated twice
    i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
    exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
'2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
'2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa
    assert repr(i) == exp
def test_categorical_index_repr_period(self):
# Checks repr() of an unordered CategoricalIndex wrapping a PeriodIndex,
# for hourly ranges of 1, 2, 3 and 5 periods, a 5-period range appended to
# itself, and a monthly range.
# NOTE(review): continuation lines of the expected multi-line strings have
# no leading whitespace in this copy; confirm against the original source.
# test all length
idx = period_range("2011-01-01 09:00", freq="H", periods=1)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="H", periods=2)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="H", periods=3)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa
assert repr(i) == exp
# From five periods on, the expected repr spans several lines.
idx = period_range("2011-01-01 09:00", freq="H", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa
assert repr(i) == exp
# Duplicated values: ten values are listed but only five categories.
i = CategoricalIndex(Categorical(idx.append(idx)))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
'2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
'2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa
assert repr(i) == exp
# Monthly frequency: values are rendered as year-month only.
idx = period_range("2011-01", freq="M", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa
assert repr(i) == exp
def test_categorical_index_repr_period_ordered(self):
# Checks repr() of an ordered CategoricalIndex wrapping a PeriodIndex, for
# an hourly and a monthly 5-period range; the expected text reports
# ordered=True.
# NOTE(review): continuation lines of the expected multi-line string have
# no leading whitespace in this copy; confirm against the original source.
idx = period_range("2011-01-01 09:00", freq="H", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa
assert repr(i) == exp
# Monthly frequency fits on a single line.
idx = period_range("2011-01", freq="M", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa
assert repr(i) == exp
def test_categorical_index_repr_timedelta(self):
# Checks repr() of an unordered CategoricalIndex wrapping a
# TimedeltaIndex, for a 5-element and a 10-element range.
# NOTE(review): continuation lines of the expected multi-line string have
# no leading whitespace in this copy; confirm against the original source.
idx = timedelta_range("1 days", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" # noqa
assert repr(i) == exp
# Ten elements: the expected categories list is abbreviated with "..."
# after the eighth entry.
idx = timedelta_range("1 hours", periods=10)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=False, dtype='category')""" # noqa
assert repr(i) == exp
def test_categorical_index_repr_timedelta_ordered(self):
# Checks repr() of an ordered CategoricalIndex wrapping a TimedeltaIndex,
# for a 5-element and a 10-element range; the expected text reports
# ordered=True.
# NOTE(review): continuation lines of the expected multi-line string have
# no leading whitespace in this copy; confirm against the original source.
idx = timedelta_range("1 days", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" # noqa
assert repr(i) == exp
# Ten elements: the expected categories list is abbreviated with "..."
# after the eighth entry.
idx = timedelta_range("1 hours", periods=10)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa
assert repr(i) == exp

View File

@@ -0,0 +1,124 @@
import numpy as np
import pytest
from pandas import Categorical, Index
import pandas.util.testing as tm
class TestCategoricalSort:
def test_argsort(self):
c = Categorical([5, 3, 1, 4, 2], ordered=True)
expected = np.array([2, 4, 1, 3, 0])
tm.assert_numpy_array_equal(
c.argsort(ascending=True), expected, check_dtype=False
)
expected = expected[::-1]
tm.assert_numpy_array_equal(
c.argsort(ascending=False), expected, check_dtype=False
)
def test_numpy_argsort(self):
c = Categorical([5, 3, 1, 4, 2], ordered=True)
expected = np.array([2, 4, 1, 3, 0])
tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False)
tm.assert_numpy_array_equal(
np.argsort(c, kind="mergesort"), expected, check_dtype=False
)
msg = "the 'axis' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.argsort(c, axis=0)
msg = "the 'order' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.argsort(c, order="C")
def test_sort_values(self):
# unordered cats are sortable
cat = Categorical(["a", "b", "b", "a"], ordered=False)
cat.sort_values()
cat = Categorical(["a", "c", "b", "d"], ordered=True)
# sort_values
res = cat.sort_values()
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
cat = Categorical(
["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True
)
res = cat.sort_values()
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
res = cat.sort_values(ascending=False)
exp = np.array(["d", "c", "b", "a"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
# sort (inplace order)
cat1 = cat.copy()
cat1.sort_values(inplace=True)
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(cat1.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
# reverse
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
res = cat.sort_values(ascending=False)
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)
def test_sort_values_na_position(self):
# see gh-12882
cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
exp_categories = Index([2, 5])
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
res = cat.sort_values() # default arguments
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
res = cat.sort_values(ascending=True, na_position="first")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
res = cat.sort_values(ascending=False, na_position="first")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
res = cat.sort_values(ascending=True, na_position="last")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
res = cat.sort_values(ascending=False, na_position="last")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position="last")
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position="first")
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)

View File

@@ -0,0 +1,22 @@
from pandas import Categorical
import pandas.util.testing as tm
class TestCategoricalSubclassing:
    """``tm.SubclassedCategorical`` must survive construction and ``map``."""

    def test_constructor(self):
        """Direct construction yields the subclass with the plain values."""
        subclassed = tm.SubclassedCategorical(["a", "b", "c"])
        assert isinstance(subclassed, tm.SubclassedCategorical)

        plain = Categorical(["a", "b", "c"])
        tm.assert_categorical_equal(subclassed, plain)

    def test_from_codes(self):
        """``from_codes`` on the subclass returns the subclass."""
        codes = [1, 0, 2]
        categories = ["a", "b", "c"]

        subclassed = tm.SubclassedCategorical.from_codes(codes, categories)
        assert isinstance(subclassed, tm.SubclassedCategorical)

        plain = Categorical.from_codes([1, 0, 2], ["a", "b", "c"])
        tm.assert_categorical_equal(subclassed, plain)

    def test_map(self):
        """``map`` keeps the subclass type and applies the mapper."""
        subclassed = tm.SubclassedCategorical(["a", "b", "c"])
        mapped = subclassed.map(lambda value: value.upper())
        assert isinstance(mapped, tm.SubclassedCategorical)

        upper_plain = Categorical(["A", "B", "C"])
        tm.assert_categorical_equal(mapped, upper_plain)

View File

@@ -0,0 +1,29 @@
import pytest
import pandas as pd
import pandas.util.testing as tm
class TestCategoricalWarnings:
    """Warning behaviour of Categorical and the ``Series.cat`` accessor."""

    def test_tab_complete_warning(self, ip):
        """Tab-completing on a Categorical must not emit any warning.

        ``ip`` is a fixture supplied elsewhere -- presumably an IPython shell
        instance (it is used via ``run_code`` and ``Completer``); confirm
        against the conftest that defines it.
        """
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip("IPython", minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        # Only ``pd`` is imported inside the executed snippet, so the
        # constructor has to be qualified; a bare ``Categorical`` would be
        # unresolved and ``c`` would never be bound, leaving nothing to
        # complete on below.
        code = "import pandas as pd; c = pd.Categorical([])"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter("ignore"):
                list(ip.Completer.completions("c.", 1))

    def test_CategoricalAccessor_categorical_deprecation(self):
        """Accessing ``.cat.categorical`` raises a FutureWarning."""
        with tm.assert_produces_warning(FutureWarning):
            pd.Series(["a", "b"], dtype="category").cat.categorical

    def test_CategoricalAccessor_name_deprecation(self):
        """Accessing ``.cat.name`` raises a FutureWarning."""
        with tm.assert_produces_warning(FutureWarning):
            pd.Series(["a", "b"], dtype="category").cat.name

    def test_CategoricalAccessor_index_deprecation(self):
        """Accessing ``.cat.index`` raises a FutureWarning."""
        with tm.assert_produces_warning(FutureWarning):
            pd.Series(["a", "b"], dtype="category").cat.index

View File

@@ -0,0 +1,101 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Index,
Interval,
IntervalIndex,
Timedelta,
Timestamp,
date_range,
timedelta_range,
)
from pandas.core.arrays import IntervalArray
import pandas.util.testing as tm
@pytest.fixture(
    params=[
        (Index([0, 2, 4]), Index([1, 3, 5])),
        (Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])),
        (timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)),
        (date_range("20170101", periods=3), date_range("20170102", periods=3)),
        (
            date_range("20170101", periods=3, tz="US/Eastern"),
            date_range("20170102", periods=3, tz="US/Eastern"),
        ),
    ],
    # Label each parametrization with the dtype of its left-hand index.
    ids=lambda pair: str(pair[0].dtype),
)
def left_right_dtypes(request):
    """
    Fixture yielding a ``(left, right)`` pair of indexes, one pair per dtype,
    from which an IntervalArray can be built.
    """
    left_right = request.param
    return left_right
class TestAttributes:
    """Attribute tests shared by IntervalArray and IntervalIndex."""

    @pytest.mark.parametrize(
        "left, right",
        [
            (0, 1),
            (Timedelta("0 days"), Timedelta("1 day")),
            (Timestamp("2018-01-01"), Timestamp("2018-01-02")),
            pytest.param(
                Timestamp("2018-01-01", tz="US/Eastern"),
                Timestamp("2018-01-02", tz="US/Eastern"),
                marks=pytest.mark.xfail(strict=True, reason="GH 27011"),
            ),
        ],
    )
    @pytest.mark.parametrize("constructor", [IntervalArray, IntervalIndex])
    def test_is_empty(self, constructor, left, right, closed):
        """``is_empty`` flags only the zero-length, not-fully-closed interval."""
        # GH27219
        container = constructor.from_tuples(
            [(left, left), (left, right), np.nan], closed=closed
        )

        # The (left, left) interval is expected to be empty unless both
        # endpoints are closed; the regular interval and the NaN entry never.
        zero_length_is_empty = closed != "both"
        expected = np.array([zero_length_is_empty, False, False])

        tm.assert_numpy_array_equal(container.is_empty, expected)
class TestMethods:
    """Method-level tests for IntervalArray."""

    @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"])
    def test_set_closed(self, closed, new_closed):
        """``set_closed`` equals building the array with the new side directly."""
        # GH 21670
        breaks = range(10)
        original = IntervalArray.from_breaks(breaks, closed=closed)
        expected = IntervalArray.from_breaks(breaks, closed=new_closed)

        tm.assert_extension_array_equal(original.set_closed(new_closed), expected)

    @pytest.mark.parametrize(
        "other",
        [
            Interval(0, 1, closed="right"),
            IntervalArray.from_breaks([1, 2, 3, 4], closed="right"),
        ],
    )
    def test_where_raises(self, other):
        """``Series.where`` rejects a replacement whose ``closed`` differs."""
        left_closed = IntervalArray.from_breaks([1, 2, 3, 4], closed="left")
        ser = pd.Series(left_closed)
        mask = [True, False, True]

        match = "'value.closed' is 'right', expected 'left'."
        with pytest.raises(ValueError, match=match):
            ser.where(mask, other=other)
class TestSetitem:
def test_set_na(self, left_right_dtypes):
left, right = left_right_dtypes
result = IntervalArray.from_arrays(left, right)
result[0] = np.nan
expected_left = Index([left._na_value] + list(left[1:]))
expected_right = Index([right._na_value] + list(right[1:]))
expected = IntervalArray.from_arrays(expected_left, expected_right)
tm.assert_extension_array_equal(result, expected)
def test_repr_matches():
    """The index repr equals the array repr once "Index" is read as "Array"."""
    idx = IntervalIndex.from_breaks([1, 2, 3])
    index_repr = repr(idx)
    array_repr = repr(idx.values)
    renamed = index_repr.replace("Index", "Array")
    assert renamed == array_repr

View File

@@ -0,0 +1,90 @@
"""Tests for Interval-Interval operations, such as overlaps, contains, etc."""
import numpy as np
import pytest
from pandas import Interval, IntervalIndex, Timedelta, Timestamp
from pandas.core.arrays import IntervalArray
import pandas.util.testing as tm
@pytest.fixture(params=[IntervalArray, IntervalIndex])
def constructor(request):
    """
    Fixture yielding each interval container class (IntervalArray and
    IntervalIndex) in turn.
    """
    container_class = request.param
    return container_class
@pytest.fixture(
    params=[
        (Timedelta("0 days"), Timedelta("1 day")),
        (Timestamp("2018-01-01"), Timedelta("1 day")),
        (0, 1),
    ],
    # Label each parametrization with the type name of the start value.
    ids=lambda pair: type(pair[0]).__name__,
)
def start_shift(request):
    """
    Fixture yielding a ``(start, shift)`` pair: a start value and an
    increment that can be added to it to produce interval endpoints.
    """
    start_and_shift = request.param
    return start_and_shift
class TestOverlaps:
    """Tests for ``overlaps`` on IntervalArray / IntervalIndex."""

    def test_overlaps_interval(self, constructor, start_shift, closed, other_closed):
        """A container reports element-wise overlap with a scalar Interval."""
        start, shift = start_shift
        scalar = Interval(start, start + 3 * shift, other_closed)

        # intervals: identical, nested, spanning, partial, adjacent, disjoint
        identical = (start, start + 3 * shift)
        nested = (start + shift, start + 2 * shift)
        spanning = (start - shift, start + 4 * shift)
        partial = (start + 2 * shift, start + 4 * shift)
        touching = (start + 3 * shift, start + 4 * shift)
        disjoint = (start + 4 * shift, start + 5 * shift)
        container = constructor.from_tuples(
            [identical, nested, spanning, partial, touching, disjoint], closed
        )

        # Touching intervals overlap only if the shared endpoint is closed on
        # both sides.
        adjacent = scalar.closed_right and container.closed_left
        expected = np.array([True, True, True, True, adjacent, False])

        tm.assert_numpy_array_equal(container.overlaps(scalar), expected)

    @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex])
    def test_overlaps_interval_container(self, constructor, other_constructor):
        """Container-vs-container overlap is expected to be unimplemented."""
        # TODO: modify this test when implemented
        lhs = constructor.from_breaks(range(5))
        rhs = other_constructor.from_breaks(range(5))
        with pytest.raises(NotImplementedError):
            lhs.overlaps(rhs)

    def test_overlaps_na(self, constructor, start_shift):
        """NA values are marked as False"""
        start, shift = start_shift
        scalar = Interval(start, start + shift)

        container = constructor.from_tuples(
            [
                (start, start + shift),
                np.nan,
                (start + 2 * shift, start + 3 * shift),
            ]
        )
        expected = np.array([True, False, False])

        tm.assert_numpy_array_equal(container.overlaps(scalar), expected)

    @pytest.mark.parametrize(
        "other",
        [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")],
        ids=lambda value: type(value).__name__,
    )
    def test_overlaps_invalid_type(self, constructor, other):
        """Non-Interval arguments raise TypeError naming the offending type."""
        container = constructor.from_breaks(range(5))
        type_name = type(other).__name__
        msg = "`other` must be Interval-like, got {other}".format(other=type_name)
        with pytest.raises(TypeError, match=msg):
            container.overlaps(other)

View File

@@ -0,0 +1,119 @@
import string
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas.util.testing as tm
class TestSeriesAccessor:
    """Tests for the ``Series.sparse`` accessor."""

    # TODO: collect other Series accessor tests
    def test_to_dense(self):
        """``.sparse.to_dense()`` returns the equivalent dense Series."""
        data = [0, 1, 0, 10]
        sparse = pd.Series(data, dtype="Sparse[int64]")
        dense = pd.Series([0, 1, 0, 10])
        tm.assert_series_equal(sparse.sparse.to_dense(), dense)
class TestFrameAccessor:
    """Tests for the ``DataFrame.sparse`` accessor."""

    def test_accessor_raises(self):
        """The accessor is unavailable on a frame without sparse columns."""
        dense_frame = pd.DataFrame({"A": [0, 1]})
        with pytest.raises(AttributeError, match="sparse"):
            dense_frame.sparse

    @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
    @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
    @pytest.mark.parametrize("dtype", ["float64", "int64"])
    @td.skip_if_no_scipy
    def test_from_spmatrix(self, format, labels, dtype):
        """``from_spmatrix`` matches a dense identity cast to a sparse dtype."""
        import scipy.sparse

        # Fill value is a Python-scalar zero of the requested dtype.
        zero = np.array(0, dtype=dtype).item()
        sp_dtype = pd.SparseDtype(dtype, zero)

        mat = scipy.sparse.eye(10, format=format, dtype=dtype)
        result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)

        dense = pd.DataFrame(np.eye(10, dtype=dtype), index=labels, columns=labels)
        tm.assert_frame_equal(result, dense.astype(sp_dtype))

    @pytest.mark.parametrize(
        "columns",
        [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]],
    )
    @td.skip_if_no_scipy
    def test_from_spmatrix_columns(self, columns):
        """``from_spmatrix`` accepts flat, MultiIndex and duplicate columns."""
        import scipy.sparse

        mat = scipy.sparse.random(10, 2, density=0.5)
        result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)

        sp_dtype = pd.SparseDtype("float64", 0.0)
        dense = pd.DataFrame(mat.toarray(), columns=columns)
        tm.assert_frame_equal(result, dense.astype(sp_dtype))

    @td.skip_if_no_scipy
    def test_to_coo(self):
        """``to_coo`` produces the same matrix as converting the dense values."""
        import scipy.sparse

        data = {"A": [0, 1, 0], "B": [1, 0, 0]}
        df = pd.DataFrame(data, dtype="Sparse[int64, 0]")
        expected = scipy.sparse.coo_matrix(np.asarray(df))
        result = df.sparse.to_coo()
        # No stored entry may differ between the two matrices.
        difference = result != expected
        assert difference.nnz == 0

    def test_to_dense(self):
        """``to_dense`` converts every sparse column, whatever its fill value."""
        labels = ["b", "a"]
        sparse_columns = {
            "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 0)),
            "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 1)),
            "C": pd.SparseArray([1.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)),
        }
        df = pd.DataFrame(sparse_columns, index=labels)

        expected = pd.DataFrame(
            {"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=["b", "a"]
        )
        tm.assert_frame_equal(df.sparse.to_dense(), expected)

    def test_density(self):
        """``density`` is the share of entries differing from the fill value."""
        col_a = pd.SparseArray([1, 0, 2, 1], fill_value=0)
        col_b = pd.SparseArray([0, 1, 1, 1], fill_value=0)
        df = pd.DataFrame({"A": col_a, "B": col_b})

        # Six of the eight entries are non-zero.
        assert df.sparse.density == 0.75

    @pytest.mark.parametrize("dtype", ["int64", "float64"])
    @pytest.mark.parametrize("dense_index", [True, False])
    @td.skip_if_no_scipy
    def test_series_from_coo(self, dtype, dense_index):
        """``Series.sparse.from_coo`` builds a MultiIndexed sparse Series."""
        import scipy.sparse

        A = scipy.sparse.eye(3, format="coo", dtype=dtype)
        result = pd.Series.sparse.from_coo(A, dense_index=dense_index)

        diagonal = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)])
        ones = pd.SparseArray(np.array([1, 1, 1], dtype=dtype))
        expected = pd.Series(ones, index=diagonal)
        if dense_index:
            # A dense index covers the full row/column product.
            full_index = pd.MultiIndex.from_product(diagonal.levels)
            expected = expected.reindex(full_index)

        tm.assert_series_equal(result, expected)

    @td.skip_if_no_scipy
    def test_series_from_coo_incorrect_format_raises(self):
        """A non-COO matrix is rejected with a TypeError."""
        # gh-26554
        import scipy.sparse

        csr = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 0]]))
        expected_msg = "Expected coo_matrix. Got csr_matrix instead."
        with pytest.raises(TypeError, match=expected_msg):
            pd.Series.sparse.from_coo(csr)

View File

@@ -0,0 +1,506 @@
import operator
import numpy as np
import pytest
import pandas as pd
from pandas.core import ops
from pandas.core.sparse.api import SparseDtype
import pandas.util.testing as tm
@pytest.fixture(params=["integer", "block"])
def kind(request):
    """Value for the ``kind`` keyword passed to SparseArray/SparseSeries."""
    sparse_kind = request.param
    return sparse_kind
@pytest.fixture(params=[True, False])
def mix(request):
    # True: run op(sparse, dense); False: run op(sparse, sparse).
    use_dense_rhs = request.param
    return use_dense_rhs
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning")
class TestSparseArrayArithmetics:
_base = np.array
_klass = pd.SparseArray
def _assert(self, a, b):
tm.assert_numpy_array_equal(a, b)
def _check_numeric_ops(self, a, b, a_dense, b_dense, mix, op):
with np.errstate(invalid="ignore", divide="ignore"):
if op in [operator.floordiv, ops.rfloordiv]:
# FIXME: GH#13843
if self._base == pd.Series and a.dtype.subtype == np.dtype("int64"):
pytest.xfail("Not defined/working. See GH#13843")
if mix:
result = op(a, b_dense).to_dense()
else:
result = op(a, b).to_dense()
if op in [operator.truediv, ops.rtruediv]:
# pandas uses future division
expected = op(a_dense * 1.0, b_dense)
else:
expected = op(a_dense, b_dense)
if op in [operator.floordiv, ops.rfloordiv]:
# Series sets 1//0 to np.inf, which SparseArray does not do (yet)
mask = np.isinf(expected)
if mask.any():
expected[mask] = np.nan
self._assert(result, expected)
def _check_bool_result(self, res):
assert isinstance(res, self._klass)
assert isinstance(res.dtype, SparseDtype)
assert res.dtype.subtype == np.bool
assert isinstance(res.fill_value, bool)
def _check_comparison_ops(self, a, b, a_dense, b_dense):
with np.errstate(invalid="ignore"):
# Unfortunately, trying to wrap the computation of each expected
# value is with np.errstate() is too tedious.
#
# sparse & sparse
self._check_bool_result(a == b)
self._assert((a == b).to_dense(), a_dense == b_dense)
self._check_bool_result(a != b)
self._assert((a != b).to_dense(), a_dense != b_dense)
self._check_bool_result(a >= b)
self._assert((a >= b).to_dense(), a_dense >= b_dense)
self._check_bool_result(a <= b)
self._assert((a <= b).to_dense(), a_dense <= b_dense)
self._check_bool_result(a > b)
self._assert((a > b).to_dense(), a_dense > b_dense)
self._check_bool_result(a < b)
self._assert((a < b).to_dense(), a_dense < b_dense)
# sparse & dense
self._check_bool_result(a == b_dense)
self._assert((a == b_dense).to_dense(), a_dense == b_dense)
self._check_bool_result(a != b_dense)
self._assert((a != b_dense).to_dense(), a_dense != b_dense)
self._check_bool_result(a >= b_dense)
self._assert((a >= b_dense).to_dense(), a_dense >= b_dense)
self._check_bool_result(a <= b_dense)
self._assert((a <= b_dense).to_dense(), a_dense <= b_dense)
self._check_bool_result(a > b_dense)
self._assert((a > b_dense).to_dense(), a_dense > b_dense)
self._check_bool_result(a < b_dense)
self._assert((a < b_dense).to_dense(), a_dense < b_dense)
def _check_logical_ops(self, a, b, a_dense, b_dense):
# sparse & sparse
self._check_bool_result(a & b)
self._assert((a & b).to_dense(), a_dense & b_dense)
self._check_bool_result(a | b)
self._assert((a | b).to_dense(), a_dense | b_dense)
# sparse & dense
self._check_bool_result(a & b_dense)
self._assert((a & b_dense).to_dense(), a_dense & b_dense)
self._check_bool_result(a | b_dense)
self._assert((a | b_dense).to_dense(), a_dense | b_dense)
@pytest.mark.parametrize("scalar", [0, 1, 3])
@pytest.mark.parametrize("fill_value", [None, 0, 2])
def test_float_scalar(
self, kind, mix, all_arithmetic_functions, fill_value, scalar
):
op = all_arithmetic_functions
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = self._klass(values, kind=kind, fill_value=fill_value)
self._check_numeric_ops(a, scalar, values, scalar, mix, op)
def test_float_scalar_comparison(self, kind):
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = self._klass(values, kind=kind)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
a = self._klass(values, kind=kind, fill_value=0)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
a = self._klass(values, kind=kind, fill_value=2)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
def test_float_same_index(self, kind, mix, all_arithmetic_functions):
# when sp_index are the same
op = all_arithmetic_functions
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_same_index_comparison(self, kind):
# when sp_index are the same
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
def test_float_array(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_array_different_kind(self, mix, all_arithmetic_functions):
op = all_arithmetic_functions
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind="integer")
b = self._klass(rvalues, kind="block")
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind="integer", fill_value=0)
b = self._klass(rvalues, kind="block")
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind="integer", fill_value=0)
b = self._klass(rvalues, kind="block", fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind="integer", fill_value=1)
b = self._klass(rvalues, kind="block", fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_array_comparison(self, kind):
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
def test_int_array(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
# have to specify dtype explicitly until fixing GH 667
dtype = np.int64
values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
a = self._klass(values, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, fill_value=0, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, fill_value=0, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, fill_value=1, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype, fill_value=1)
b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_int_array_comparison(self, kind):
dtype = "int64"
# int32 NI ATM
values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
a = self._klass(values, dtype=dtype, kind=kind)
b = self._klass(rvalues, dtype=dtype, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=0)
b = self._klass(rvalues, dtype=dtype, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=0)
b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
a = self._klass(values, dtype=dtype, kind=kind, fill_value=1)
b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
@pytest.mark.parametrize("fill_value", [True, False, np.nan])
def test_bool_same_index(self, kind, fill_value):
# GH 14000
# when sp_index are the same
values = self._base([True, False, True, True], dtype=np.bool)
rvalues = self._base([True, False, True, True], dtype=np.bool)
a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value)
b = self._klass(rvalues, kind=kind, dtype=np.bool, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
@pytest.mark.parametrize("fill_value", [True, False, np.nan])
def test_bool_array_logical(self, kind, fill_value):
# GH 14000
# when sp_index are the same
values = self._base([True, False, True, False, True, True], dtype=np.bool)
rvalues = self._base([True, False, False, True, False, True], dtype=np.bool)
a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value)
b = self._klass(rvalues, kind=kind, dtype=np.bool, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
rdtype = "int64"
values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
a = self._klass(values, kind=kind)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=0)
b = self._klass(rvalues, kind=kind, fill_value=0)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = self._klass(values, kind=kind, fill_value=1)
b = self._klass(rvalues, kind=kind, fill_value=2)
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_mixed_array_comparison(self, kind):
    """Comparisons between a float operand and an int64 operand.

    The same dense data is wrapped with several fill_value combinations and
    each pairing is handed to ``_check_comparison_ops``.
    """
    # int32 NI ATM
    int_dtype = "int64"

    float_vals = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
    int_vals = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=int_dtype)

    # default fill values on both sides
    lhs = self._klass(float_vals, kind=kind)
    rhs = self._klass(int_vals, kind=kind)
    assert rhs.dtype == SparseDtype(int_dtype)
    self._check_comparison_ops(lhs, rhs, float_vals, int_vals)
    self._check_comparison_ops(lhs, rhs * 0, float_vals, int_vals * 0)

    # explicit fill on the left only
    lhs = self._klass(float_vals, kind=kind, fill_value=0)
    rhs = self._klass(int_vals, kind=kind)
    assert rhs.dtype == SparseDtype(int_dtype)
    self._check_comparison_ops(lhs, rhs, float_vals, int_vals)

    # the same explicit fill on both sides
    lhs = self._klass(float_vals, kind=kind, fill_value=0)
    rhs = self._klass(int_vals, kind=kind, fill_value=0)
    assert rhs.dtype == SparseDtype(int_dtype)
    self._check_comparison_ops(lhs, rhs, float_vals, int_vals)

    # different explicit fills on each side
    lhs = self._klass(float_vals, kind=kind, fill_value=1)
    rhs = self._klass(int_vals, kind=kind, fill_value=2)
    assert rhs.dtype == SparseDtype(int_dtype, fill_value=2)
    self._check_comparison_ops(lhs, rhs, float_vals, int_vals)
class TestSparseSeriesArithmetic(TestSparseArrayArithmetics):
    """Arithmetic tests with ``pd.Series`` as dense base and ``pd.SparseSeries`` as sparse class."""

    _base = pd.Series
    _klass = pd.SparseSeries

    def _assert(self, a, b):
        """Compare two results as Series."""
        tm.assert_series_equal(a, b)

    def test_alignment(self, mix, all_arithmetic_functions):
        """Numeric ops between series whose indexes only partly, or never, overlap."""
        op = all_arithmetic_functions

        # First index overlaps the default 0..3 index on three labels, the
        # second one not at all.
        for other_index in ([1, 2, 3, 4], [10, 11, 12, 13]):
            dense_left = pd.Series(np.arange(4))
            dense_right = pd.Series(np.arange(4), index=other_index)

            for fill in (0, np.nan):
                sparse_left = pd.SparseSeries(
                    np.arange(4), dtype=np.int64, fill_value=fill
                )
                sparse_right = pd.SparseSeries(
                    np.arange(4), index=other_index, dtype=np.int64, fill_value=fill
                )
                self._check_numeric_ops(
                    sparse_left, sparse_right, dense_left, dense_right, mix, op
                )
@pytest.mark.parametrize("op", [operator.eq, operator.add])
def test_with_list(op):
    """A plain list operand gives the same result as an equivalent SparseArray operand."""
    sparse = pd.SparseArray([0, 1], fill_value=0)
    from_list = op(sparse, [0, 1])
    from_sparse = op(sparse, pd.SparseArray([0, 1]))
    tm.assert_sp_array_equal(from_list, from_sparse)
@pytest.mark.parametrize("ufunc", [np.abs, np.exp])
@pytest.mark.parametrize(
    "arr", [pd.SparseArray([0, 0, -1, 1]), pd.SparseArray([None, None, -1, 1])]
)
def test_ufuncs(ufunc, arr):
    """A unary ufunc transforms both the values and the fill value."""
    result = ufunc(arr)

    dense_result = ufunc(np.asarray(arr))
    new_fill = ufunc(arr.fill_value)
    expected = pd.SparseArray(dense_result, fill_value=new_fill)

    tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize(
    "a, b",
    [
        # The fill_value=1 case used to be listed four times verbatim; the
        # repeats exercised nothing new and are collapsed into one entry.
        (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])),
        (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
    ],
)
@pytest.mark.parametrize("ufunc", [np.add, np.greater])
def test_binary_ufuncs(ufunc, a, b):
    """A binary ufunc on (SparseArray, ndarray) returns a SparseArray with dense-equal values."""
    # can't say anything about fill value here.
    result = ufunc(a, b)
    expected = ufunc(np.asarray(a), np.asarray(b))
    assert isinstance(result, pd.SparseArray)
    tm.assert_numpy_array_equal(np.asarray(result), expected)
def test_ndarray_inplace():
    """``ndarray += SparseArray`` leaves an ndarray holding the element-wise sum."""
    sparse = pd.SparseArray([0, 2, 0, 0])
    dense = np.array([0, 1, 2, 3])

    dense += sparse

    tm.assert_numpy_array_equal(dense, np.array([0, 3, 2, 3]))
def test_sparray_inplace():
    """``SparseArray += ndarray`` leaves a SparseArray holding the element-wise sum."""
    sparse = pd.SparseArray([0, 2, 0, 0])
    dense = np.array([0, 1, 2, 3])

    sparse += dense

    tm.assert_sp_array_equal(sparse, pd.SparseArray([0, 3, 2, 3], fill_value=0))
@pytest.mark.parametrize("fill_value", [True, False])
def test_invert(fill_value):
    """``~`` flips both the stored values and the fill value of a boolean SparseArray."""
    dense = np.array([True, False, False, True])
    inverted = ~pd.SparseArray(dense, fill_value=fill_value)
    expected = pd.SparseArray(~dense, fill_value=not fill_value)
    tm.assert_sp_array_equal(inverted, expected)
@pytest.mark.parametrize("fill_value", [0, np.nan])
@pytest.mark.parametrize("op", [operator.pos, operator.neg])
def test_unary_op(op, fill_value):
    """Unary ``+`` / ``-`` apply to both the values and the fill value."""
    dense = np.array([0, 1, np.nan, 2])
    sparse = pd.SparseArray(dense, fill_value=fill_value)

    result = op(sparse)

    expected = pd.SparseArray(op(dense), fill_value=op(fill_value))
    tm.assert_sp_array_equal(result, expected)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,183 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.sparse.api import SparseDtype
@pytest.mark.parametrize(
    "dtype, fill_value",
    [
        ("int", 0),
        ("float", np.nan),
        ("bool", False),
        ("object", np.nan),
        ("datetime64[ns]", pd.NaT),
        ("timedelta64[ns]", pd.NaT),
    ],
)
def test_inferred_dtype(dtype, fill_value):
    """SparseDtype built without a fill value exposes the expected default one."""
    inferred = SparseDtype(dtype).fill_value

    if not pd.isna(fill_value):
        assert inferred == fill_value
        return

    # For missing-value defaults the exact type matters too (the table
    # above distinguishes np.nan from pd.NaT).
    assert pd.isna(inferred)
    assert type(inferred) is type(fill_value)
def test_from_sparse_dtype():
    """Wrapping an existing SparseDtype keeps its fill value."""
    original = SparseDtype("float", 0)
    rewrapped = SparseDtype(original)
    assert rewrapped.fill_value == 0
def test_from_sparse_dtype_fill_value():
    """An explicit fill_value overrides the one carried by the wrapped SparseDtype."""
    original = SparseDtype("int", 1)
    overridden = SparseDtype(original, fill_value=2)
    assert overridden == SparseDtype("int", 2)
@pytest.mark.parametrize(
    "dtype, fill_value",
    [
        ("int", None),
        ("float", None),
        ("bool", None),
        ("object", None),
        ("datetime64[ns]", None),
        ("timedelta64[ns]", None),
        ("int", np.nan),
        ("float", 0),
    ],
)
def test_equal(dtype, fill_value):
    """Two SparseDtypes built from the same arguments compare equal in both directions."""
    first = SparseDtype(dtype, fill_value)
    second = SparseDtype(dtype, fill_value)
    assert first == second
    assert second == first
def test_nans_equal():
    """A ``float('nan')`` fill and an ``np.nan`` fill produce equal dtypes."""
    python_nan = SparseDtype(float, float("nan"))
    numpy_nan = SparseDtype(float, np.nan)
    assert python_nan == numpy_nan
    assert numpy_nan == python_nan
@pytest.mark.parametrize(
    "a, b",
    [
        (SparseDtype("float64"), SparseDtype("float32")),
        (SparseDtype("float64"), SparseDtype("float64", 0)),
        (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
        (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
        (SparseDtype("float64"), np.dtype("float64")),
    ],
)
def test_not_equal(a, b):
    """Each listed pair of dtypes must compare unequal."""
    assert a != b
def test_construct_from_string_raises():
    """A string that does not name a dtype is rejected with TypeError."""
    bad_spec = "not a dtype"
    with pytest.raises(TypeError):
        SparseDtype.construct_from_string(bad_spec)
@pytest.mark.parametrize(
    "dtype, expected",
    [
        (SparseDtype(int), True),
        (SparseDtype(float), True),
        (SparseDtype(bool), True),
        (SparseDtype(object), False),
        (SparseDtype(str), False),
    ],
)
def test_is_numeric(dtype, expected):
    """``_is_numeric`` is exactly True or False according to the table above."""
    numeric_flag = dtype._is_numeric
    assert numeric_flag is expected
def test_str_uses_object():
    """``SparseDtype(str)`` has an object subtype."""
    subtype = SparseDtype(str).subtype
    assert subtype == np.dtype("object")
@pytest.mark.parametrize(
    "string, expected",
    [
        ("Sparse[float64]", SparseDtype(np.dtype("float64"))),
        ("Sparse[float32]", SparseDtype(np.dtype("float32"))),
        ("Sparse[int]", SparseDtype(np.dtype("int"))),
        ("Sparse[str]", SparseDtype(np.dtype("str"))),
        ("Sparse[datetime64[ns]]", SparseDtype(np.dtype("datetime64[ns]"))),
        ("Sparse", SparseDtype(np.dtype("float"), np.nan)),
    ],
)
def test_construct_from_string(string, expected):
    """``Sparse[...]`` strings are parsed into the equivalent SparseDtype."""
    parsed = SparseDtype.construct_from_string(string)
    assert parsed == expected
@pytest.mark.parametrize(
    "a, b, expected",
    [
        (SparseDtype(float, 0.0), SparseDtype(np.dtype("float"), 0.0), True),
        (SparseDtype(int, 0), SparseDtype(int, 0), True),
        (SparseDtype(float, float("nan")), SparseDtype(float, np.nan), True),
        (SparseDtype(float, 0), SparseDtype(float, np.nan), False),
        (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False),
    ],
)
def test_hash_equal(a, b, expected):
    """Equality and hash-equality of the pair must both match ``expected``."""
    dtypes_equal = a == b
    assert dtypes_equal is expected

    hashes_equal = hash(a) == hash(b)
    assert hashes_equal is expected
@pytest.mark.parametrize(
    "string, expected",
    [
        ("Sparse[int]", "int"),
        ("Sparse[int, 0]", "int"),
        ("Sparse[int64]", "int64"),
        ("Sparse[int64, 0]", "int64"),
        ("Sparse[datetime64[ns], 0]", "datetime64[ns]"),
    ],
)
def test_parse_subtype(string, expected):
    """``_parse_subtype`` returns the subtype name as the first element of its result."""
    parsed = SparseDtype._parse_subtype(string)
    subtype, _ = parsed
    assert subtype == expected
@pytest.mark.parametrize(
    "string", ["Sparse[int, 1]", "Sparse[float, 0.0]", "Sparse[bool, True]"]
)
def test_construct_from_string_fill_value_raises(string):
    """Each listed spec, which embeds a fill value, is rejected with TypeError."""
    expected_msg = "fill_value in the string is not"
    with pytest.raises(TypeError, match=expected_msg):
        SparseDtype.construct_from_string(string)
@pytest.mark.parametrize(
    "original, dtype, expected",
    [
        (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
        (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
        (SparseDtype(int, 1), str, SparseDtype(object, "1")),
        (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
    ],
)
def test_update_dtype(original, dtype, expected):
    """``update_dtype`` yields the dtype listed as ``expected`` for each row."""
    updated = original.update_dtype(dtype)
    assert updated == expected
@pytest.mark.parametrize(
    "original, dtype",
    [(SparseDtype(float, np.nan), int), (SparseDtype(str, "abc"), int)],
)
def test_update_dtype_raises(original, dtype):
    """``update_dtype`` raises ValueError for each listed (original, target) pair."""
    with pytest.raises(ValueError):
        original.update_dtype(dtype)

View File

@@ -0,0 +1,601 @@
import operator
import numpy as np
import pytest
import pandas._libs.sparse as splib
import pandas.util._test_decorators as td
from pandas import Series
from pandas.core.arrays.sparse import BlockIndex, IntIndex, _make_index
import pandas.util.testing as tm
# Length passed to every BlockIndex built by the shared cases below.
TEST_LENGTH = 20

# Each case gives block locations/lengths for two indexes (x and y) and the
# locations/lengths of the blocks expected from intersecting them.
plain_case = {
    "xloc": [0, 7, 15],
    "xlen": [3, 5, 5],
    "yloc": [2, 9, 14],
    "ylen": [2, 3, 5],
    "intersect_loc": [2, 9, 15],
    "intersect_len": [1, 3, 4],
}
delete_blocks = {
    "xloc": [0, 5],
    "xlen": [4, 4],
    "yloc": [1],
    "ylen": [4],
    "intersect_loc": [1],
    "intersect_len": [3],
}
split_blocks = {
    "xloc": [0],
    "xlen": [10],
    "yloc": [0, 5],
    "ylen": [3, 7],
    "intersect_loc": [0, 5],
    "intersect_len": [3, 5],
}
skip_block = {
    "xloc": [10],
    "xlen": [5],
    "yloc": [0, 12],
    "ylen": [5, 3],
    "intersect_loc": [12],
    "intersect_len": [3],
}
no_intersect = {
    "xloc": [0, 10],
    "xlen": [4, 6],
    "yloc": [5, 17],
    "ylen": [4, 2],
    "intersect_loc": [],
    "intersect_len": [],
}
def check_cases(_check_case):
    """Run ``_check_case`` over the shared intersect cases plus two empty ones.

    ``_check_case`` is called positionally as
    ``(xloc, xlen, yloc, ylen, intersect_loc, intersect_len)``.
    """
    field_order = ("xloc", "xlen", "yloc", "ylen", "intersect_loc", "intersect_len")

    for case in (plain_case, delete_blocks, split_blocks, skip_block, no_intersect):
        _check_case(*(case[field] for field in field_order))

    # one or both is empty
    _check_case([0], [5], [], [], [], [])
    _check_case([], [], [], [], [], [])
class TestSparseIndexUnion:
    """Tests for ``make_union`` on BlockIndex and IntIndex."""

    def test_index_make_union(self):
        """Union of block indexes, cross-checked against the int-index union."""

        def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
            block_x = BlockIndex(TEST_LENGTH, xloc, xlen)
            block_y = BlockIndex(TEST_LENGTH, yloc, ylen)

            block_union = block_x.make_union(block_y)
            assert isinstance(block_union, BlockIndex)
            tm.assert_numpy_array_equal(
                block_union.blocs, np.array(eloc, dtype=np.int32)
            )
            tm.assert_numpy_array_equal(
                block_union.blengths, np.array(elen, dtype=np.int32)
            )

            int_x = block_x.to_int_index()
            int_y = block_y.to_int_index()
            int_union = int_x.make_union(int_y)
            assert isinstance(int_union, IntIndex)
            tm.assert_numpy_array_equal(
                int_union.indices, block_union.to_int_index().indices
            )

        # Each row: (xloc, xlen, yloc, ylen, expected loc, expected len)
        cases = [
            # y starts exactly where x ends: one merged block
            ([0], [5], [5], [4], [0], [9]),
            # first y block overlaps the first x block, second y block is apart
            ([0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]),
            # single x and y blocks that partially overlap
            ([1], [5], [3], [5], [1], [7]),
            # one y block bridges both x blocks
            ([2, 10], [4, 4], [4], [8], [2], [12]),
            # y starts with the first x block and reaches into the second
            ([0, 5], [3, 5], [0], [7], [0], [10]),
            # x and y blocks chain together into one long block
            ([2, 10], [4, 4], [4, 13], [8, 4], [2], [15]),
            # every y block lies inside the single x block
            ([2], [15], [4, 9, 14], [3, 2, 2], [2], [15]),
            # x and y blocks interleave without touching
            ([0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]),
        ]
        for case in cases:
            _check_case(*case)

    def test_int_index_make_union(self):
        """Union of int indexes, including empty operands and a length mismatch."""

        def _int_index(length, positions):
            return IntIndex(length, np.array(positions, dtype=np.int32))

        # Each row: (left indices, right indices, expected union), all length 5
        cases = [
            ([0, 3, 4], [0, 2], [0, 2, 3, 4]),
            ([], [0, 2], [0, 2]),
            ([], [], []),
            ([0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]),
        ]
        for left, right, union in cases:
            a = _int_index(5, left)
            b = _int_index(5, right)
            res = a.make_union(b)
            assert res.equals(_int_index(5, union))

        # operands referring to different underlying lengths are rejected
        a = _int_index(5, [0, 1])
        b = _int_index(4, [0, 1])
        msg = "Indices must reference same underlying length"
        with pytest.raises(ValueError, match=msg):
            a.make_union(b)
class TestSparseIndexIntersect:
    """Tests for ``intersect`` on BlockIndex and IntIndex."""

    @td.skip_if_windows
    def test_intersect(self):
        """Run the shared cases through both index kinds, plus a length-mismatch check."""
        length_msg = "Indices must reference same underlying length"

        def _assert_intersection(left, right, expected):
            assert left.intersect(right).equals(expected)

        def _assert_length_mismatch(left, longer):
            with pytest.raises(Exception, match=length_msg):
                left.intersect(longer)

        def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
            block_x = BlockIndex(TEST_LENGTH, xloc, xlen)
            block_y = BlockIndex(TEST_LENGTH, yloc, ylen)
            block_expected = BlockIndex(TEST_LENGTH, eloc, elen)
            # same blocks as y, but declared over a longer underlying length
            block_longer = BlockIndex(TEST_LENGTH + 1, yloc, ylen)

            _assert_intersection(block_x, block_y, block_expected)
            _assert_intersection(
                block_x.to_int_index(),
                block_y.to_int_index(),
                block_expected.to_int_index(),
            )

            _assert_length_mismatch(block_x, block_longer)
            _assert_length_mismatch(
                block_x.to_int_index(), block_longer.to_int_index()
            )

        check_cases(_check_case)

    def test_intersect_empty(self):
        """Intersecting with an empty index gives the empty index, in either order."""
        empty = IntIndex(4, np.array([], dtype=np.int32))
        filled = IntIndex(4, np.array([2, 3], dtype=np.int32))
        assert empty.intersect(filled).equals(empty)
        assert filled.intersect(empty).equals(empty)

        empty = empty.to_block_index()
        filled = filled.to_block_index()
        assert empty.intersect(filled).equals(empty)
        assert filled.intersect(empty).equals(empty)

    def test_intersect_identical(self):
        """An index intersected with itself equals itself, for both index kinds."""
        int_indexes = [
            IntIndex(5, np.array([1, 2], dtype=np.int32)),
            IntIndex(5, np.array([0, 2, 4], dtype=np.int32)),
            IntIndex(0, np.array([], dtype=np.int32)),
            IntIndex(5, np.array([], dtype=np.int32)),
        ]

        for int_index in int_indexes:
            assert int_index.intersect(int_index).equals(int_index)
            block_index = int_index.to_block_index()
            assert block_index.intersect(block_index).equals(block_index)
class TestSparseIndexCommon:
    """Tests of behaviour shared by the "integer" and "block" index kinds."""

    def test_int_internal(self):
        """``_make_index(kind="integer")`` builds an IntIndex with the given indices."""
        idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 2
        tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))

        idx = _make_index(4, np.array([], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 0
        tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))

        idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 4
        tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))

    def test_block_internal(self):
        """``_make_index(kind="block")`` groups consecutive indices into blocks."""
        idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 2
        tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))

        idx = _make_index(4, np.array([], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 0
        tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))

        idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 4
        tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))

        idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 3
        tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))

    def test_lookup(self):
        """Scalar ``lookup`` gives the sparse position, or -1 when absent or out of range."""
        for kind in ["integer", "block"]:
            idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
            assert idx.lookup(-1) == -1
            assert idx.lookup(0) == -1
            assert idx.lookup(1) == -1
            assert idx.lookup(2) == 0
            assert idx.lookup(3) == 1
            assert idx.lookup(4) == -1

            idx = _make_index(4, np.array([], dtype=np.int32), kind=kind)
            for i in range(-1, 5):
                assert idx.lookup(i) == -1

            idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
            assert idx.lookup(-1) == -1
            assert idx.lookup(0) == 0
            assert idx.lookup(1) == 1
            assert idx.lookup(2) == 2
            assert idx.lookup(3) == 3
            assert idx.lookup(4) == -1

            idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
            assert idx.lookup(-1) == -1
            assert idx.lookup(0) == 0
            assert idx.lookup(1) == -1
            assert idx.lookup(2) == 1
            assert idx.lookup(3) == 2
            assert idx.lookup(4) == -1

    def test_lookup_array(self):
        """Vectorised ``lookup_array`` mirrors scalar ``lookup`` for both index kinds."""
        for kind in ["integer", "block"]:
            idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind)

            res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
            exp = np.array([-1, -1, 0], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
            exp = np.array([-1, 0, -1, 1], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            idx = _make_index(4, np.array([], dtype=np.int32), kind=kind)
            res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32))
            exp = np.array([-1, -1, -1, -1], dtype=np.int32)
            # This comparison was missing: the empty-index result was computed
            # but never checked.
            tm.assert_numpy_array_equal(res, exp)

            idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
            res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
            exp = np.array([-1, 0, 2], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
            exp = np.array([-1, 2, 1, 3], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
            res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32))
            exp = np.array([1, -1, 2, 0], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

            res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32))
            exp = np.array([-1, -1, 1, -1], dtype=np.int32)
            tm.assert_numpy_array_equal(res, exp)

    def test_lookup_basics(self):
        """``lookup`` agrees between a two-block BlockIndex and its IntIndex form."""

        def _check(index):
            assert index.lookup(0) == -1
            assert index.lookup(5) == 0
            assert index.lookup(7) == 2
            assert index.lookup(8) == -1
            assert index.lookup(9) == -1
            assert index.lookup(10) == -1
            assert index.lookup(11) == -1
            assert index.lookup(12) == 3
            assert index.lookup(17) == 8
            assert index.lookup(18) == -1

        bindex = BlockIndex(20, [5, 12], [3, 6])
        iindex = bindex.to_int_index()

        _check(bindex)
        _check(iindex)

        # TODO: corner cases (the original ends on this bare marker; nothing
        # further is checked yet)
class TestBlockIndex:
    """Tests specific to BlockIndex."""

    def test_block_internal(self):
        """``_make_index(kind="block")`` groups consecutive indices into blocks."""
        idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 2
        tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))

        idx = _make_index(4, np.array([], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 0
        tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))

        idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 4
        tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))

        idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
        assert isinstance(idx, BlockIndex)
        assert idx.npoints == 3
        tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
        tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))

    def test_make_block_boundary(self):
        """Every second position gives one length-1 block per index."""
        for i in [5, 10, 100, 101]:
            idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block")

            exp = np.arange(0, i, 2, dtype=np.int32)
            tm.assert_numpy_array_equal(idx.blocs, exp)
            tm.assert_numpy_array_equal(idx.blengths, np.ones(len(exp), dtype=np.int32))

    def test_equals(self):
        """``equals`` is true for the same index and false when a block length differs."""
        index = BlockIndex(10, [0, 4], [2, 5])

        assert index.equals(index)
        assert not index.equals(BlockIndex(10, [0, 4], [2, 6]))

    def test_check_integrity(self):
        """Construction accepts empty block lists and rejects invalid blocks."""
        locs = []
        lengths = []

        # Construction not raising is the whole check here, so the results
        # are deliberately not bound to a name (this replaces the unused,
        # noqa-silenced ``index`` locals).
        # 0-length OK
        BlockIndex(0, locs, lengths)

        # also OK even though empty
        BlockIndex(1, locs, lengths)

        msg = "Block 0 extends beyond end"
        with pytest.raises(ValueError, match=msg):
            BlockIndex(10, [5], [10])

        msg = "Block 0 overlaps"
        with pytest.raises(ValueError, match=msg):
            BlockIndex(10, [2, 5], [5, 3])

    def test_to_int_index(self):
        """``to_int_index`` expands each block into its individual positions."""
        locs = [0, 10]
        lengths = [4, 6]
        exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15]

        block = BlockIndex(20, locs, lengths)
        dense = block.to_int_index()

        tm.assert_numpy_array_equal(dense.indices, np.array(exp_inds, dtype=np.int32))

    def test_to_block_index(self):
        """``to_block_index`` on a BlockIndex returns the same object."""
        index = BlockIndex(10, [0, 5], [4, 5])
        assert index.to_block_index() is index
class TestIntIndex:
    """Tests specific to IntIndex."""

    def test_check_integrity(self):
        """Construction rejects indices that are invalid for the declared length."""
        # More indices than the declared length allows.
        msg = "Too many indices"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=1, indices=[1, 2, 3])

        # No index can be negative.
        # (This check used to appear twice, verbatim; the repeat is dropped.)
        msg = "No index can be less than zero"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, -2, 3])

        # All indices must be less than the length.
        msg = "All indices must be less than the length"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 2, 5])

        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 2, 6])

        # Indices must be strictly ascending.
        msg = "Indices must be strictly increasing"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 3, 2])

        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 3, 3])

    def test_int_internal(self):
        """``_make_index(kind="integer")`` builds an IntIndex with the given indices."""
        idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 2
        tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))

        idx = _make_index(4, np.array([], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 0
        tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))

        idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 4
        tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))

    def test_equals(self):
        """``equals`` is true for the same index and false when the indices differ."""
        index = IntIndex(10, [0, 1, 2, 3, 4])
        assert index.equals(index)
        assert not index.equals(IntIndex(10, [0, 1, 2, 3]))

    def test_to_block_index(self):
        """Block -> int -> block round trip gives back an equal BlockIndex."""

        def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
            xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
            yindex = BlockIndex(TEST_LENGTH, yloc, ylen)

            # see if survive the round trip
            xbindex = xindex.to_int_index().to_block_index()
            ybindex = yindex.to_int_index().to_block_index()
            assert isinstance(xbindex, BlockIndex)
            assert xbindex.equals(xindex)
            assert ybindex.equals(yindex)

        check_cases(_check_case)

    def test_to_int_index(self):
        """``to_int_index`` on an IntIndex returns the same object."""
        index = IntIndex(10, [2, 3, 4, 5, 6])
        assert index.to_int_index() is index
class TestSparseOperators:
    """Checks the ``sparse_*_float64`` kernels from ``splib`` against Series arithmetic."""

    def _op_tests(self, sparse_op, python_op):
        """Run ``sparse_op`` over the shared cases and compare with ``python_op`` on Series."""

        def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
            block_x = BlockIndex(TEST_LENGTH, xloc, xlen)
            block_y = BlockIndex(TEST_LENGTH, yloc, ylen)
            int_x = block_x.to_int_index()
            int_y = block_y.to_int_index()

            x = np.arange(block_x.npoints) * 10.0 + 1
            y = np.arange(block_y.npoints) * 100.0 + 1
            xfill, yfill = 0, 2

            # same operation, once per index representation
            block_vals, block_index, block_fill = sparse_op(
                x, block_x, xfill, y, block_y, yfill
            )
            int_vals, int_index, int_fill = sparse_op(
                x, int_x, xfill, y, int_y, yfill
            )

            assert block_index.to_int_index().equals(int_index)
            tm.assert_numpy_array_equal(block_vals, int_vals)
            assert block_fill == int_fill

            # check versus Series...
            dense_x = Series(x, int_x.indices)
            dense_x = dense_x.reindex(np.arange(TEST_LENGTH)).fillna(xfill)
            dense_y = Series(y, int_y.indices)
            dense_y = dense_y.reindex(np.arange(TEST_LENGTH)).fillna(yfill)

            dense_result = python_op(dense_x, dense_y).reindex(int_index.indices)

            tm.assert_numpy_array_equal(block_vals, dense_result.values)
            tm.assert_numpy_array_equal(int_vals, dense_result.values)

        check_cases(_check_case)

    @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"])
    def test_op(self, opname):
        """Pair each sparse kernel with the ``operator`` function of the same name."""
        kernel_name = "sparse_{opname}_float64".format(opname=opname)
        sparse_op = getattr(splib, kernel_name)
        python_op = getattr(operator, opname)
        self._op_tests(sparse_op, python_op)

View File

@@ -0,0 +1,360 @@
import datetime
import decimal
import numpy as np
import pytest
import pytz
from pandas.core.dtypes.dtypes import registry
import pandas as pd
from pandas.api.extensions import register_extension_dtype
from pandas.api.types import is_scalar
from pandas.core.arrays import PandasArray, integer_array, period_array
from pandas.tests.extension.decimal import DecimalArray, DecimalDtype, to_decimal
import pandas.util.testing as tm
@pytest.mark.parametrize(
    "data, dtype, expected",
    [
        # Basic NumPy defaults.
        ([1, 2], None, PandasArray(np.array([1, 2]))),
        ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))),
        (
            [1, 2],
            np.dtype("float32"),
            PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
        ),
        (np.array([1, 2]), None, PandasArray(np.array([1, 2]))),
        # String alias passes through to NumPy
        ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))),
        # Period alias
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            "Period[D]",
            period_array(["2000", "2001"], freq="D"),
        ),
        # Period dtype
        (
            [pd.Period("2000", "D")],
            pd.PeriodDtype("D"),
            period_array(["2000"], freq="D"),
        ),
        # Datetime (naive)
        (
            [1, 2],
            np.dtype("datetime64[ns]"),
            pd.arrays.DatetimeArray._from_sequence(
                np.array([1, 2], dtype="datetime64[ns]")
            ),
        ),
        (
            np.array([1, 2], dtype="datetime64[ns]"),
            None,
            pd.arrays.DatetimeArray._from_sequence(
                np.array([1, 2], dtype="datetime64[ns]")
            ),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            np.dtype("datetime64[ns]"),
            pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            None,
            pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        (
            ["2000", "2001"],
            np.dtype("datetime64[ns]"),
            pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        # Datetime (tz-aware)
        (
            ["2000", "2001"],
            pd.DatetimeTZDtype(tz="CET"),
            pd.arrays.DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        # Timedelta
        (
            ["1H", "2H"],
            np.dtype("timedelta64[ns]"),
            pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]),
        ),
        (
            pd.TimedeltaIndex(["1H", "2H"]),
            np.dtype("timedelta64[ns]"),
            pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]),
        ),
        (
            pd.TimedeltaIndex(["1H", "2H"]),
            None,
            pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]),
        ),
        # Category
        (["a", "b"], "category", pd.Categorical(["a", "b"])),
        (
            ["a", "b"],
            pd.CategoricalDtype(None, ordered=True),
            pd.Categorical(["a", "b"], ordered=True),
        ),
        # Interval
        (
            [pd.Interval(1, 2), pd.Interval(3, 4)],
            "interval",
            pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)]),
        ),
        # Sparse
        ([0, 1], "Sparse[int64]", pd.SparseArray([0, 1], dtype="int64")),
        # IntegerNA
        ([1, None], "Int16", integer_array([1, None], dtype="Int16")),
        (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
        # Index
        (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
        # Series[EA] returns the EA
        (
            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
            None,
            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        ),
        # "3rd party" EAs work
        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
        # pass an ExtensionArray, but a different dtype
        (
            period_array(["2000", "2001"], freq="D"),
            "category",
            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
        ),
    ],
)
def test_array(data, dtype, expected):
    """``pd.array(data, dtype=dtype)`` builds the array listed as ``expected``."""
    built = pd.array(data, dtype=dtype)
    tm.assert_equal(built, expected)
def test_array_copy():
    """``pd.array`` copies its ndarray input unless ``copy=False`` is passed."""
    source = np.array([1, 2])

    # default is to copy
    default_result = pd.array(source)
    assert np.shares_memory(source, default_result._ndarray) is False

    # copy=True
    copied = pd.array(source, copy=True)
    assert np.shares_memory(source, copied._ndarray) is False

    # copy=False
    shared = pd.array(source, copy=False)
    assert np.shares_memory(source, shared._ndarray) is True
# pytz timezone object used by the tz-aware datetime cases below
cet = pytz.timezone("CET")


@pytest.mark.parametrize(
    "data, expected",
    [
        # period
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            period_array(["2000", "2001"], freq="D"),
        ),
        # interval
        (
            [pd.Interval(0, 1), pd.Interval(1, 2)],
            pd.arrays.IntervalArray.from_breaks([0, 1, 2]),
        ),
        # datetime
        (
            [pd.Timestamp("2000"), pd.Timestamp("2001")],
            pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        (
            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
            pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]),
        ),
        (
            np.array([1, 2], dtype="M8[ns]"),
            pd.arrays.DatetimeArray(np.array([1, 2], dtype="M8[ns]")),
        ),
        (
            np.array([1, 2], dtype="M8[us]"),
            pd.arrays.DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")),
        ),
        # datetimetz
        (
            [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
            pd.arrays.DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        (
            [
                datetime.datetime(2000, 1, 1, tzinfo=cet),
                datetime.datetime(2001, 1, 1, tzinfo=cet),
            ],
            pd.arrays.DatetimeArray._from_sequence(["2000", "2001"], tz=cet),
        ),
        # timedelta
        (
            [pd.Timedelta("1H"), pd.Timedelta("2H")],
            pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]),
        ),
        (
            np.array([1, 2], dtype="m8[ns]"),
            pd.arrays.TimedeltaArray(np.array([1, 2], dtype="m8[ns]")),
        ),
        (
            np.array([1, 2], dtype="m8[us]"),
            pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")),
        ),
    ],
)
def test_array_inference(data, expected):
    """Without a dtype, ``pd.array`` infers the array type listed as ``expected``."""
    inferred = pd.array(data)
    tm.assert_equal(inferred, expected)
@pytest.mark.parametrize(
    "data",
    [
        # mix of frequencies
        [pd.Period("2000", "D"), pd.Period("2001", "A")],
        # mix of closed
        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
        # Mix of timezones
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
        # Mix of tz-aware and tz-naive
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
    ],
)
def test_array_inference_fails(data):
    """Inconsistent inputs come back as an object-dtype PandasArray."""
    object_fallback = PandasArray(np.array(data, dtype=object))
    tm.assert_extension_array_equal(pd.array(data), object_fallback)
@pytest.mark.parametrize("data", [np.array([[1, 2], [3, 4]]), [[1, 2], [3, 4]]])
def test_nd_raises(data):
    """Two-dimensional input is rejected with ValueError."""
    expected_msg = "PandasArray must be 1-dimensional"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(data)
def test_scalar_raises():
    """A scalar argument is rejected with ValueError."""
    expected_msg = "Cannot pass scalar '1'"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(1)
# ---------------------------------------------------------------------------
# A couple dummy classes to ensure that Series and Indexes are unboxed before
# getting to the EA classes.
@register_extension_dtype
class DecimalDtype2(DecimalDtype):
    """Registered dummy dtype named "decimal2" whose array type is DecimalArray2."""

    name = "decimal2"

    @classmethod
    def construct_array_type(cls):
        """Return the array class paired with this dtype."""
        array_type = DecimalArray2
        return array_type
class DecimalArray2(DecimalArray):
    """Dummy array whose ``_from_sequence`` refuses Series / Index input."""

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Build from unboxed scalars; raise TypeError for a Series or Index."""
        is_boxed = isinstance(scalars, (pd.Series, pd.Index))
        if not is_boxed:
            return super()._from_sequence(scalars, dtype=dtype, copy=copy)
        raise TypeError
@pytest.mark.parametrize("box", [pd.Series, pd.Index])
def test_array_unboxes(box):
    """pd.array must hand the unboxed values to the array constructor."""
    boxed = box([decimal.Decimal("1"), decimal.Decimal("2")])

    # sanity check: the dummy array class rejects boxed input outright
    with pytest.raises(TypeError):
        DecimalArray2._from_sequence(boxed)

    # pd.array succeeding therefore means it unboxed before constructing
    unboxed = pd.array(boxed, dtype="decimal2")
    from_values = DecimalArray2._from_sequence(boxed.values)
    tm.assert_equal(unboxed, from_values)
@pytest.fixture
def registry_without_decimal():
    """
    Temporarily remove DecimalDtype from the extension dtype registry.

    The dtype is taken out of ``registry.dtypes`` before the test runs and
    put back afterwards at the position it originally occupied, so the
    registry ordering seen by later tests is unchanged.
    """
    position = registry.dtypes.index(DecimalDtype)
    registry.dtypes.pop(position)
    yield
    # Re-insert at the original index rather than appending, so that the
    # order of registered dtypes is restored exactly.
    registry.dtypes.insert(position, DecimalDtype)
def test_array_not_registered(registry_without_decimal):
    """pd.array accepts a dtype class even when it is absent from the registry."""
    # check we aren't on it
    assert registry.find("decimal") is None

    values = [decimal.Decimal("1"), decimal.Decimal("2")]
    built = pd.array(values, dtype=DecimalDtype)
    tm.assert_equal(built, DecimalArray._from_sequence(values))
class TestArrayAnalytics:
    """searchsorted checks for arrays built through ``pd.array``."""

    def test_searchsorted(self, string_dtype):
        letters = pd.array(["a", "b", "c"], dtype=string_dtype)

        # (side, expected insertion position for "a")
        for side, position in (("left", 0), ("right", 1)):
            found = letters.searchsorted("a", side=side)
            assert is_scalar(found)
            assert found == position

    def test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype):
        numbers = pd.array([1, 3, 90], dtype=any_real_dtype)

        position = numbers.searchsorted(30)
        assert is_scalar(position)
        assert position == 2

        # a one-element list is answered with an array, not a scalar
        positions = numbers.searchsorted([30])
        wanted = np.array([2], dtype=np.intp)
        tm.assert_numpy_array_equal(positions, wanted)

    def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype):
        numbers = pd.array([1, 3, 90], dtype=any_real_dtype)

        positions = numbers.searchsorted([2, 30])
        wanted = np.array([1, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(positions, wanted)

    @pytest.mark.parametrize(
        "arr, val",
        [
            [
                pd.date_range("20120101", periods=10, freq="2D"),
                pd.Timestamp("20120102"),
            ],
            [
                pd.date_range("20120101", periods=10, freq="2D", tz="Asia/Hong_Kong"),
                pd.Timestamp("20120102", tz="Asia/Hong_Kong"),
            ],
            [
                pd.timedelta_range(start="1 day", end="10 days", periods=10),
                pd.Timedelta("2 days"),
            ],
        ],
    )
    def test_search_sorted_datetime64_scalar(self, arr, val):
        values = pd.array(arr)

        position = values.searchsorted(val)
        assert is_scalar(position)
        assert position == 1

    def test_searchsorted_sorter(self, any_real_dtype):
        unsorted = pd.array([3, 1, 2], dtype=any_real_dtype)
        order = np.argsort(unsorted)

        positions = unsorted.searchsorted([0, 3], sorter=order)
        wanted = np.array([0, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(positions, wanted)

View File

@@ -0,0 +1,701 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
import pandas.util.testing as tm
# TODO: more freq variants
@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"])
def period_index(request):
    """
    Fixture yielding a 100-element PeriodIndex for each parametrized frequency.

    PeriodArray behavior is mostly covered by the PeriodIndex tests, so the
    tests using this fixture only check that the two agree.
    """
    # TODO: non-monotone indexes; NaTs, different start dates
    return pd.period_range(
        start=pd.Timestamp("2000-01-01"), periods=100, freq=request.param
    )
@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"])
def datetime_index(request):
    """
    Fixture yielding a 100-element DatetimeIndex for each parametrized frequency.

    DatetimeArray behavior is mostly covered by the DatetimeIndex tests, so
    the tests using this fixture only check that the two agree.
    """
    # TODO: non-monotone indexes; NaTs, different start dates, timezones
    return pd.date_range(
        start=pd.Timestamp("2000-01-01"), periods=100, freq=request.param
    )
@pytest.fixture
def timedelta_index(request):
    """
    Fixture providing a small TimedeltaIndex that includes a NaT entry.

    TimedeltaArray behavior is mostly covered by the TimedeltaIndex tests, so
    the tests using this fixture only check that the two agree.
    """
    # TODO: flesh this out
    entries = ["1 Day", "3 Hours", "NaT"]
    return pd.TimedeltaIndex(entries)
class SharedTests:
    """
    Tests shared by the datetime-like array test classes.

    Subclasses set ``index_cls`` and ``array_cls``; every test here builds
    its objects through those two attributes.
    """

    index_cls = None

    @staticmethod
    def _daily_i8(count):
        # fresh int64 array holding the first `count` multiples of
        # 24 * 3600 * 10 ** 9
        return np.arange(count, dtype="i8") * 24 * 3600 * 10 ** 9

    def test_compare_len1_raises(self):
        # Comparing against an operand of a different length must raise, in
        # particular for a length-1 operand, which numpy would broadcast.
        index = self.index_cls._simple_new(self._daily_i8(10), freq="D")
        array = self.array_cls(index)

        with pytest.raises(ValueError, match="Lengths must match"):
            array == array[:1]

        # test the index classes while we're at it, GH#23078
        with pytest.raises(ValueError, match="Lengths must match"):
            index <= index[[0]]

    def test_take(self):
        values = self._daily_i8(100)
        np.random.shuffle(values)

        index = self.index_cls._simple_new(values, freq="D")
        array = self.array_cls(index)

        # positions given first as a list, then as an ndarray
        for takers in ([1, 4, 94], np.array([1, 4, 94])):
            taken = array.take(takers)
            wanted = index.take(takers)
            tm.assert_index_equal(self.index_cls(taken), wanted)

    def test_take_fill(self):
        index = self.index_cls._simple_new(self._daily_i8(10), freq="D")
        array = self.array_cls(index)

        # each NA-like fill value ends up as NaT
        for na_value in (None, np.nan, pd.NaT):
            filled = array.take([-1, 1], allow_fill=True, fill_value=na_value)
            assert filled[0] is pd.NaT

        # an int, a float and an unrelated object are all rejected
        for invalid in (2, 2.0, pd.Timestamp.now().time):
            with pytest.raises(ValueError):
                array.take([0, 1], allow_fill=True, fill_value=invalid)

    def test_concat_same_type(self):
        base = self.index_cls._simple_new(self._daily_i8(10), freq="D")
        index = base.insert(0, pd.NaT)
        array = self.array_cls(index)

        combined = array._concat_same_type([array[:-1], array[1:], array])
        wanted = index._concat_same_dtype([index[:-1], index[1:], index], None)

        tm.assert_index_equal(self.index_cls(combined), wanted)

    def test_unbox_scalar(self):
        array = self.array_cls(self._daily_i8(10), freq="D")

        # both an element and NaT unbox to a Python int
        for scalar in (array[0], pd.NaT):
            unboxed = array._unbox_scalar(scalar)
            assert isinstance(unboxed, int)

        with pytest.raises(ValueError):
            array._unbox_scalar("foo")

    def test_check_compatible_with(self):
        array = self.array_cls(self._daily_i8(10), freq="D")

        # none of these may raise
        for other in (array[0], array[:1], pd.NaT):
            array._check_compatible_with(other)

    def test_scalar_from_string(self):
        array = self.array_cls(self._daily_i8(10), freq="D")
        first = array[0]

        parsed = array._scalar_from_string(str(first))
        assert parsed == first

    def test_reduce_invalid(self):
        array = self.array_cls(self._daily_i8(10), freq="D")

        with pytest.raises(TypeError, match="cannot perform"):
            array._reduce("not a method")

    @pytest.mark.parametrize("method", ["pad", "backfill"])
    def test_fillna_method_doesnt_change_orig(self, method):
        array = self.array_cls(self._daily_i8(10), freq="D")
        array[4] = pd.NaT

        # expected fill: the element before the gap for "pad", after otherwise
        neighbour = 3 if method == "pad" else 5
        fill_value = array[neighbour]

        filled = array.fillna(method=method)
        assert filled[4] == fill_value

        # check that the original was not changed
        assert array[4] is pd.NaT

    def test_searchsorted(self):
        array = self.array_cls(self._daily_i8(10), freq="D")

        # scalar
        assert array.searchsorted(array[1]) == 1
        assert array.searchsorted(array[2], side="right") == 3

        # own-type
        window = array[1:3]
        left = array.searchsorted(window)
        tm.assert_numpy_array_equal(left, np.array([1, 2], dtype=np.intp))

        right = array.searchsorted(window, side="right")
        tm.assert_numpy_array_equal(right, np.array([2, 3], dtype=np.intp))

        # Following numpy convention, NaT goes at the beginning
        # (unlike NaN which goes at the end)
        assert array.searchsorted(pd.NaT) == 0

    def test_setitem(self):
        array = self.array_cls(self._daily_i8(10), freq="D")
        mirror = self._daily_i8(10)

        # apply each assignment to the array and to a plain int64 mirror
        array[0] = array[1]
        mirror[0] = mirror[1]
        tm.assert_numpy_array_equal(array.asi8, mirror)

        array[:2] = array[-2:]
        mirror[:2] = mirror[-2:]
        tm.assert_numpy_array_equal(array.asi8, mirror)

    def test_setitem_raises(self):
        array = self.array_cls(self._daily_i8(10), freq="D")
        first = array[0]

        with pytest.raises(IndexError, match="index 12 is out of bounds"):
            array[12] = first

        with pytest.raises(TypeError, match="'value' should be a.* 'object'"):
            array[0] = object()
class TestDatetimeArray(SharedTests):
    """DatetimeArray checks, mostly by comparison against DatetimeIndex."""

    index_cls = pd.DatetimeIndex
    array_cls = DatetimeArray

    def test_round(self, tz_naive_fixture):
        # GH#24064
        index = pd.date_range(
            "2016-01-01 01:01:00", periods=3, freq="H", tz=tz_naive_fixture
        )

        rounded = index.round(freq="2T")
        tm.assert_index_equal(rounded, index - pd.Timedelta(minutes=1))

    def test_array_interface(self, datetime_index):
        dta = DatetimeArray(datetime_index)

        # With the default dtype, and with M8[ns] spelled out, asarray and
        # np.array(copy=False) give the same underlying data (for tz naive).
        for kwargs in ({}, {"dtype": "datetime64[ns]"}):
            converted = np.asarray(dta, **kwargs)
            backing = dta._data
            assert converted is backing
            tm.assert_numpy_array_equal(converted, backing)

            converted = np.array(dta, copy=False, **kwargs)
            assert converted is backing
            tm.assert_numpy_array_equal(converted, backing)

        # without copy=False a distinct object with equal contents comes back
        converted = np.array(dta, dtype="datetime64[ns]")
        assert converted is not backing
        tm.assert_numpy_array_equal(converted, backing)

        # to object dtype
        as_object = np.asarray(dta, dtype=object)
        tm.assert_numpy_array_equal(as_object, np.array(list(dta), dtype=object))

        # to other dtype always copies
        as_int = np.asarray(dta, dtype="int64")
        assert as_int is not dta.asi8
        assert not np.may_share_memory(dta, as_int)
        tm.assert_numpy_array_equal(as_int, dta.asi8.copy())

        # other dtypes handled by numpy
        for target in ["float64", str]:
            converted = np.asarray(dta, dtype=target)
            wanted = np.asarray(dta).astype(target)
            tm.assert_numpy_array_equal(converted, wanted)

    def test_array_object_dtype(self, tz_naive_fixture):
        # GH#23524
        index = pd.date_range("2016-01-01", periods=3, tz=tz_naive_fixture)
        dta = DatetimeArray(index)
        wanted = np.array(list(index))

        # also test the DatetimeIndex method while we're at it
        for source in (dta, index):
            tm.assert_numpy_array_equal(np.array(source, dtype=object), wanted)

    def test_array_tz(self, tz_naive_fixture):
        # GH#23524
        index = pd.date_range("2016-01-01", periods=3, tz=tz_naive_fixture)
        dta = DatetimeArray(index)
        wanted = index.asi8.view("M8[ns]")

        spellings = ("M8[ns]", "datetime64[ns]")
        for spelling in spellings:
            tm.assert_numpy_array_equal(np.array(dta, dtype=spelling), wanted)

        # check that we are not making copies when setting copy=False
        for spelling in spellings:
            shared = np.array(dta, dtype=spelling, copy=False)
            assert shared.base is wanted.base
            assert shared.base is not None

    def test_array_i8_dtype(self, tz_naive_fixture):
        index = pd.date_range("2016-01-01", periods=3, tz=tz_naive_fixture)
        dta = DatetimeArray(index)
        wanted = index.asi8

        for int_dtype in ("i8", np.int64):
            tm.assert_numpy_array_equal(np.array(dta, dtype=int_dtype), wanted)

        # check that we are still making copies when setting copy=False
        copied = np.array(dta, dtype="i8", copy=False)
        assert copied.base is not wanted.base
        assert copied.base is None

    def test_from_array_keeps_base(self):
        # Ensure that DatetimeArray._data.base isn't lost.
        raw = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")

        assert DatetimeArray(raw)._data is raw
        assert DatetimeArray(raw[:0])._data.base is raw

    def test_from_dti(self, tz_naive_fixture):
        index = pd.date_range("2016-01-01", periods=3, tz=tz_naive_fixture)
        dta = DatetimeArray(index)
        assert list(index) == list(dta)

        # Check that Index.__new__ knows what to do with DatetimeArray
        rebuilt = pd.Index(dta)
        assert isinstance(rebuilt, pd.DatetimeIndex)
        assert list(rebuilt) == list(dta)

    def test_astype_object(self, tz_naive_fixture):
        index = pd.date_range("2016-01-01", periods=3, tz=tz_naive_fixture)
        dta = DatetimeArray(index)

        as_object = dta.astype("O")
        assert isinstance(as_object, np.ndarray)
        assert as_object.dtype == "O"
        assert list(as_object) == list(index)

    @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"])
    def test_to_perioddelta(self, datetime_index, freqstr):
        # GH#23113
        dta = DatetimeArray(datetime_index)

        from_index = datetime_index.to_perioddelta(freq=freqstr)
        from_array = dta.to_perioddelta(freq=freqstr)
        assert isinstance(from_array, TimedeltaArray)

        # placeholder until these become actual EA subclasses and we can use
        # an EA-specific tm.assert_ function
        tm.assert_index_equal(pd.Index(from_array), pd.Index(from_index))

    @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"])
    def test_to_period(self, datetime_index, freqstr):
        dta = DatetimeArray(datetime_index)

        from_index = datetime_index.to_period(freq=freqstr)
        from_array = dta.to_period(freq=freqstr)
        assert isinstance(from_array, PeriodArray)

        # placeholder until these become actual EA subclasses and we can use
        # an EA-specific tm.assert_ function
        tm.assert_index_equal(pd.Index(from_array), pd.Index(from_index))

    @pytest.mark.parametrize("propname", pd.DatetimeIndex._bool_ops)
    def test_bool_properties(self, datetime_index, propname):
        # in this case _bool_ops is just `is_leap_year`
        dta = DatetimeArray(datetime_index)
        assert datetime_index.freq == dta.freq

        from_array = getattr(dta, propname)
        from_index = np.array(
            getattr(datetime_index, propname), dtype=from_array.dtype
        )
        tm.assert_numpy_array_equal(from_array, from_index)

    @pytest.mark.parametrize("propname", pd.DatetimeIndex._field_ops)
    def test_int_properties(self, datetime_index, propname):
        dta = DatetimeArray(datetime_index)

        from_array = getattr(dta, propname)
        from_index = np.array(
            getattr(datetime_index, propname), dtype=from_array.dtype
        )
        tm.assert_numpy_array_equal(from_array, from_index)

    def test_take_fill_valid(self, datetime_index, tz_naive_fixture):
        index = datetime_index.tz_localize(tz_naive_fixture)
        dta = DatetimeArray(index)

        now = pd.Timestamp.now().tz_localize(index.tz)
        taken = dta.take([-1, 1], allow_fill=True, fill_value=now)
        assert taken[0] == now

        with pytest.raises(ValueError):
            # fill_value Timedelta invalid
            dta.take([-1, 1], allow_fill=True, fill_value=now - now)

        with pytest.raises(ValueError):
            # fill_value Period invalid
            dta.take([-1, 1], allow_fill=True, fill_value=pd.Period("2014Q1"))

        # pick whichever tz-awareness the array does not have
        other_tz = "US/Eastern" if index.tz is None else None
        mismatched = pd.Timestamp.now().tz_localize(other_tz)
        with pytest.raises(TypeError):
            # Timestamp with mismatched tz-awareness
            dta.take([-1, 1], allow_fill=True, fill_value=mismatched)

        with pytest.raises(ValueError):
            # require NaT, not iNaT, as it could be confused with an integer
            dta.take([-1, 1], allow_fill=True, fill_value=pd.NaT.value)

    def test_concat_same_type_invalid(self, datetime_index):
        # different timezones
        dta = DatetimeArray(datetime_index)

        target_tz = "UTC" if dta.tz is None else None
        other = dta.tz_localize(target_tz)

        with pytest.raises(AssertionError):
            dta._concat_same_type([dta, other])

    def test_concat_same_type_different_freq(self):
        # we *can* concatenate DTI with different freqs.
        daily = DatetimeArray(
            pd.date_range("2000", periods=2, freq="D", tz="US/Central")
        )
        hourly = DatetimeArray(
            pd.date_range("2000", periods=2, freq="H", tz="US/Central")
        )
        combined = DatetimeArray._concat_same_type([daily, hourly])

        stamps = pd.to_datetime(
            [
                "2000-01-01 00:00:00",
                "2000-01-02 00:00:00",
                "2000-01-01 00:00:00",
                "2000-01-01 01:00:00",
            ]
        )
        wanted = DatetimeArray(stamps.tz_localize("US/Central"))

        tm.assert_datetime_array_equal(combined, wanted)
class TestTimedeltaArray(SharedTests):
    """TimedeltaArray checks, mostly by comparison against TimedeltaIndex."""

    index_cls = pd.TimedeltaIndex
    array_cls = TimedeltaArray

    def test_from_tdi(self):
        index = pd.TimedeltaIndex(["1 Day", "3 Hours"])
        tda = TimedeltaArray(index)
        assert list(tda) == list(index)

        # Check that Index.__new__ knows what to do with TimedeltaArray
        rebuilt = pd.Index(tda)
        assert isinstance(rebuilt, pd.TimedeltaIndex)
        assert list(rebuilt) == list(tda)

    def test_astype_object(self):
        index = pd.TimedeltaIndex(["1 Day", "3 Hours"])
        tda = TimedeltaArray(index)

        as_object = tda.astype("O")
        assert isinstance(as_object, np.ndarray)
        assert as_object.dtype == "O"
        assert list(as_object) == list(index)

    def test_to_pytimedelta(self, timedelta_index):
        tda = TimedeltaArray(timedelta_index)

        from_index = timedelta_index.to_pytimedelta()
        from_array = tda.to_pytimedelta()

        tm.assert_numpy_array_equal(from_array, from_index)

    def test_total_seconds(self, timedelta_index):
        tda = TimedeltaArray(timedelta_index)

        from_index = timedelta_index.total_seconds()
        from_array = tda.total_seconds()

        tm.assert_numpy_array_equal(from_array, from_index.values)

    @pytest.mark.parametrize("propname", pd.TimedeltaIndex._field_ops)
    def test_int_properties(self, timedelta_index, propname):
        tda = TimedeltaArray(timedelta_index)

        from_array = getattr(tda, propname)
        from_index = np.array(
            getattr(timedelta_index, propname), dtype=from_array.dtype
        )

        tm.assert_numpy_array_equal(from_array, from_index)

    def test_array_interface(self, timedelta_index):
        tda = TimedeltaArray(timedelta_index)

        # With the default dtype, and with m8[ns] spelled out, asarray and
        # np.array(copy=False) give the same underlying data.
        for kwargs in ({}, {"dtype": "timedelta64[ns]"}):
            converted = np.asarray(tda, **kwargs)
            backing = tda._data
            assert converted is backing
            tm.assert_numpy_array_equal(converted, backing)

            converted = np.array(tda, copy=False, **kwargs)
            assert converted is backing
            tm.assert_numpy_array_equal(converted, backing)

        # without copy=False a distinct object with equal contents comes back
        converted = np.array(tda, dtype="timedelta64[ns]")
        assert converted is not backing
        tm.assert_numpy_array_equal(converted, backing)

        # to object dtype
        as_object = np.asarray(tda, dtype=object)
        tm.assert_numpy_array_equal(as_object, np.array(list(tda), dtype=object))

        # to other dtype always copies
        as_int = np.asarray(tda, dtype="int64")
        assert as_int is not tda.asi8
        assert not np.may_share_memory(tda, as_int)
        tm.assert_numpy_array_equal(as_int, tda.asi8.copy())

        # other dtypes handled by numpy
        for target in ["float64", str]:
            converted = np.asarray(tda, dtype=target)
            wanted = np.asarray(tda).astype(target)
            tm.assert_numpy_array_equal(converted, wanted)

    def test_take_fill_valid(self, timedelta_index):
        tda = TimedeltaArray(timedelta_index)

        one_day = pd.Timedelta(days=1)
        taken = tda.take([-1, 1], allow_fill=True, fill_value=one_day)
        assert taken[0] == one_day

        stamp = pd.Timestamp.now()
        with pytest.raises(ValueError):
            # fill_value Timestamp invalid
            tda.take([0, 1], allow_fill=True, fill_value=stamp)

        with pytest.raises(ValueError):
            # fill_value Period invalid
            tda.take([0, 1], allow_fill=True, fill_value=stamp.to_period("D"))
class TestPeriodArray(SharedTests):
    """PeriodArray checks, mostly by comparison against PeriodIndex."""

    index_cls = pd.PeriodIndex
    array_cls = PeriodArray

    def test_from_pi(self, period_index):
        parr = PeriodArray(period_index)
        assert list(parr) == list(period_index)

        # Check that Index.__new__ knows what to do with PeriodArray
        rebuilt = pd.Index(parr)
        assert isinstance(rebuilt, pd.PeriodIndex)
        assert list(rebuilt) == list(parr)

    def test_astype_object(self, period_index):
        parr = PeriodArray(period_index)

        as_object = parr.astype("O")
        assert isinstance(as_object, np.ndarray)
        assert as_object.dtype == "O"
        assert list(as_object) == list(period_index)

    @pytest.mark.parametrize("how", ["S", "E"])
    def test_to_timestamp(self, how, period_index):
        parr = PeriodArray(period_index)

        from_index = DatetimeArray(period_index.to_timestamp(how=how))
        from_array = parr.to_timestamp(how=how)
        assert isinstance(from_array, DatetimeArray)

        # placeholder until these become actual EA subclasses and we can use
        # an EA-specific tm.assert_ function
        tm.assert_index_equal(pd.Index(from_array), pd.Index(from_index))

    @pytest.mark.parametrize("propname", PeriodArray._bool_ops)
    def test_bool_properties(self, period_index, propname):
        # in this case _bool_ops is just `is_leap_year`
        parr = PeriodArray(period_index)

        from_array = getattr(parr, propname)
        from_index = np.array(getattr(period_index, propname))

        tm.assert_numpy_array_equal(from_array, from_index)

    @pytest.mark.parametrize("propname", PeriodArray._field_ops)
    def test_int_properties(self, period_index, propname):
        parr = PeriodArray(period_index)

        from_array = getattr(parr, propname)
        from_index = np.array(getattr(period_index, propname))

        tm.assert_numpy_array_equal(from_array, from_index)

    def test_array_interface(self, period_index):
        parr = PeriodArray(period_index)

        # default asarray gives objects
        as_default = np.asarray(parr)
        as_objects = np.array(list(parr), dtype=object)
        tm.assert_numpy_array_equal(as_default, as_objects)

        # to object dtype (same as default)
        tm.assert_numpy_array_equal(np.asarray(parr, dtype=object), as_objects)

        # to other dtypes
        for numeric in ("int64", "float64"):
            with pytest.raises(TypeError):
                np.asarray(parr, dtype=numeric)

        as_bytes = np.asarray(parr, dtype="S20")
        tm.assert_numpy_array_equal(as_bytes, np.asarray(parr).astype("S20"))
@pytest.mark.parametrize(
    "array,casting_nats",
    [
        (
            pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"])._data,
            (pd.NaT, np.timedelta64("NaT", "ns")),
        ),
        (
            pd.date_range("2000-01-01", periods=3, freq="D")._data,
            (pd.NaT, np.datetime64("NaT", "ns")),
        ),
        (pd.period_range("2000-01-01", periods=3, freq="D")._data, (pd.NaT,)),
    ],
    ids=lambda x: type(x).__name__,
)
def test_casting_nat_setitem_array(array, casting_nats):
    """Each compatible NaT flavour can be assigned and is stored as NaT."""
    array_type = type(array)
    wanted = array_type._from_sequence([pd.NaT, array[1], array[2]])

    for nat in casting_nats:
        # work on a copy so each flavour starts from the untouched input
        target = array.copy()
        target[0] = nat
        tm.assert_equal(target, wanted)
@pytest.mark.parametrize(
    "array,non_casting_nats",
    [
        (
            pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"])._data,
            (np.datetime64("NaT", "ns"),),
        ),
        (
            pd.date_range("2000-01-01", periods=3, freq="D")._data,
            (np.timedelta64("NaT", "ns"),),
        ),
        (
            pd.period_range("2000-01-01", periods=3, freq="D")._data,
            (np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")),
        ),
    ],
    ids=lambda x: type(x).__name__,
)
def test_invalid_nat_setitem_array(array, non_casting_nats):
    """Assigning a NaT of the wrong flavour must raise TypeError."""
    for wrong_nat in non_casting_nats:
        with pytest.raises(TypeError):
            array[0] = wrong_nat

View File

@@ -0,0 +1,314 @@
"""
Tests for DatetimeArray
"""
import operator
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas.core.arrays import DatetimeArray
from pandas.core.arrays.datetimes import sequence_to_dt64ns
import pandas.util.testing as tm
class TestDatetimeArrayConstructor:
    """Validation performed when constructing a DatetimeArray."""

    def test_only_1dim_accepted(self):
        values = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")

        with pytest.raises(ValueError, match="Only 1-dimensional"):
            # 2-dim
            two_dim = values.reshape(2, 2)
            DatetimeArray(two_dim)

        with pytest.raises(ValueError, match="Only 1-dimensional"):
            # 0-dim
            zero_dim = values[[0]].squeeze()
            DatetimeArray(zero_dim)

    def test_freq_validation(self):
        # GH#24623 check that invalid instances cannot be created with the
        # public constructor
        hourly = np.arange(5, dtype=np.int64) * 3600 * 10 ** 9

        msg = (
            "Inferred frequency H from passed values does not "
            "conform to passed frequency W-SUN"
        )
        with pytest.raises(ValueError, match=msg):
            DatetimeArray(hourly, freq="W")

    @pytest.mark.parametrize(
        "meth",
        [
            DatetimeArray._from_sequence,
            sequence_to_dt64ns,
            pd.to_datetime,
            pd.DatetimeIndex,
        ],
    )
    def test_mixing_naive_tzaware_raises(self, meth):
        # GH#24569
        naive_then_aware = np.array(
            [pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]
        )
        aware_then_naive = naive_then_aware[::-1]

        msg = (
            "Cannot mix tz-aware with tz-naive values|"
            "Tz-aware datetime.datetime cannot be converted "
            "to datetime64 unless utc=True"
        )

        # check that we raise regardless of whether naive is found
        # before aware or vice-versa
        for mixed in [naive_then_aware, aware_then_naive]:
            with pytest.raises(ValueError, match=msg):
                meth(mixed)

    def test_from_pandas_array(self):
        hourly = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10 ** 9

        built = DatetimeArray._from_sequence(hourly, freq="infer")

        wanted = pd.date_range("1970-01-01", periods=5, freq="H")._data
        tm.assert_datetime_array_equal(built, wanted)

    def test_mismatched_timezone_raises(self):
        central = DatetimeArray(
            np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"),
            dtype=DatetimeTZDtype(tz="US/Central"),
        )
        eastern_dtype = DatetimeTZDtype(tz="US/Eastern")

        with pytest.raises(TypeError, match="Timezone of the array"):
            DatetimeArray(central, dtype=eastern_dtype)

    def test_non_array_raises(self):
        # a plain list is rejected
        with pytest.raises(ValueError, match="list"):
            DatetimeArray([1, 2, 3])

    def test_other_type_raises(self):
        bools = np.array([1, 2, 3], dtype="bool")
        with pytest.raises(
            ValueError, match="The dtype of 'values' is incorrect.*bool"
        ):
            DatetimeArray(bools)

    def test_incorrect_dtype_raises(self):
        ints = np.array([1, 2, 3], dtype="i8")
        with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
            DatetimeArray(ints, dtype="category")

    def test_freq_infer_raises(self):
        ints = np.array([1, 2, 3], dtype="i8")
        with pytest.raises(ValueError, match="Frequency inference"):
            DatetimeArray(ints, freq="infer")

    def test_copy(self):
        raw = np.array([1, 2, 3], dtype="M8[ns]")

        # copy=False keeps the very same ndarray, copy=True does not
        assert DatetimeArray(raw, copy=False)._data is raw
        assert DatetimeArray(raw, copy=True)._data is not raw
class TestDatetimeArrayComparisons:
    """Comparison operators between a DatetimeArray and equal-valued operands."""

    # TODO: merge this into tests/arithmetic/test_datetime64 once it is
    # sufficiently robust

    def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators):
        # arbitrary tz-naive DatetimeIndex
        opname = all_compare_operators.strip("_")
        compare = getattr(operator, opname)

        index = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None)
        dta = DatetimeArray(index)
        assert dta.freq == index.freq
        assert dta.tz == index.tz

        # both operands always hold equal values, so the strict and
        # not-equal comparisons are all-False and the rest are all-True
        all_true = np.ones(len(dta), dtype=bool)
        wanted = ~all_true if opname in ["ne", "gt", "lt"] else all_true

        tm.assert_numpy_array_equal(compare(dta, dta), wanted)

        # TODO: add list and tuple, and object-dtype once those
        # are fixed in the constructor
        for other in [index, np.array(index)]:
            forward = compare(dta, other)
            tm.assert_numpy_array_equal(forward, wanted)

            reflected = compare(other, dta)
            tm.assert_numpy_array_equal(reflected, wanted)
class TestDatetimeArray:
    """Behavior of DatetimeArray methods, with emphasis on tz handling."""

    def test_astype_to_same(self):
        central = DatetimeArray._from_sequence(["2000"], tz="US/Central")
        same_dtype = DatetimeTZDtype(tz="US/Central")

        # with copy=False the very same object is returned
        assert central.astype(same_dtype, copy=False) is central

    @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
    def test_astype_int(self, dtype):
        stamps = [pd.Timestamp("2000"), pd.Timestamp("2001")]
        dta = DatetimeArray._from_sequence(stamps)
        converted = dta.astype(dtype)

        # unsigned requests are expected as uint64, everything else as int64
        is_unsigned = np.dtype(dtype).kind == "u"
        expected_dtype = np.dtype("uint64" if is_unsigned else "int64")
        wanted = dta.astype(expected_dtype)

        assert converted.dtype == expected_dtype
        tm.assert_numpy_array_equal(converted, wanted)

    def test_tz_setter_raises(self):
        central = DatetimeArray._from_sequence(["2000"], tz="US/Central")
        with pytest.raises(AttributeError, match="tz_localize"):
            central.tz = "UTC"

    def test_setitem_different_tz_raises(self):
        raw = np.array([1, 2, 3], dtype="M8[ns]")
        central = DatetimeArray(
            raw, copy=False, dtype=DatetimeTZDtype(tz="US/Central")
        )

        # tz-naive value
        with pytest.raises(ValueError, match="None"):
            central[0] = pd.Timestamp("2000")

        # value in another timezone
        with pytest.raises(ValueError, match="US/Central"):
            central[0] = pd.Timestamp("2000", tz="US/Eastern")

    def test_setitem_clears_freq(self):
        daily = DatetimeArray(
            pd.date_range("2000", periods=2, freq="D", tz="US/Central")
        )
        daily[0] = pd.Timestamp("2000", tz="US/Central")
        assert daily.freq is None

    def test_repeat_preserves_tz(self):
        index = pd.date_range("2000", periods=2, freq="D", tz="US/Central")
        dta = DatetimeArray(index)

        repeated = dta.repeat([1, 1])

        # preserves tz and values, but not freq
        wanted = DatetimeArray(dta.asi8, freq=None, dtype=dta.dtype)
        tm.assert_equal(repeated, wanted)

    def test_value_counts_preserves_tz(self):
        index = pd.date_range("2000", periods=2, freq="D", tz="US/Central")
        dta = DatetimeArray(index).repeat([4, 3])

        counts = dta.value_counts()

        # Note: not tm.assert_index_equal, since `freq`s do not match
        assert counts.index.equals(index)

        dta[-2] = pd.NaT
        counts = dta.value_counts()
        wanted = pd.Series([1, 4, 2], index=[pd.NaT, index[0], index[1]])
        tm.assert_series_equal(counts, wanted)

    @pytest.mark.parametrize("method", ["pad", "backfill"])
    def test_fillna_preserves_tz(self, method):
        index = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central")
        dta = DatetimeArray(index, copy=True)
        dta[2] = pd.NaT

        # expected fill: the element before the gap for "pad", after otherwise
        neighbour = 1 if method == "pad" else 3
        fill_val = index[neighbour]
        wanted = DatetimeArray._from_sequence(
            [index[0], index[1], fill_val, index[3], index[4]],
            freq=None,
            tz="US/Central",
        )

        filled = dta.fillna(method=method)
        tm.assert_extension_array_equal(filled, wanted)

        # assert that arr and dti were not modified in-place
        assert dta[2] is pd.NaT
        assert index[2] == pd.Timestamp("2000-01-03", tz="US/Central")

    def test_array_interface_tz(self):
        tz = "US/Central"
        dta = DatetimeArray(pd.date_range("2017", periods=2, tz=tz))

        # default and explicit object dtype both give tz-aware Timestamps
        as_objects = np.array(
            [
                pd.Timestamp("2017-01-01T00:00:00", tz=tz),
                pd.Timestamp("2017-01-02T00:00:00", tz=tz),
            ],
            dtype=object,
        )
        tm.assert_numpy_array_equal(np.asarray(dta), as_objects)
        tm.assert_numpy_array_equal(np.asarray(dta, dtype=object), as_objects)

        as_m8 = np.array(
            ["2017-01-01T06:00:00", "2017-01-02T06:00:00"], dtype="M8[ns]"
        )
        tm.assert_numpy_array_equal(np.asarray(dta, dtype="M8[ns]"), as_m8)

    def test_array_interface(self):
        dta = DatetimeArray(pd.date_range("2017", periods=2))

        as_m8 = np.array(
            ["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]"
        )
        tm.assert_numpy_array_equal(np.asarray(dta), as_m8)

        as_objects = np.array(
            [pd.Timestamp("2017-01-01T00:00:00"), pd.Timestamp("2017-01-02T00:00:00")],
            dtype=object,
        )
        tm.assert_numpy_array_equal(np.asarray(dta, dtype=object), as_objects)
class TestSequenceToDT64NS:
    """Timezone/dtype handling of ``sequence_to_dt64ns``."""

    def test_tz_dtype_mismatch_raises(self):
        central = DatetimeArray._from_sequence(["2000"], tz="US/Central")
        utc_dtype = DatetimeTZDtype(tz="UTC")

        with pytest.raises(TypeError, match="data is already tz-aware"):
            sequence_to_dt64ns(central, dtype=utc_dtype)

    def test_tz_dtype_matches(self):
        central = DatetimeArray._from_sequence(["2000"], tz="US/Central")
        central_dtype = DatetimeTZDtype(tz="US/Central")

        # only the first element of the returned triple is checked
        values, _, _ = sequence_to_dt64ns(central, dtype=central_dtype)
        tm.assert_numpy_array_equal(central._data, values)
class TestReductions:
    """min/max reductions on DatetimeArray, tz-naive and tz-aware."""

    @pytest.mark.parametrize("tz", [None, "US/Central"])
    def test_min_max(self, tz):
        dta = DatetimeArray._from_sequence(
            [
                "2000-01-03",
                "2000-01-03",
                "NaT",
                "2000-01-02",
                "2000-01-05",
                "2000-01-04",
            ],
            tz=tz,
        )

        # by default the NaT entry is ignored
        assert dta.min() == pd.Timestamp("2000-01-02", tz=tz)
        assert dta.max() == pd.Timestamp("2000-01-05", tz=tz)

        # with skipna=False the NaT entry decides the result
        assert dta.min(skipna=False) is pd.NaT
        assert dta.max(skipna=False) is pd.NaT

    @pytest.mark.parametrize("tz", [None, "US/Central"])
    @pytest.mark.parametrize("skipna", [True, False])
    def test_min_max_empty(self, skipna, tz):
        empty = DatetimeArray._from_sequence([], tz=tz)

        assert empty.min(skipna=skipna) is pd.NaT
        assert empty.max(skipna=skipna) is pd.NaT

View File

@@ -0,0 +1,816 @@
import numpy as np
import pytest
from pandas.core.dtypes.generic import ABCIndexClass
import pandas as pd
from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar
from pandas.core.arrays import IntegerArray, integer_array
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
from pandas.tests.extension.base import BaseOpsUtil
import pandas.util.testing as tm
def make_data():
    """Return 100 values: 0-7, NaN, 10-97, NaN, 99, 100."""
    values = list(range(8))
    values.append(np.nan)
    values.extend(range(10, 98))
    values.append(np.nan)
    values.extend([99, 100])
    return values
@pytest.fixture(
    params=[
        Int8Dtype,
        Int16Dtype,
        Int32Dtype,
        Int64Dtype,
        UInt8Dtype,
        UInt16Dtype,
        UInt32Dtype,
        UInt64Dtype,
    ]
)
def dtype(request):
    """Fixture yielding an instance of each parametrized integer dtype class."""
    dtype_cls = request.param
    return dtype_cls()
@pytest.fixture
def data(dtype):
    """Integer array built from make_data() with the parametrized dtype."""
    values = make_data()
    return integer_array(values, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
    """Two-element integer array: a missing value followed by 1."""
    values = [np.nan, 1]
    return integer_array(values, dtype=dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture giving 'data' and 'data_missing'"""
    # look the fixture up by name; an unknown name yields None
    by_name = {"data": data, "data_missing": data_missing}
    return by_name.get(request.param)
def test_dtypes(dtype):
    """Smoke test: the numpy kind matches the dtype's signedness and a name is set."""
    expected_kind = "i" if dtype.is_signed_integer else "u"
    assert np.dtype(dtype.type).kind == expected_kind
    assert dtype.name is not None
@pytest.mark.parametrize(
    "dtype, expected",
    [
        (Int8Dtype(), "Int8Dtype()"),
        (Int16Dtype(), "Int16Dtype()"),
        (Int32Dtype(), "Int32Dtype()"),
        (Int64Dtype(), "Int64Dtype()"),
        (UInt8Dtype(), "UInt8Dtype()"),
        (UInt16Dtype(), "UInt16Dtype()"),
        (UInt32Dtype(), "UInt32Dtype()"),
        (UInt64Dtype(), "UInt64Dtype()"),
    ],
)
def test_repr_dtype(dtype, expected):
    """repr() of each integer dtype instance matches the listed string."""
    rendered = repr(dtype)
    assert rendered == expected
def test_repr_array():
    """A short IntegerArray repr lists every element plus length and dtype."""
    arr = integer_array([1, None, 3])
    assert repr(arr) == "<IntegerArray>\n[1, NaN, 3]\nLength: 3, dtype: Int64"
def test_repr_array_long():
    """A long IntegerArray repr is truncated, with elements right-justified."""
    data = integer_array([1, 2, None] * 1000)
    # Every element is padded to the width of the widest one ("NaN"), so the
    # integers carry leading spaces; collapsing them makes the comparison fail.
    expected = (
        "<IntegerArray>\n"
        "[  1,   2, NaN,   1,   2, NaN,   1,   2, NaN,   1,\n"
        " ...\n"
        " NaN,   1,   2, NaN,   1,   2, NaN,   1,   2, NaN]\n"
        "Length: 3000, dtype: Int64"
    )
    result = repr(data)
    assert result == expected
class TestConstructors:
    """Series construction with an integer extension dtype given as a string."""

    def test_from_dtype_from_float(self, data):
        # construct from our dtype & string dtype
        dtype = data.dtype
        dtype_name = str(dtype)

        # from float
        as_float = np.array(data).astype("float")
        tm.assert_series_equal(pd.Series(as_float, dtype=dtype_name), pd.Series(data))

        # from int / list
        as_list = np.array(data).tolist()
        tm.assert_series_equal(pd.Series(as_list, dtype=dtype_name), pd.Series(data))

        # from int / array
        expected = pd.Series(data).dropna().reset_index(drop=True)
        numpy_dtype = np.dtype((dtype.type))
        dropped = np.array(data.dropna()).astype(numpy_dtype)
        tm.assert_series_equal(pd.Series(dropped, dtype=dtype_name), expected)
class TestArithmeticOps(BaseOpsUtil):
    """Arithmetic on integer extension arrays, checked against plain Series ops."""

    def _check_divmod_op(self, s, op, other, exc=None):
        # the caller's ``exc`` is deliberately replaced by None
        super()._check_divmod_op(s, op, other, None)

    def _check_op(self, s, op_name, other, exc=None):
        """Apply ``op_name`` to ``s`` and ``other`` and compare with an expected result.

        The expected result is computed on a float or raw-integer Series and
        then masked at the positions that should be missing.
        """
        op = self.get_op_from_name(op_name)
        result = op(s, other)

        # compute expected
        mask = s.isna()

        # if s is a DataFrame, squeeze to a Series
        # for comparison
        if isinstance(s, pd.DataFrame):
            result = result.squeeze()
            s = s.squeeze()
            mask = mask.squeeze()

        # other array is an Integer
        if isinstance(other, IntegerArray):
            # Combine the other operand's missing positions with ours. The
            # previous code replaced ``mask`` with ``other`` itself.
            mask = mask | other._mask

        # 1 ** na is na, so need to unmask those
        if op_name == "__pow__":
            mask = np.where(s == 1, False, mask)

        elif op_name == "__rpow__":
            mask = np.where(other == 1, False, mask)

        # float result type or float op
        if (
            is_float_dtype(other)
            or is_float(other)
            or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"]
        ):
            rs = s.astype("float")
            expected = op(rs, other)
            self._check_op_float(result, expected, mask, s, op_name, other)

        # integer result type
        else:
            rs = pd.Series(s.values._data, name=s.name)
            expected = op(rs, other)
            self._check_op_integer(result, expected, mask, s, op_name, other)

    def _check_op_float(self, result, expected, mask, s, op_name, other):
        # check comparisons that are resulting in float dtypes

        expected[mask] = np.nan
        if "floordiv" in op_name:
            # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet)
            mask2 = np.isinf(expected) & np.isnan(result)
            expected[mask2] = np.nan
        tm.assert_series_equal(result, expected)

    def _check_op_integer(self, result, expected, mask, s, op_name, other):
        # check comparisons that are resulting in integer dtypes

        # to compare properly, we convert the expected
        # to float, mask to nans and convert infs
        # if we have uints then we process as uints
        # then convert to float
        # and we ultimately want to create a IntArray
        # for comparisons

        fill_value = 0

        # mod/rmod turn floating 0 into NaN while
        # integer works as expected (no nan)
        if op_name in ["__mod__", "__rmod__"]:
            if is_scalar(other):
                if other == 0:
                    expected[s.values == 0] = 0
                else:
                    expected = expected.fillna(0)
            else:
                expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0
        try:
            expected[(expected == np.inf) | (expected == -np.inf)] = fill_value
            original = expected
            expected = expected.astype(s.dtype)

        except ValueError:
            # retry via float when the direct cast is rejected
            expected = expected.astype(float)
            expected[(expected == np.inf) | (expected == -np.inf)] = fill_value
            original = expected
            expected = expected.astype(s.dtype)

        expected[mask] = np.nan

        # assert that the expected astype is ok
        # (skip for unsigned as they have wrap around)
        if not s.dtype.is_unsigned_integer:
            original = pd.Series(original)

            # we need to fill with 0's to emulate what an astype('int') does
            # (truncation) for certain ops
            if op_name in ["__rtruediv__", "__rdiv__"]:
                mask |= original.isna()
                original = original.fillna(0).astype("int")

            original = original.astype("float")
            original[mask] = np.nan
            tm.assert_series_equal(original, expected.astype("float"))

        # assert our expected result
        tm.assert_series_equal(result, expected)

    def test_arith_integer_array(self, data, all_arithmetic_operators):
        # we operate with a rhs of an integer array

        op = all_arithmetic_operators

        s = pd.Series(data)
        rhs = pd.Series([1] * len(data), dtype=data.dtype)
        rhs.iloc[-1] = np.nan

        self._check_op(s, op, rhs)

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        # scalar
        op = all_arithmetic_operators

        s = pd.Series(data)
        self._check_op(s, op, 1, exc=TypeError)

    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
        # frame & scalar
        op = all_arithmetic_operators

        df = pd.DataFrame({"A": data})
        self._check_op(df, op, 1, exc=TypeError)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        # ndarray & other series
        op = all_arithmetic_operators

        s = pd.Series(data)
        other = np.ones(len(s), dtype=s.dtype.type)
        self._check_op(s, op, other, exc=TypeError)

    def test_arith_coerce_scalar(self, data, all_arithmetic_operators):

        op = all_arithmetic_operators
        s = pd.Series(data)

        other = 0.01
        self._check_op(s, op, other)

    # the float scalar is listed once; a second identical 1.0 adds no coverage
    @pytest.mark.parametrize("other", [1.0, np.array(1.0), np.array([1.0])])
    def test_arithmetic_conversion(self, all_arithmetic_operators, other):
        # if we have a float operand we should have a float result
        # if that is equal to an integer
        op = self.get_op_from_name(all_arithmetic_operators)

        s = pd.Series([1, 2, 3], dtype="Int64")
        result = op(s, other)
        assert result.dtype is np.dtype("float")

    @pytest.mark.parametrize("other", [0, 0.5])
    def test_arith_zero_dim_ndarray(self, other):
        # a 0-dim ndarray operand gives the same result as the bare scalar
        arr = integer_array([1, None, 2])
        result = arr + np.array(other)
        expected = arr + other
        tm.assert_equal(result, expected)

    def test_error(self, data, all_arithmetic_operators):
        # invalid ops

        op = all_arithmetic_operators
        s = pd.Series(data)
        ops = getattr(s, op)
        opa = getattr(data, op)

        # invalid scalars
        with pytest.raises(TypeError):
            ops("foo")
        with pytest.raises(TypeError):
            ops(pd.Timestamp("20180101"))

        # invalid array-likes
        with pytest.raises(TypeError):
            ops(pd.Series("foo", index=s.index))

        if op != "__rpow__":
            # TODO(extension)
            # rpow with a datetimelike coerces the integer array incorrectly
            with pytest.raises(TypeError):
                ops(pd.Series(pd.date_range("20180101", periods=len(s))))

        # 2d
        with pytest.raises(NotImplementedError):
            opa(pd.DataFrame({"A": s}))

        with pytest.raises(NotImplementedError):
            opa(np.arange(len(s)).reshape(-1, len(s)))

    def test_pow(self):
        # https://github.com/pandas-dev/pandas/issues/22022
        a = integer_array([1, np.nan, np.nan, 1])
        b = integer_array([1, np.nan, 1, np.nan])
        result = a ** b
        expected = pd.core.arrays.integer_array([1, np.nan, np.nan, 1])
        tm.assert_extension_array_equal(result, expected)

    def test_rpow_one_to_na(self):
        # https://github.com/pandas-dev/pandas/issues/22022
        arr = integer_array([np.nan, np.nan])
        result = np.array([1.0, 2.0]) ** arr
        expected = np.array([1.0, np.nan])
        tm.assert_numpy_array_equal(result, expected)
class TestComparisonOps(BaseOpsUtil):
    """Comparison operators on integer extension arrays."""

    def _compare_other(self, data, op_name, other):
        """Compare ``data`` with ``other``, as a raw array and as a Series."""
        op = self.get_op_from_name(op_name)
        # masked positions compare unequal: True only for "__ne__"
        na_fill = op_name == "__ne__"

        # array
        array_result = pd.Series(op(data, other))
        array_expected = pd.Series(op(data._data, other))
        array_expected[data._mask] = na_fill
        tm.assert_series_equal(array_result, array_expected)

        # series
        series_result = op(pd.Series(data), other)
        series_expected = op(pd.Series(data._data), other)
        series_expected[data._mask] = na_fill
        tm.assert_series_equal(series_result, series_expected)

    def test_compare_scalar(self, data, all_compare_operators):
        self._compare_other(data, all_compare_operators, 0)

    def test_compare_array(self, data, all_compare_operators):
        zeros = pd.Series([0] * len(data))
        self._compare_other(data, all_compare_operators, zeros)
class TestCasting:
    """Casting between integer extension arrays, Index objects and numpy dtypes."""

    @pytest.mark.parametrize("dropna", [True, False])
    def test_construct_index(self, all_data, dropna):
        # ensure that we do not coerce to Float64Index, rather
        # keep as Index
        head = all_data[:10]
        other = np.array(head[~head.isna()]) if dropna else head

        result = pd.Index(integer_array(other, dtype=head.dtype))
        expected = pd.Index(other, dtype=object)

        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("dropna", [True, False])
    def test_astype_index(self, all_data, dropna):
        # as an int/uint index to Index
        head = all_data[:10]
        other = head[~head.isna()] if dropna else head

        target = head.dtype
        idx = pd.Index(np.array(other))
        assert isinstance(idx, ABCIndexClass)

        result = idx.astype(target)
        expected = idx.astype(object).astype(target)
        tm.assert_index_equal(result, expected)

    def test_astype(self, all_data):
        head = all_data[:10]
        own_dtype = head.dtype
        numpy_dtype = own_dtype.numpy_dtype

        ints = head[~head.isna()]
        mixed = head
        other_dtype = Int8Dtype()

        # coerce to same type - ints
        result = pd.Series(ints).astype(own_dtype)
        tm.assert_series_equal(result, pd.Series(ints))

        # coerce to same other - ints
        result = pd.Series(ints).astype(other_dtype)
        tm.assert_series_equal(result, pd.Series(ints, dtype=other_dtype))

        # coerce to same numpy_dtype - ints
        result = pd.Series(ints).astype(numpy_dtype)
        expected = pd.Series(ints._data.astype(numpy_dtype))
        tm.assert_series_equal(result, expected)

        # coerce to same type - mixed
        result = pd.Series(mixed).astype(own_dtype)
        tm.assert_series_equal(result, pd.Series(mixed))

        # coerce to same other - mixed
        result = pd.Series(mixed).astype(other_dtype)
        tm.assert_series_equal(result, pd.Series(mixed, dtype=other_dtype))

        # coerce to same numpy_dtype - mixed
        ser = pd.Series(mixed)
        with pytest.raises(ValueError):
            ser.astype(numpy_dtype)

        # coerce to object
        result = pd.Series(mixed).astype("object")
        tm.assert_series_equal(result, pd.Series(np.asarray(mixed)))

    @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"])
    def test_astype_specific_casting(self, dtype):
        # same check without and with a missing value
        for values in ([1, 2, 3], [1, 2, 3, None]):
            result = pd.Series(values, dtype="Int64").astype(dtype)
            expected = pd.Series(values, dtype=dtype)
            tm.assert_series_equal(result, expected)

    def test_construct_cast_invalid(self, dtype):
        msg = "cannot safely"
        # non-integral floats, without and with a NaN entry
        for arr in ([1.2, 2.3, 3.7], [1.2, 2.3, 3.7, np.nan]):
            with pytest.raises(TypeError, match=msg):
                integer_array(arr, dtype=dtype)

            with pytest.raises(TypeError, match=msg):
                pd.Series(arr).astype(dtype)
def test_frame_repr(data_missing):
    """repr of a frame holding an integer array shows NaN for the missing value."""
    df = pd.DataFrame({"A": data_missing})
    result = repr(df)
    # Cells are right-justified under the column header, so the padding
    # spaces are part of the expected text.
    expected = "     A\n0  NaN\n1    1"
    assert result == expected
def test_conversions(data_missing):
    """Casting an integer array column to object gives NaN and integer scalars."""
    # astype to object series
    df = pd.DataFrame({"A": data_missing})
    result = df["A"].astype("object")
    expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A")
    tm.assert_series_equal(result, expected)

    # convert to object ndarray
    # we assert that we are exactly equal
    # including type conversions of scalars
    result = df["A"].astype("object").values
    expected = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for r, e in zip(result, expected):
        if pd.isnull(r):
            assert pd.isnull(e)
        elif is_integer(r):
            assert r == e
            assert is_integer(e)
        else:
            assert r == e
            # exact type match, compared by identity
            assert type(r) is type(e)
def test_integer_array_constructor():
    """IntegerArray needs an integer ndarray plus a bool ndarray mask."""
    values = np.array([1, 2, 3, 4], dtype="int64")
    mask = np.array([False, False, False, True], dtype="bool")

    built = IntegerArray(values, mask)
    tm.assert_extension_array_equal(
        built, integer_array([1, 2, 3, np.nan], dtype="int64")
    )

    # lists are rejected for either argument
    with pytest.raises(TypeError):
        IntegerArray(values.tolist(), mask)

    with pytest.raises(TypeError):
        IntegerArray(values, mask.tolist())

    # float values are rejected
    with pytest.raises(TypeError):
        IntegerArray(values.astype(float), mask)

    # the mask argument is required
    with pytest.raises(TypeError):
        IntegerArray(values)
@pytest.mark.parametrize(
    "a, b",
    [
        ([1, None], [1, np.nan]),
        ([None], [np.nan]),
        ([None, np.nan], [np.nan, np.nan]),
        ([np.nan, np.nan], [np.nan, np.nan]),
    ],
)
def test_integer_array_constructor_none_is_nan(a, b):
    """None and NaN inputs produce equal integer arrays."""
    from_a = integer_array(a)
    from_b = integer_array(b)
    tm.assert_extension_array_equal(from_a, from_b)
def test_integer_array_constructor_copy():
    """Without copy the inputs are stored as-is; copy=True stores new objects."""
    values = np.array([1, 2, 3, 4], dtype="int64")
    mask = np.array([False, False, False, True], dtype="bool")

    shared = IntegerArray(values, mask)
    assert shared._data is values
    assert shared._mask is mask

    copied = IntegerArray(values, mask, copy=True)
    assert copied._data is not values
    assert copied._mask is not mask
@pytest.mark.parametrize(
    "values",
    [
        ["foo", "bar"],
        ["1", "2"],
        "foo",
        1,
        1.0,
        pd.date_range("20130101", periods=2),
        np.array(["foo"]),
        [[1, 2], [3, 4]],
        [np.nan, {"a": 1}],
    ],
)
def test_to_integer_array_error(values):
    """Each of these inputs makes integer_array raise TypeError."""
    # error in converting existing arrays to IntegerArrays
    with pytest.raises(TypeError):
        integer_array(values)
def test_to_integer_array_inferred_dtype():
    """The dtype follows a typed ndarray input and is Int64 for a plain list."""
    # if values has dtype -> respect it
    from_int8 = integer_array(np.array([1, 2], dtype="int8"))
    assert from_int8.dtype == Int8Dtype()

    from_int32 = integer_array(np.array([1, 2], dtype="int32"))
    assert from_int32.dtype == Int32Dtype()

    # if values have no dtype -> always int64
    from_list = integer_array([1, 2])
    assert from_list.dtype == Int64Dtype()
def test_to_integer_array_dtype_keyword():
    """An explicit dtype keyword decides the resulting dtype."""
    from_list = integer_array([1, 2], dtype="int8")
    assert from_list.dtype == Int8Dtype()

    # if values has dtype -> override it
    int8_values = np.array([1, 2], dtype="int8")
    overridden = integer_array(int8_values, dtype="int32")
    assert overridden.dtype == Int32Dtype()
def test_to_integer_array_float():
    """Integral floats convert; non-integral floats raise TypeError."""
    converted = integer_array([1.0, 2.0])
    tm.assert_extension_array_equal(converted, integer_array([1, 2]))

    with pytest.raises(TypeError, match="cannot safely cast non-equivalent"):
        integer_array([1.5, 2.0])

    # for float dtypes, the itemsize is not preserved
    float32_values = np.array([1.0, 2.0], dtype="float32")
    assert integer_array(float32_values).dtype == Int64Dtype()
@pytest.mark.parametrize(
    "bool_values, int_values, target_dtype, expected_dtype",
    [
        ([False, True], [0, 1], Int64Dtype(), Int64Dtype()),
        ([False, True], [0, 1], "Int64", Int64Dtype()),
        ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()),
    ],
)
def test_to_integer_array_bool(bool_values, int_values, target_dtype, expected_dtype):
    """Boolean input converts to the same array as the matching 0/1 integers."""
    from_bools = integer_array(bool_values, dtype=target_dtype)
    assert from_bools.dtype == expected_dtype

    from_ints = integer_array(int_values, dtype=target_dtype)
    tm.assert_extension_array_equal(from_bools, from_ints)
@pytest.mark.parametrize(
    "values, to_dtype, result_dtype",
    [
        (np.array([1], dtype="int64"), None, Int64Dtype),
        (np.array([1, np.nan]), None, Int64Dtype),
        (np.array([1, np.nan]), "int8", Int8Dtype),
    ],
)
def test_to_integer_array(values, to_dtype, result_dtype):
    """Existing ndarrays convert to an IntegerArray of the expected dtype."""
    # convert existing arrays to IntegerArrays
    converted = integer_array(values, dtype=to_dtype)
    assert converted.dtype == result_dtype()

    reference = integer_array(values, dtype=result_dtype())
    tm.assert_extension_array_equal(converted, reference)
def test_cross_type_arithmetic():
    """Int64 columns combine with a plain int column and with a UInt8 column."""
    columns = {
        "A": pd.Series([1, 2, np.nan], dtype="Int64"),
        "B": pd.Series([1, np.nan, 3], dtype="UInt8"),
        "C": [1, 2, 3],
    }
    df = pd.DataFrame(columns)

    # Int64 + plain int column
    summed = df.A + df.C
    tm.assert_series_equal(summed, pd.Series([2, 4, np.nan], dtype="Int64"))

    # comparing the scaled sum gives a plain boolean Series
    flags = (df.A + df.C) * 3 == 12
    tm.assert_series_equal(flags, pd.Series([False, True, False]))

    # Int64 + UInt8
    mixed = df.A + df.B
    tm.assert_series_equal(mixed, pd.Series([2, np.nan, np.nan], dtype="Int64"))
@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
def test_preserve_dtypes(op):
    """These reductions return int on the column and keep Int64 under groupby."""
    # TODO(#22346): preserve Int64 dtype
    # for ops that enable (mean would actually work here
    # but generally it is a float return value)
    frame = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": integer_array([1, None, 3], dtype="Int64"),
        }
    )

    # op
    reduced = getattr(frame.C, op)()
    assert isinstance(reduced, int)

    # groupby
    grouped = getattr(frame.groupby("A"), op)()
    group_index = pd.Index(["a", "b"], name="A")
    expected = pd.DataFrame(
        {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")},
        index=group_index,
    )
    tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize("op", ["mean"])
def test_reduce_to_float(op):
    """Reductions listed here return a float even for whole-number results."""
    # some reduce ops always return float, even if the result
    # is a rounded number
    frame = pd.DataFrame(
        {
            "A": ["a", "b", "b"],
            "B": [1, None, 3],
            "C": integer_array([1, None, 3], dtype="Int64"),
        }
    )

    # op
    reduced = getattr(frame.C, op)()
    assert isinstance(reduced, float)

    # groupby
    grouped = getattr(frame.groupby("A"), op)()
    group_index = pd.Index(["a", "b"], name="A")
    expected = pd.DataFrame(
        {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")},
        index=group_index,
    )
    tm.assert_frame_equal(grouped, expected)
def test_astype_nansafe():
    """Casting an array with a missing value to a numpy int dtype raises."""
    # see gh-22343
    with_missing = integer_array([np.nan, 1, 2], dtype="Int8")

    with pytest.raises(ValueError, match="cannot convert float NaN to integer"):
        with_missing.astype("uint32")
@pytest.mark.parametrize("ufunc", [np.abs, np.sign])
def test_ufuncs_single_int(ufunc):
    """Unary ufuncs listed here return an integer array, for arrays and Series."""
    arr = integer_array([1, 2, -3, np.nan])

    # directly on the array
    array_expected = integer_array(ufunc(arr.astype(float)))
    tm.assert_extension_array_equal(ufunc(arr), array_expected)

    # through a Series
    ser = pd.Series(arr)
    series_expected = pd.Series(integer_array(ufunc(arr.astype(float))))
    tm.assert_series_equal(ufunc(ser), series_expected)
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
    """Unary ufuncs listed here match the result on the float-cast input."""
    arr = integer_array([1, 2, -3, np.nan])

    # invalid-value warnings are silenced while computing both sides
    with np.errstate(invalid="ignore"):
        array_result = ufunc(arr)
        array_expected = ufunc(arr.astype(float))
    tm.assert_numpy_array_equal(array_result, array_expected)

    ser = pd.Series(arr)
    with np.errstate(invalid="ignore"):
        series_result = ufunc(ser)
        series_expected = ufunc(ser.astype(float))
    tm.assert_series_equal(series_result, series_expected)
@pytest.mark.parametrize("ufunc", [np.add, np.subtract])
def test_ufuncs_binary_int(ufunc):
    """Binary ufuncs return integer arrays for array, ndarray and scalar operands."""
    a = integer_array([1, 2, -3, np.nan])
    a_float = a.astype(float)

    # two IntegerArrays
    tm.assert_extension_array_equal(
        ufunc(a, a), integer_array(ufunc(a_float, a.astype(float)))
    )

    # IntegerArray with numpy array
    arr = np.array([1, 2, 3, 4])
    tm.assert_extension_array_equal(ufunc(a, arr), integer_array(ufunc(a_float, arr)))
    tm.assert_extension_array_equal(ufunc(arr, a), integer_array(ufunc(arr, a_float)))

    # IntegerArray with scalar
    tm.assert_extension_array_equal(ufunc(a, 1), integer_array(ufunc(a_float, 1)))
    tm.assert_extension_array_equal(ufunc(1, a), integer_array(ufunc(1, a_float)))
@pytest.mark.parametrize("values", [[0, 1], [0, None]])
def test_ufunc_reduce_raises(values):
    """ufunc.reduce on an integer array raises NotImplementedError."""
    arr = integer_array(values)
    with pytest.raises(NotImplementedError):
        np.add.reduce(arr)
# TODO(jreback) - these need testing / are broken
# shift
# set_index (destroys type)

View File

@@ -0,0 +1,213 @@
"""
Additional tests for PandasArray that aren't covered by
the interface tests.
"""
import numpy as np
import pytest
import pandas as pd
from pandas.arrays import PandasArray
from pandas.core.arrays.numpy_ import PandasDtype
import pandas.util.testing as tm
@pytest.fixture(
    params=[
        np.array(["a", "b"], dtype=object),
        np.array([0, 1], dtype=float),
        np.array([0, 1], dtype=int),
        np.array([0, 1 + 2j], dtype=complex),
        np.array([True, False], dtype=bool),
        np.array([0, 1], dtype="datetime64[ns]"),
        np.array([0, 1], dtype="timedelta64[ns]"),
    ]
)
def any_numpy_array(request):
    """
    Parametrized fixture for NumPy arrays with different dtypes.
    This excludes string and bytes.
    """
    ndarray = request.param
    return ndarray
# ----------------------------------------------------------------------------
# PandasDtype
@pytest.mark.parametrize(
    "dtype, expected",
    [
        ("bool", True),
        ("int", True),
        ("uint", True),
        ("float", True),
        ("complex", True),
        ("str", False),
        ("bytes", False),
        ("datetime64[ns]", False),
        ("object", False),
        ("void", False),
    ],
)
def test_is_numeric(dtype, expected):
    """PandasDtype._is_numeric is True only for the dtypes marked True above."""
    wrapped = PandasDtype(dtype)
    assert wrapped._is_numeric is expected
@pytest.mark.parametrize(
    "dtype, expected",
    [
        ("bool", True),
        ("int", False),
        ("uint", False),
        ("float", False),
        ("complex", False),
        ("str", False),
        ("bytes", False),
        ("datetime64[ns]", False),
        ("object", False),
        ("void", False),
    ],
)
def test_is_boolean(dtype, expected):
    """PandasDtype._is_boolean is True only for the bool dtype."""
    wrapped = PandasDtype(dtype)
    assert wrapped._is_boolean is expected
def test_repr():
    """repr of a PandasDtype shows the wrapped numpy dtype name."""
    wrapped = PandasDtype(np.dtype("int64"))
    rendered = repr(wrapped)
    assert rendered == "PandasDtype('int64')"
def test_constructor_from_string():
    """construct_from_string gives the same dtype as wrapping np.dtype directly."""
    from_string = PandasDtype.construct_from_string("int64")
    assert from_string == PandasDtype(np.dtype("int64"))
# ----------------------------------------------------------------------------
# Construction
def test_constructor_no_coercion():
    """PandasArray raises ValueError for a plain list instead of coercing it."""
    plain_list = [1, 2, 3]
    with pytest.raises(ValueError, match="NumPy array"):
        PandasArray(plain_list)
def test_series_constructor_with_copy():
    """Series(..., copy=True) does not hand back the original ndarray object."""
    source = np.array([1, 2, 3])
    wrapped = PandasArray(source)
    ser = pd.Series(wrapped, copy=True)

    assert ser.values is not source
def test_series_constructor_with_astype():
    """A dtype passed to the Series constructor casts the wrapped values."""
    wrapped = PandasArray(np.array([1, 2, 3]))
    as_float = pd.Series(wrapped, dtype="float64")
    tm.assert_series_equal(as_float, pd.Series([1.0, 2.0, 3.0], dtype="float64"))
def test_from_sequence_dtype():
    """_from_sequence casts the input to the requested dtype."""
    int_values = np.array([1, 2, 3], dtype="int64")
    converted = PandasArray._from_sequence(int_values, dtype="uint64")

    reference = PandasArray(np.array([1, 2, 3], dtype="uint64"))
    tm.assert_extension_array_equal(converted, reference)
def test_constructor_copy():
    """copy=True leaves the PandasArray with memory separate from the input."""
    source = np.array([0, 1])
    copied = PandasArray(source, copy=True)

    overlaps = np.shares_memory(copied._ndarray, source)
    assert overlaps is False
def test_constructor_with_data(any_numpy_array):
    """The wrapped array's numpy_dtype equals the input ndarray's dtype."""
    wrapped = PandasArray(any_numpy_array)
    assert wrapped.dtype.numpy_dtype == any_numpy_array.dtype
# ----------------------------------------------------------------------------
# Conversion
def test_to_numpy():
    """to_numpy returns the backing array, a new object on copy, or a cast."""
    arr = PandasArray(np.array([1, 2, 3]))

    # default: the backing ndarray itself
    assert arr.to_numpy() is arr._ndarray

    # copy=True: a different object
    assert arr.to_numpy(copy=True) is not arr._ndarray

    # dtype: converted values
    as_f8 = arr.to_numpy(dtype="f8")
    tm.assert_numpy_array_equal(as_f8, np.array([1, 2, 3], dtype="f8"))
# ----------------------------------------------------------------------------
# Setitem
def test_setitem_series():
    """Assigning through Series.array is visible in the Series."""
    ser = pd.Series([1, 2, 3])
    backing = ser.array
    backing[0] = 10

    tm.assert_series_equal(ser, pd.Series([10, 2, 3]))
def test_setitem(any_numpy_array):
    """Setting an element on a PandasArray mirrors the same ndarray assignment."""
    raw = any_numpy_array
    wrapped = PandasArray(raw, copy=True)

    # apply the same element assignment to the wrapper and to the raw array
    wrapped[0] = wrapped[1]
    raw[0] = raw[1]

    tm.assert_numpy_array_equal(wrapped.to_numpy(), raw)
# ----------------------------------------------------------------------------
# Reductions
def test_bad_reduce_raises():
    """An unknown reduction name makes _reduce raise TypeError naming it."""
    arr = np.array([1, 2, 3], dtype="int64")
    arr = PandasArray(arr)
    # Pass the bogus reduction name itself. The old test passed the expected
    # message as the name, so the match succeeded only because the name is
    # echoed back in the error text.
    name = "not_a_method"
    with pytest.raises(TypeError, match=name):
        arr._reduce(name)
def test_validate_reduction_keyword_args():
    """Passing keepdims to PandasArray.all raises ValueError."""
    wrapped = PandasArray(np.array([1, 2, 3]))
    pattern = "the 'keepdims' parameter is not supported .*all"
    with pytest.raises(ValueError, match=pattern):
        wrapped.all(keepdims=True)
# ----------------------------------------------------------------------------
# Ops
def test_ufunc():
    """Single- and multi-output ufuncs on a PandasArray return PandasArrays."""
    arr = PandasArray(np.array([-1.0, 0.0, 1.0]))
    raw = arr._ndarray

    # single-output ufunc
    tm.assert_extension_array_equal(np.abs(arr), PandasArray(np.abs(raw)))

    # two-output ufunc: each output is compared separately
    quotient, remainder = np.divmod(arr, np.add(arr, 2))
    raw_quotient, raw_remainder = np.divmod(raw, np.add(raw, 2))
    tm.assert_extension_array_equal(quotient, PandasArray(raw_quotient))
    tm.assert_extension_array_equal(remainder, PandasArray(raw_remainder))
def test_basic_binop():
    """Adding a PandasArray to itself doubles every element."""
    # Just a basic smoke test. The EA interface tests exercise this
    # more thoroughly.
    arr = PandasArray(np.array([1, 2, 3]))
    doubled = arr + arr
    tm.assert_extension_array_equal(doubled, PandasArray(np.array([2, 4, 6])))

View File

@@ -0,0 +1,325 @@
import numpy as np
import pytest
from pandas._libs.tslibs import iNaT
from pandas._libs.tslibs.period import IncompatibleFrequency
from pandas.core.dtypes.dtypes import PeriodDtype, registry
import pandas as pd
from pandas.core.arrays import PeriodArray, period_array
import pandas.util.testing as tm
# ----------------------------------------------------------------------------
# Dtype
def test_registered():
    """PeriodDtype is in the registry and 'Period[D]' resolves to it."""
    assert PeriodDtype in registry.dtypes
    found = registry.find("Period[D]")
    assert found == PeriodDtype("D")
# ----------------------------------------------------------------------------
# period_array
@pytest.mark.parametrize(
    "data, freq, expected",
    [
        ([pd.Period("2017", "D")], None, [17167]),
        ([pd.Period("2017", "D")], "D", [17167]),
        ([2017], "D", [17167]),
        (["2017"], "D", [17167]),
        ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]),
        ([pd.Period("2017", "D"), None], None, [17167, iNaT]),
        (pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]),
        (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]),
    ],
)
def test_period_array_ok(data, freq, expected):
    """period_array accepts each input and yields the listed int64 ordinals."""
    ordinals = period_array(data, freq=freq).asi8
    expected_ordinals = np.asarray(expected, dtype=np.int64)
    tm.assert_numpy_array_equal(ordinals, expected_ordinals)
def test_period_array_readonly_object():
    """A read-only object ndarray of Periods is accepted by the constructors."""
    # https://github.com/pandas-dev/pandas/issues/25403
    periods = period_array([pd.Period("2019-01-01")])
    readonly = np.asarray(periods, dtype="object")
    readonly.setflags(write=False)

    tm.assert_period_array_equal(period_array(readonly), periods)
    tm.assert_series_equal(pd.Series(readonly), pd.Series(periods))
    tm.assert_frame_equal(pd.DataFrame({"A": readonly}), pd.DataFrame({"A": periods}))
def test_from_datetime64_freq_changes():
    """_from_datetime64 converts daily datetimes to the requested monthly freq."""
    # https://github.com/pandas-dev/pandas/issues/23438
    daily = pd.date_range("2017", periods=3, freq="D")
    monthly = PeriodArray._from_datetime64(daily, freq="M")

    same_month = ["2017-01-01", "2017-01-01", "2017-01-01"]
    tm.assert_period_array_equal(monthly, period_array(same_month, freq="M"))
@pytest.mark.parametrize(
    "data, freq, msg",
    [
        (
            [pd.Period("2017", "D"), pd.Period("2017", "A")],
            None,
            "Input has different freq",
        ),
        ([pd.Period("2017", "D")], "A", "Input has different freq"),
    ],
)
def test_period_array_raises(data, freq, msg):
    """Conflicting frequencies make period_array raise IncompatibleFrequency."""
    expect_error = pytest.raises(IncompatibleFrequency, match=msg)
    with expect_error:
        period_array(data, freq)
def test_period_array_non_period_series_raies():
    """PeriodArray raises TypeError for a Series that does not hold periods."""
    int_series = pd.Series([1, 2, 3])
    with pytest.raises(TypeError, match="dtype"):
        PeriodArray(int_series, freq="D")
def test_period_array_freq_mismatch():
    """Re-wrapping a daily PeriodArray with a monthly freq raises."""
    daily = period_array(["2000", "2001"], freq="D")

    # the freq is given once as a string alias and once as an offset object
    for monthly in ("M", pd.tseries.offsets.MonthEnd()):
        with pytest.raises(IncompatibleFrequency, match="freq"):
            PeriodArray(daily, freq=monthly)
def test_asi8():
    """asi8 exposes the integer ordinals, with iNaT at the missing position."""
    periods = period_array(["2000", "2001", None], freq="D")
    ordinals = periods.asi8
    tm.assert_numpy_array_equal(ordinals, np.array([10957, 11323, iNaT]))
def test_take_raises():
    """take raises for a fill_value of another freq or of a non-period type."""
    arr = period_array(["2000", "2001"], freq="D")
    indices = [0, -1]

    weekly_fill = pd.Period("2000", freq="W")
    with pytest.raises(IncompatibleFrequency, match="freq"):
        arr.take(indices, allow_fill=True, fill_value=weekly_fill)

    with pytest.raises(ValueError, match="foo"):
        arr.take(indices, allow_fill=True, fill_value="foo")
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
def test_astype(dtype):
    """Integer astype yields uint64 for unsigned requests and int64 otherwise."""
    # We choose to ignore the sign and size of integers for
    # Period/Datetime/Timedelta astype
    arr = period_array(["2000", "2001", None], freq="D")
    result = arr.astype(dtype)

    is_unsigned = np.dtype(dtype).kind == "u"
    expected_dtype = np.dtype("uint64") if is_unsigned else np.dtype("int64")
    expected = arr.astype(expected_dtype)

    assert result.dtype == expected_dtype
    tm.assert_numpy_array_equal(result, expected)
def test_astype_copies():
    """astype(int64) returns a view with copy=False and separate memory with copy=True."""
    arr = period_array(["2000", "2001", None], freq="D")
    result = arr.astype(np.int64, copy=False)
    # Add the `.base`, since we now use `.asi8` which returns a view.
    # We could maybe override it in PeriodArray to return ._data directly.
    assert result.base is arr._data

    result = arr.astype(np.int64, copy=True)
    # An identity check would also pass for a view (the copy=False result is
    # itself a different object), so check the memory is not shared.
    assert not np.shares_memory(result, arr._data)
    tm.assert_numpy_array_equal(result, arr._data.view("i8"))
def test_astype_categorical():
    """astype('category') yields a Categorical over the distinct periods."""
    arr = period_array(["2000", "2001", "2001", None], freq="D")
    as_category = arr.astype("category")

    distinct = pd.PeriodIndex(["2000", "2001"], freq="D")
    reference = pd.Categorical.from_codes([0, 1, 1, -1], categories=distinct)
    tm.assert_categorical_equal(as_category, reference)
def test_astype_period():
    """Casting to PeriodDtype('M') matches building the array with freq='M'."""
    values = ["2000", "2001", None]
    daily = period_array(values, freq="D")
    monthly = daily.astype(PeriodDtype("M"))
    tm.assert_period_array_equal(monthly, period_array(values, freq="M"))
@pytest.mark.parametrize("other", ["datetime64[ns]", "timedelta64[ns]"])
def test_astype_datetime(other):
    """Casting a PeriodArray to datetime64/timedelta64 raises TypeError."""
    arr = period_array(["2000", "2001", None], freq="D")
    # slice off the [ns] so that the regex matches.
    pattern = other[:-4]
    with pytest.raises(TypeError, match=pattern):
        arr.astype(other)
def test_fillna_raises():
    """fillna with an array of a different length raises ValueError."""
    arr = period_array(["2000", "2001", "2002"], freq="D")
    too_short = arr[:2]
    with pytest.raises(ValueError, match="Length"):
        arr.fillna(too_short)
def test_fillna_copies():
    """fillna returns a new object even when the array has nothing to fill."""
    arr = period_array(["2000", "2001", "2002"], freq="D")
    fill = pd.Period("2000", "D")
    filled = arr.fillna(fill)
    assert filled is not arr
# ----------------------------------------------------------------------------
# setitem
@pytest.mark.parametrize(
    "key, value, expected",
    [
        ([0], pd.Period("2000", "D"), [10957, 1, 2]),
        ([0], None, [iNaT, 1, 2]),
        ([0], np.nan, [iNaT, 1, 2]),
        ([0, 1, 2], pd.Period("2000", "D"), [10957] * 3),
        (
            [0, 1, 2],
            [pd.Period("2000", "D"), pd.Period("2001", "D"), pd.Period("2002", "D")],
            [10957, 11323, 11688],
        ),
    ],
)
def test_setitem(key, value, expected):
    """Item assignment stores the matching ordinals, or iNaT for None/NaN."""
    target = PeriodArray(np.arange(3), freq="D")
    reference = PeriodArray(expected, freq="D")

    target[key] = value
    tm.assert_period_array_equal(target, reference)
def test_setitem_raises_incompatible_freq():
    """Assigning periods of another freq raises, for a scalar and for an array."""
    arr = PeriodArray(np.arange(3), freq="D")

    # scalar with annual freq
    annual_scalar = pd.Period("2000", freq="A")
    with pytest.raises(IncompatibleFrequency, match="freq"):
        arr[0] = annual_scalar

    # array with annual freq
    annual_array = period_array(["2000", "2001"], freq="A")
    with pytest.raises(IncompatibleFrequency, match="freq"):
        arr[[0, 1]] = annual_array
def test_setitem_raises_length():
    """Assigning one value to two positions through a list raises ValueError."""
    arr = PeriodArray(np.arange(3), freq="D")
    single_value = [pd.Period("2000", freq="D")]
    with pytest.raises(ValueError, match="length"):
        arr[[0, 1]] = single_value
def test_setitem_raises_type():
    """Assigning a plain integer into a PeriodArray raises TypeError."""
    target = PeriodArray(np.arange(3), freq="D")

    with pytest.raises(TypeError, match="int"):
        target[0] = 1
# ----------------------------------------------------------------------------
# Ops
def test_sub_period():
    """Subtracting a monthly Period from a daily array raises IncompatibleFrequency."""
    daily = period_array(["2000", "2001"], freq="D")
    monthly = pd.Period("2000", freq="M")

    with pytest.raises(IncompatibleFrequency, match="freq"):
        # Only the raised exception matters; the result is deliberately unused.
        daily - monthly
# ----------------------------------------------------------------------------
# Methods
@pytest.mark.parametrize(
    "other",
    [pd.Period("2000", freq="H"), period_array(["2000", "2001", "2000"], freq="H")],
)
def test_where_different_freq_raises(other):
    """Series.where with an hourly replacement on daily data raises."""
    mask = np.array([True, False, True])
    daily = pd.Series(period_array(["2000", "2001", "2002"], freq="D"))

    with pytest.raises(IncompatibleFrequency, match="freq"):
        daily.where(mask, other)
# ----------------------------------------------------------------------------
# Printing
def test_repr_small():
    """str() of a two-element PeriodArray lists both values on a single line."""
    rendered = str(period_array(["2000", "2001"], freq="D"))

    assert rendered == (
        "<PeriodArray>\n['2000-01-01', '2001-01-01']\nLength: 2, dtype: period[D]"
    )
def test_repr_large():
    """str() of a 1000-element PeriodArray elides the middle with ' ...'."""
    rendered = str(period_array(["2000", "2001"] * 500, freq="D"))

    # One entry per output line; joined with "\n" and no trailing newline.
    expected_lines = [
        "<PeriodArray>",
        "['2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01',",
        " '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01',",
        " ...",
        " '2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01',",
        " '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01']",
        "Length: 1000, dtype: period[D]",
    ]

    assert rendered == "\n".join(expected_lines)
# ----------------------------------------------------------------------------
# Reductions
class TestReductions:
    """min/max reductions on PeriodArray."""

    def test_min_max(self):
        """min/max skip NaT by default and return NaT when skipna=False."""
        labels = [
            "2000-01-03",
            "2000-01-03",
            "NaT",
            "2000-01-02",
            "2000-01-05",
            "2000-01-04",
        ]
        periods = period_array(labels, freq="D")

        assert periods.min() == pd.Period("2000-01-02", freq="D")
        assert periods.max() == pd.Period("2000-01-05", freq="D")

        # With skipna=False the NaT entry decides the outcome of both reductions.
        for reduction in (periods.min, periods.max):
            assert reduction(skipna=False) is pd.NaT

    @pytest.mark.parametrize("skipna", [True, False])
    def test_min_max_empty(self, skipna):
        """Both reductions return NaT on an empty array, whatever skipna is."""
        empty = period_array([], freq="D")

        assert empty.min(skipna=skipna) is pd.NaT
        assert empty.max(skipna=skipna) is pd.NaT

View File

@@ -0,0 +1,154 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays import TimedeltaArray
import pandas.util.testing as tm
class TestTimedeltaArrayConstructor:
    """Input validation and copy semantics of the TimedeltaArray constructor."""

    def test_only_1dim_accepted(self):
        """2-dim and 0-dim inputs are rejected with ValueError."""
        # GH#25282
        flat = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")
        expected_msg = "Only 1-dimensional"

        with pytest.raises(ValueError, match=expected_msg):
            # 2-dim
            TimedeltaArray(flat.reshape(2, 2))

        with pytest.raises(ValueError, match=expected_msg):
            # 0-dim
            TimedeltaArray(flat[[0]].squeeze())

    def test_freq_validation(self):
        """A freq the values do not conform to is rejected with ValueError."""
        # ensure that the public constructor cannot create an invalid instance
        nanos = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10 ** 9
        irregular = nanos.view("timedelta64[ns]")

        expected_msg = (
            "Inferred frequency None from passed values does not "
            "conform to passed frequency D"
        )
        with pytest.raises(ValueError, match=expected_msg):
            TimedeltaArray(irregular, freq="D")

    def test_non_array_raises(self):
        """A plain Python list is rejected with ValueError."""
        with pytest.raises(ValueError, match="list"):
            TimedeltaArray([1, 2, 3])

    def test_other_type_raises(self):
        """A boolean ndarray is rejected with ValueError."""
        flags = np.array([1, 2, 3], dtype="bool")
        with pytest.raises(ValueError, match="dtype bool cannot be converted"):
            TimedeltaArray(flags)

    def test_incorrect_dtype_raises(self):
        """Non-timedelta ``dtype`` arguments are rejected with ValueError."""
        # NOTE(review): the earlier TODO here asked why 'category' raises
        # TypeError while i8 raises ValueError, yet both cases below expect
        # ValueError -- that TODO looks stale; confirm before dropping it.
        category_msg = r"category cannot be converted to timedelta64\[ns\]"
        with pytest.raises(ValueError, match=category_msg):
            TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category")

        int64_msg = r"dtype int64 cannot be converted to timedelta64\[ns\]"
        with pytest.raises(ValueError, match=int64_msg):
            TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64"))

    def test_copy(self):
        """copy=False reuses the input ndarray; copy=True does not."""
        data = np.array([1, 2, 3], dtype="m8[ns]")

        shared = TimedeltaArray(data, copy=False)
        assert shared._data is data

        copied = TimedeltaArray(data, copy=True)
        assert copied._data is not data
        # The copy must not be a view onto the input either.
        assert copied._data.base is not data
class TestTimedeltaArray:
    """TimedeltaArray under NumPy reductions, unary operators, casts and setitem."""

    def test_np_sum(self):
        """np.sum over the array or a TimedeltaIndex equals the ndarray's sum."""
        # GH#25282
        raw = np.arange(5, dtype=np.int64).view("m8[h]").astype("m8[ns]")
        tda = TimedeltaArray(raw)

        assert np.sum(tda) == raw.sum()
        assert np.sum(pd.TimedeltaIndex(tda)) == raw.sum()

    def test_from_sequence_dtype(self):
        """_from_sequence rejects dtype=object with ValueError."""
        expected_msg = "dtype .*object.* cannot be converted to timedelta64"
        with pytest.raises(ValueError, match=expected_msg):
            TimedeltaArray._from_sequence([], dtype=object)

    def test_abs(self):
        """abs() makes the negative entry positive and keeps NaT in place."""
        hour = 3600 * 10 ** 9  # nanoseconds in one hour
        tda = TimedeltaArray(np.array([-hour, "NaT", 2 * hour], dtype="m8[ns]"))
        expected = TimedeltaArray(np.array([hour, "NaT", 2 * hour], dtype="m8[ns]"))

        tm.assert_timedelta_array_equal(abs(tda), expected)

    def test_neg(self):
        """Unary minus flips the sign of every entry and keeps NaT in place."""
        hour = 3600 * 10 ** 9  # nanoseconds in one hour
        tda = TimedeltaArray(np.array([-hour, "NaT", 2 * hour], dtype="m8[ns]"))
        expected = TimedeltaArray(np.array([hour, "NaT", -2 * hour], dtype="m8[ns]"))

        tm.assert_timedelta_array_equal(-tda, expected)

    def test_neg_freq(self):
        """Negating an array that carries a freq negates the freq as well."""
        tdi = pd.timedelta_range("2 Days", periods=4, freq="H")
        tda = TimedeltaArray(tdi, freq=tdi.freq)
        negated = TimedeltaArray(-tdi._data, freq=-tdi.freq)

        tm.assert_timedelta_array_equal(-tda, negated)

    @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
    def test_astype_int(self, dtype):
        """Integer casts give uint64 for unsigned targets and int64 otherwise."""
        tda = TimedeltaArray._from_sequence([pd.Timedelta("1H"), pd.Timedelta("2H")])

        is_unsigned = np.dtype(dtype).kind == "u"
        expected_dtype = np.dtype("uint64" if is_unsigned else "int64")

        result = tda.astype(dtype)
        expected = tda.astype(expected_dtype)

        assert result.dtype == expected_dtype
        tm.assert_numpy_array_equal(result, expected)

    def test_setitem_clears_freq(self):
        """Setting an element resets the array's freq to None."""
        tda = TimedeltaArray(pd.timedelta_range("1H", periods=2, freq="H"))

        tda[0] = pd.Timedelta("1H")

        assert tda.freq is None
class TestReductions:
    """min/max reductions on TimedeltaArray."""

    def test_min_max(self):
        """min/max skip NaT by default and return NaT when skipna=False."""
        tda = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"])

        assert tda.min() == pd.Timedelta("2H")
        assert tda.max() == pd.Timedelta("5H")

        # With skipna=False the NaT entry decides the outcome of both reductions.
        for reduction in (tda.min, tda.max):
            assert reduction(skipna=False) is pd.NaT

    @pytest.mark.parametrize("skipna", [True, False])
    def test_min_max_empty(self, skipna):
        """Both reductions return NaT on an empty array, whatever skipna is."""
        empty = TimedeltaArray._from_sequence([])

        assert empty.min(skipna=skipna) is pd.NaT
        assert empty.max(skipna=skipna) is pd.NaT